### 0. Imports necesarios

In [19]:
import bootcampviztools as bt
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import  train_test_split, GridSearchCV
from sklearn.metrics import balanced_accuracy_score, accuracy_score, confusion_matrix, ConfusionMatrixDisplay, roc_curve, auc, classification_report

import lightgbm as lgb
import os
import pickle


### 1. Carga de datos de Train

In [3]:
# Load the training CSV (read_csv takes the file's first column as the initial
# index), then promote the 'id' column to be the DataFrame index.
df_train = pd.read_csv('../data/train.csv', index_col=0).set_index('id')

### 2. Pipelines

#### 2.1 Pipeline para limpiar nulos y duplicados

In [4]:
def drop_high_null_columns(df, threshold=50):
    """Return a copy of ``df`` without the columns that are mostly missing.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    threshold : float, default 50
        Percentage (0-100). A column is removed only when its share of null
        values is strictly greater than this value.

    Returns
    -------
    pandas.DataFrame
        ``df`` minus the columns whose null percentage exceeds ``threshold``.
    """
    missing_pct = df.isna().mean() * 100
    too_sparse = [name for name, pct in missing_pct.items() if pct > threshold]
    return df.drop(columns=too_sparse)

def drop_remaining_nulls(df):
    """Return a copy of ``df`` keeping only the rows that have no null in any column."""
    complete_rows = df.dropna(axis=0, how="any")
    return complete_rows

def drop_duplicates(df):
    """Return ``df`` without fully duplicated rows, keeping the first occurrence of each."""
    unique_rows = df.drop_duplicates(subset=None, keep="first")
    return unique_rows

# Cleaning pipeline. The steps run in this order:
#   1. drop columns whose null share is above the helper's default threshold,
#   2. drop every row that still contains a null,
#   3. drop exact duplicate rows.
data_pipeline = Pipeline(steps=[
    ('drop_high_nulls', FunctionTransformer(func=drop_high_null_columns)),
    ('drop_remaining_nulls', FunctionTransformer(func=drop_remaining_nulls)),
    ('drop_duplicates', FunctionTransformer(func=drop_duplicates)),
])

# Run the three cleaning functions over the raw training frame and inspect the result.
df_train_cleaned = data_pipeline.fit_transform(df_train)
df_train_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
Index: 103594 entries, 70172 to 62567
Data columns (total 23 columns):
 #   Column                             Non-Null Count   Dtype  
---  ------                             --------------   -----  
 0   Gender                             103594 non-null  object 
 1   Customer Type                      103594 non-null  object 
 2   Age                                103594 non-null  int64  
 3   Type of Travel                     103594 non-null  object 
 4   Class                              103594 non-null  object 
 5   Flight Distance                    103594 non-null  int64  
 6   Inflight wifi service              103594 non-null  int64  
 7   Departure/Arrival time convenient  103594 non-null  int64  
 8   Ease of Online booking             103594 non-null  int64  
 9   Gate location                      103594 non-null  int64  
 10  Food and drink                     103594 non-null  int64  
 11  Online boarding                    103594

In [5]:
# Identify the column groups used by the preprocessing steps.

# Name of the target column. It is a plain string here, so it must be compared
# with ==/!= : `col not in target` would be a substring test against the string.
target = 'satisfaction'

# Continuous numeric features.
numerical_cols = ['Age', 'Flight Distance', 'Departure Delay in Minutes', 'Arrival Delay in Minutes']

# Object-typed columns with exactly two distinct values, excluding the target.
binary_cols = [
    col
    for col in df_train_cleaned.select_dtypes(include=['object']).columns
    if col != target and df_train_cleaned[col].nunique() == 2
]

# Every remaining column (neither numeric, binary, nor the target) is one-hot encoded.
onehot_cols = [
    col
    for col in df_train_cleaned.columns
    if col not in numerical_cols and col not in binary_cols and col != target
]

# Numeric features that receive the log transform: all of them except 'Age'.
numerical_cols_log = [col for col in numerical_cols if col != 'Age']


#### 2.2 Pipeline para transformar las columnas numéricas con logaritmo

In [6]:
from sklearn.base import BaseEstimator, TransformerMixin

# Transformador personalizado para aplicar logaritmo a variables numéricas positivas
class LogTransformer(BaseEstimator, TransformerMixin):
    """Apply ``log(1 + x)`` to a fixed set of columns and leave the rest untouched.

    Parameters
    ----------
    cols : list of str
        Names of the columns to transform.
    """

    def __init__(self, cols):
        self.cols = cols

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns ``self`` unchanged."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` in which ``self.cols`` are replaced by ``np.log1p`` of their values.

        ``log1p`` is used so that zeros map to 0 instead of ``-inf``.
        """
        logged = np.log1p(X[self.cols])
        result = X.copy()
        result[self.cols] = logged
        return result


# Single-step pipeline that log-transforms the columns in numerical_cols_log.
log_pipeline = Pipeline(steps=[
    ('log_transform', LogTransformer(cols=numerical_cols_log)),
])

# NOTE: this rebinds df_train_cleaned to the log-transformed frame, so running
# this cell a second time applies log1p again on top of the first pass.
df_train_cleaned = log_pipeline.fit_transform(df_train_cleaned)

In [7]:
# Rebuild the numeric column list: the log-transformed columns first, then 'Age',
# which was excluded from the log transform.
numerical_cols = [*numerical_cols_log, 'Age']

#### 2.3 Pipeline para transformar las columnas con one hot encoding y estandarizar

In [9]:
def label_encode_binary(X):
    """Replace every column of ``X`` with its integer category codes, keeping column names.

    Each column is cast to a pandas categorical and substituted by the
    corresponding codes. The input frame is not modified.

    Parameters
    ----------
    X : pandas.DataFrame
        Columns to encode.

    Returns
    -------
    pandas.DataFrame
        Same index and column names as ``X``, holding integer codes.
    """
    encoded = X.copy()
    for name, column in X.items():
        encoded[name] = column.astype("category").cat.codes
    # Re-wrap with the original column labels so the names are preserved as-is.
    return pd.DataFrame(encoded, columns=X.columns)

def binarize_satisfaction(X):
    """Encode the satisfaction labels as integers.

    ``"neutral or dissatisfied"`` becomes 0 and ``"satisfied"`` becomes 1.
    Any other value is returned unchanged. The mapping is applied value by
    value instead of through ``replace``, which avoids the pandas
    FutureWarning about silent downcasting that ``replace`` emits when an
    object column turns numeric.

    Parameters
    ----------
    X : pandas.DataFrame or pandas.Series
        Data holding the satisfaction labels; it is not modified.

    Returns
    -------
    pandas.DataFrame or pandas.Series
        Same shape, index and column names as ``X`` with the labels encoded.
    """
    label_to_int = {"neutral or dissatisfied": 0, "satisfied": 1}

    def _encode(values):
        # Fall back to the original value so unknown labels pass through untouched.
        return values.map(lambda label: label_to_int.get(label, label))

    if isinstance(X, pd.Series):
        return _encode(X)
    return X.apply(_encode)

# Column-wise preprocessing; each entry is (step name, transformer, columns).
preprocessor = ColumnTransformer(
    transformers=[
        # Target: map the two satisfaction labels to 0/1
        ("target", FunctionTransformer(binarize_satisfaction, feature_names_out="one-to-one"), ["satisfaction"]),
        # Two-valued text columns: integer category codes, names unchanged
        ("binary", FunctionTransformer(label_encode_binary, feature_names_out="one-to-one"), binary_cols),
        # Remaining non-numeric-group columns: one indicator column per category
        ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=False), onehot_cols),
        # Numeric columns: standardisation
        ("scaler", StandardScaler(), numerical_cols),
    ],
    remainder='passthrough',  # columns not listed above are passed through unchanged
)

# Wrap the preprocessor in a one-step pipeline.
pipeline = Pipeline(steps=[("preprocessor", preprocessor)])

# Fit on the cleaned training frame, then transform that same frame.
pipeline.fit(df_train_cleaned)
array_transformed = pipeline.transform(df_train_cleaned)

# Output feature names are read from the preprocessor after fitting.
feature_names = preprocessor.get_feature_names_out()

# Build a DataFrame from the transformed array using those names.
df_transformed = pd.DataFrame(array_transformed, columns=feature_names)

  return X.replace({"neutral or dissatisfied": 0, "satisfied": 1})
  return X.replace({"neutral or dissatisfied": 0, "satisfied": 1})


In [10]:
# Rebuild the output column names group by group, in the same order the
# ColumnTransformer lists its steps: target, binary, one-hot, numeric.
target = ['satisfaction']
fitted_transformers = pipeline.named_steps['preprocessor'].named_transformers_

binary_names = fitted_transformers['binary'].get_feature_names_out(binary_cols).tolist()
onehot_names = fitted_transformers['onehot'].get_feature_names_out(onehot_cols).tolist()
numerical_names = fitted_transformers['scaler'].get_feature_names_out(numerical_cols).tolist()

# Recreate the DataFrame with the concatenated names.
new_col_names = [*target, *binary_names, *onehot_names, *numerical_names]
df_transformed = pd.DataFrame(array_transformed, columns=new_col_names)

df_transformed.head()

Unnamed: 0,satisfaction,Gender,Customer Type,Type of Travel,Class_Business,Class_Eco,Class_Eco Plus,Inflight wifi service_0,Inflight wifi service_1,Inflight wifi service_2,...,Cleanliness_0,Cleanliness_1,Cleanliness_2,Cleanliness_3,Cleanliness_4,Cleanliness_5,Flight Distance,Departure Delay in Minutes,Arrival Delay in Minutes,Age
0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,-0.624644,1.251013,1.02968,-1.745542
1,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,-1.355868,-0.333781,0.417532,-0.951526
2,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.366986,-0.762053,-0.77541,-0.885358
3,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,-0.406356,0.773286,0.636191,-0.951526
4,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,-1.457644,-0.762053,-0.77541,1.430521


### 3. Separar en X e y

In [15]:
# Hold out 20% of the transformed rows for validation; the seed is fixed so the split is repeatable.
train_set, val_set = train_test_split(
    df_transformed,
    test_size=0.2,
    random_state=42,
)

In [16]:
# Split each partition into features (X) and target (y).
# The target column is referenced by its literal name on both partitions, so
# this cell does not depend on the global `target`, which is bound to a string
# in one earlier cell and to a list in another.
# Note: X_test / y_test hold the validation partition produced by train_test_split.
X_train = train_set.drop(columns="satisfaction")
y_train = train_set["satisfaction"]
X_test = val_set.drop(columns="satisfaction")
y_test = val_set["satisfaction"]

In [17]:
# Seed shared by every estimator so the baseline comparison is repeatable across runs
# (same value as the one used for train_test_split).
RANDOM_STATE = 42

# Baseline models with default hyperparameters (only the seed is set).
models = {
    "RandomForest": RandomForestClassifier(random_state=RANDOM_STATE),
    "GradientBoosting": GradientBoostingClassifier(random_state=RANDOM_STATE),
    "LogisticRegression": LogisticRegression(random_state=RANDOM_STATE),
    "LightGBM": lgb.LGBMClassifier(random_state=RANDOM_STATE)
}


# Balanced accuracy of each baseline model on the hold-out partition.
scores = {}

for model_name, model in models.items():
    print(f"Entrenando {model_name}...")

    # One-step pipeline with no preprocessing: X_train and X_test are already transformed.
    pipeline_model = Pipeline([
        ("model", model)
    ])

    # Fit on the training partition
    pipeline_model.fit(X_train, y_train)

    # Predict on the hold-out partition (X_test), not on the training data
    y_pred = pipeline_model.predict(X_test)

    # Score with balanced accuracy
    balanced_acc = balanced_accuracy_score(y_test, y_pred)
    scores[model_name] = balanced_acc

    print(f"{model_name} - Balanced Accuracy: {balanced_acc:.4f}")

# Print the final ranking, best score first
print("\nResultados finales de los modelos:")
sorted_scores = sorted(scores.items(), key=lambda x: x[1], reverse=True)

# `name` is used here so the loop does not rebind `model` (an estimator above) to a string.
for name, score in sorted_scores:
    print(f"{name}: {score:.4f}")

Entrenando RandomForest...
RandomForest - Balanced Accuracy: 0.9569
Entrenando GradientBoosting...
GradientBoosting - Balanced Accuracy: 0.9421
Entrenando LogisticRegression...
LogisticRegression - Balanced Accuracy: 0.9315
Entrenando LightGBM...
[LightGBM] [Info] Number of positive: 35833, number of negative: 47042
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.020764 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1005
[LightGBM] [Info] Number of data points in the train set: 82875, number of used features: 86
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.432374 -> initscore=-0.272172
[LightGBM] [Info] Start training from score -0.272172
LightGBM - Balanced Accuracy: 0.9628

Resultados finales de los modelos:
LightGBM: 0.9628
RandomForest: 0.9569
GradientBoosting: 0.9421
LogisticRegression: 0.9315


In [24]:
# Project root: two directory levels above the current working directory
# (the original note says the notebook is run from `result_notebooks`).
ROOT_DIR = os.path.abspath(os.path.join(os.getcwd(), "..", ".."))

# Directory for saved models, `src/models` under the project root; created if missing.
MODEL_DIR = os.path.join(ROOT_DIR, "src", "models")
os.makedirs(MODEL_DIR, exist_ok=True)


# Pick the baseline model with the highest balanced accuracy.
best_model_name = max(scores, key=scores.get)
print(f"\n Modelo seleccionado para optimización: {best_model_name}")

# One search grid per candidate in `models`. Keys carry the "model__" prefix
# because the estimator is the "model" step of the pipeline built below.
# Every baseline needs an entry here; otherwise the lookup further down
# raises KeyError when that baseline is the winner.
param_grid = {
    "RandomForest": {"model__n_estimators": [50, 100, 200], "model__max_depth": [None, 10, 20]},
    "GradientBoosting": {"model__n_estimators": [50, 100], "model__learning_rate": [0.01, 0.1]},
    "LogisticRegression": {"model__C": [0.1, 1.0, 10.0]},
    "LightGBM": {"model__num_leaves": [31, 50], "model__learning_rate": [0.01, 0.1]}
}

# Hyperparameter search on the selected model only.
print(f"\n Buscando mejores hiperparámetros para {best_model_name}...")

# NOTE(review): this rebinds the global `pipeline`, which earlier held the
# preprocessing pipeline; cells that read pipeline.named_steps['preprocessor']
# will fail if they are re-run after this one.
pipeline = Pipeline([
    ("model", models[best_model_name])  # only the best model
])

# 5-fold cross-validated grid search, scored with balanced accuracy, using all cores.
grid_search = GridSearchCV(pipeline, param_grid[best_model_name], cv=5, scoring="balanced_accuracy", n_jobs=-1)
grid_search.fit(X_train, y_train)


 Modelo seleccionado para optimización: LightGBM

 Buscando mejores hiperparámetros para LightGBM...
[LightGBM] [Info] Number of positive: 35833, number of negative: 47042
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.010936 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1005
[LightGBM] [Info] Number of data points in the train set: 82875, number of used features: 86
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.432374 -> initscore=-0.272172
[LightGBM] [Info] Start training from score -0.272172


In [25]:
# Persist the refitted best estimator found by the grid search.
# The pickled object is the one-step pipeline that wraps the model; the
# preprocessing steps defined earlier in the notebook are not part of it.
best_model_path = os.path.join(MODEL_DIR, f"{best_model_name}_optimized_pipeline.pkl")

with open(best_model_path, "wb") as model_file:
    pickle.dump(grid_search.best_estimator_, model_file)

print(f"Mejor modelo guardado en {best_model_path}")

Mejor modelo guardado en c:\Users\maria\Documents\GitHub\Pipelines_Airline_Passenger_Satisfaction\src\models\LightGBM_optimized_pipeline.pkl
