Imports

In [1]:
# Importaciones básicas
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Importaciones para procesamiento de datos
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Importaciones para modelado
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
import lightgbm as lgbm

In [2]:
# Cargar datos
def load_data():
    df_2021 = pd.read_parquet("../data/raw/Combined_Flights_2021.parquet")
    df_2022 = pd.read_parquet("../data/raw/Combined_Flights_2022.parquet")
    return pd.concat([df_2021, df_2022], ignore_index=True)

# Características relevantes seleccionadas basadas en análisis de importancia
columnas_relevantes = [
    'DayOfWeek',
    'Month',
    'Quarter',
    'DayofMonth',
    'DepTime',
    'CRSDepTime',
    'Distance',
    'Airline',
    'OriginStateName',
    'DestStateName',
    'DepDelayMinutes'  # Variable objetivo
]

# Cargar y preparar datos
df = load_data()
df = df[columnas_relevantes]

In [3]:
def engineer_features(df):
    # Convertir hora militar a franja horaria
    df['Hour'] = df['DepTime'].apply(lambda x: int(str(int(x)).zfill(4)[:2]) if pd.notnull(x) else np.nan)
    
    # Crear variables para temporada
    df['IsSummer'] = df['Month'].isin([6, 7, 8]).astype(int)
    df['IsHolidaySeason'] = df['Month'].isin([11, 12, 1]).astype(int)
    
    # Crear variable para fin de semana
    df['IsWeekend'] = df['DayOfWeek'].isin([6, 7]).astype(int)
    
    # Crear variable para hora pico
    df['IsPeakHour'] = df['Hour'].isin([7, 8, 9, 16, 17, 18]).astype(int)
    
    # Diferencia entre tiempo programado y real
    df['TimeDiff'] = df['DepTime'] - df['CRSDepTime']
    
    return df

df = engineer_features(df)

In [4]:
def preprocess_data(df):
    """
    Realiza el preprocesamiento completo del dataset incluyendo:
    - Manejo de valores nulos
    - Codificación de variables categóricas
    - Eliminación de valores atípicos
    - Normalización de variables numéricas
    """
    # Crear copia para no modificar datos originales
    df_processed = df.copy()
    
    # Identificar tipos de columnas
    numeric_columns = df_processed.select_dtypes(include=['float64', 'int64']).columns
    categorical_columns = df_processed.select_dtypes(include=['object']).columns
    
    # Manejar valores nulos
    for col in numeric_columns:
        df_processed[col].fillna(df_processed[col].median(), inplace=True)
    for col in categorical_columns:
        df_processed[col].fillna(df_processed[col].mode()[0], inplace=True)
    
    # Convertir variables categóricas a numéricas usando Label Encoding
    from sklearn.preprocessing import LabelEncoder
    label_encoders = {}
    
    for col in categorical_columns:
        label_encoders[col] = LabelEncoder()
        df_processed[col] = label_encoders[col].fit_transform(df_processed[col])
        # Ahora la columna es numérica, así que la añadimos a numeric_columns
        numeric_columns = numeric_columns.append(pd.Index([col]))
    
    # Eliminar valores atípicos usando el método IQR
    for col in numeric_columns:
        if col != 'DepDelayMinutes':  # No aplicar a nuestra variable objetivo
            Q1 = df_processed[col].quantile(0.25)
            Q3 = df_processed[col].quantile(0.75)
            IQR = Q3 - Q1
            lower_bound = Q1 - 1.5 * IQR
            upper_bound = Q3 + 1.5 * IQR
            df_processed[col] = df_processed[col].clip(lower_bound, upper_bound)
    
    # Normalizar variables numéricas (excepto la variable objetivo)
    scaler = StandardScaler()
    columns_to_scale = [col for col in numeric_columns if col != 'DepDelayMinutes']
    df_processed[columns_to_scale] = scaler.fit_transform(df_processed[columns_to_scale])
    
    return df_processed, scaler, label_encoders

In [5]:
def create_train_test_sets(df, target_column='DepDelayMinutes', test_size=0.2, random_state=42):
    """
    Crea los conjuntos de entrenamiento y prueba
    """
    X = df.drop(columns=[target_column])
    y = df[target_column]
    
    return train_test_split(X, y, test_size=test_size, random_state=random_state)

def create_and_train_model(X_train, X_test, y_train, y_test):
    """
    Crea y entrena un modelo XGBoost con configuración simplificada
    """
    # Crear modelo con parámetros básicos pero efectivos
    model = XGBRegressor(
        n_estimators=100,    # Número de árboles
        learning_rate=0.1,   # Tasa de aprendizaje
        max_depth=7,         # Profundidad máxima de los árboles
        random_state=42      # Para reproducibilidad
    )
    
    # Entrenar el modelo de manera simple
    print("Entrenando modelo...")
    model.fit(
        X_train, 
        y_train,
        eval_set=[(X_test, y_test)],
        verbose=True
    )
    
    return model

In [6]:
def evaluate_model(y_true, y_pred):
    """
    Evalúa el modelo usando múltiples métricas
    """
    mae = mean_absolute_error(y_true, y_pred)
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    r2 = r2_score(y_true, y_pred)
    
    print(f"Métricas de evaluación:")
    print(f"MAE: {mae:.2f} minutos")
    print(f"RMSE: {rmse:.2f} minutos")
    print(f"R2 Score: {r2:.4f}")
    
    return mae, rmse, r2

In [7]:
def main():
    # Cargar y preparar datos
    print("Cargando datos...")
    df = load_data()
    df = df[columnas_relevantes]
    
    # Ingeniería de características
    print("Realizando ingeniería de características...")
    df = engineer_features(df)
    
    # Preprocesamiento
    print("Preprocesando datos...")
    df_processed, scaler, label_encoders = preprocess_data(df)
    
    # Dividir datos
    print("Dividiendo datos en conjuntos de entrenamiento y prueba...")
    X_train, X_test, y_train, y_test = create_train_test_sets(df_processed)
    
    # Crear y entrenar modelo
    model = create_and_train_model(X_train, X_test, y_train, y_test)
    
    # Evaluar modelo
    print("\nEvaluando modelo...")
    y_pred = model.predict(X_test)
    mae, rmse, r2 = evaluate_model(y_test, y_pred)
    
    # Guardar modelo, scaler y encoders
    print("\nGuardando modelo, scaler y encoders...")
    import joblib
    joblib.dump(model, 'flight_delay_model.joblib')
    joblib.dump(scaler, 'scaler.joblib')
    joblib.dump(label_encoders, 'label_encoders.joblib')
    
    return model, scaler, label_encoders, (mae, rmse, r2)

if __name__ == "__main__":
    model, scaler, label_encoders, metrics = main()

Cargando datos...
Realizando ingeniería de características...
Preprocesando datos...


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_processed[col].fillna(df_processed[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_processed[col].fillna(df_processed[col].mode()[0], inplace=True)


Dividiendo datos en conjuntos de entrenamiento y prueba...
Entrenando modelo...
[0]	validation_0-rmse:45.14716
[1]	validation_0-rmse:41.68352
[2]	validation_0-rmse:38.61930
[3]	validation_0-rmse:35.84564
[4]	validation_0-rmse:33.42481
[5]	validation_0-rmse:31.32903
[6]	validation_0-rmse:29.44595
[7]	validation_0-rmse:27.70006
[8]	validation_0-rmse:26.11173
[9]	validation_0-rmse:24.86114
[10]	validation_0-rmse:23.76210
[11]	validation_0-rmse:22.78010
[12]	validation_0-rmse:21.91668
[13]	validation_0-rmse:21.10108
[14]	validation_0-rmse:20.43498
[15]	validation_0-rmse:19.85699
[16]	validation_0-rmse:19.30225
[17]	validation_0-rmse:18.82917
[18]	validation_0-rmse:18.39866
[19]	validation_0-rmse:18.06969
[20]	validation_0-rmse:17.69065
[21]	validation_0-rmse:17.42378
[22]	validation_0-rmse:17.13934
[23]	validation_0-rmse:16.92487
[24]	validation_0-rmse:16.68883
[25]	validation_0-rmse:16.52932
[26]	validation_0-rmse:16.33646
[27]	validation_0-rmse:16.21725
[28]	validation_0-rmse:16.08163
[2

In [8]:
main()

Cargando datos...
Realizando ingeniería de características...
Preprocesando datos...


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_processed[col].fillna(df_processed[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_processed[col].fillna(df_processed[col].mode()[0], inplace=True)


Dividiendo datos en conjuntos de entrenamiento y prueba...
Entrenando modelo...
[0]	validation_0-rmse:45.14716
[1]	validation_0-rmse:41.68352
[2]	validation_0-rmse:38.61930
[3]	validation_0-rmse:35.84564
[4]	validation_0-rmse:33.42481
[5]	validation_0-rmse:31.32903
[6]	validation_0-rmse:29.44595
[7]	validation_0-rmse:27.70006
[8]	validation_0-rmse:26.11173
[9]	validation_0-rmse:24.86114
[10]	validation_0-rmse:23.76210
[11]	validation_0-rmse:22.78010
[12]	validation_0-rmse:21.91668
[13]	validation_0-rmse:21.10108
[14]	validation_0-rmse:20.43498
[15]	validation_0-rmse:19.85699
[16]	validation_0-rmse:19.30225
[17]	validation_0-rmse:18.82917
[18]	validation_0-rmse:18.39866
[19]	validation_0-rmse:18.06969
[20]	validation_0-rmse:17.69065
[21]	validation_0-rmse:17.42378
[22]	validation_0-rmse:17.13934
[23]	validation_0-rmse:16.92487
[24]	validation_0-rmse:16.68883
[25]	validation_0-rmse:16.52932
[26]	validation_0-rmse:16.33646
[27]	validation_0-rmse:16.21725
[28]	validation_0-rmse:16.08163
[2

(XGBRegressor(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=0.1, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=7, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=100, n_jobs=None,
              num_parallel_tree=None, random_state=42, ...),
 StandardScaler(),
 {'Airline': LabelEncoder(),
  'OriginStateName': LabelEncoder(),
  'DestStateName': LabelEncoder()},
 (0.8830630836837436, 12.147079639045309, 0.9386346413034659))