# 📘 ENTRENAMIENTO DEL MODELO (Regresión Logística)

## Objetivo: Entrenar un modelo de Regresión Logística que prediga si un vuelo será Retrasado o Puntual, y guardar un pipeline completo listo para despliegue.

### 📌 Importación de librerías

In [None]:
import pandas as pd
import numpy as np
import warnings

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

import joblib

# Silence every Python warning for the rest of the session.
# NOTE(review): this is a blanket filter, so library warnings (for example
# from scikit-learn) are hidden as well; consider narrowing it by category.
warnings.filterwarnings("ignore")
# Lift the column cap so displayed DataFrames show all of their columns.
pd.set_option("display.max_columns", None)

### 📌 Carga del dataset

In [None]:
# Read the flights file produced by the upstream ETL step into a DataFrame.
df = pd.read_csv(filepath_or_buffer="vuelos_etl_limpio.csv")

print("Dataset cargado correctamente")

# Trailing expression: the notebook renders the first five rows as a preview.
df.head(5)

Dataset loaded successfully


### 📌 Limpieza de datos y variable objetivo

In [None]:
# Parse FL_DATE as datetime (unparseable values become NaT), then discard
# every row that has a missing value in any column.
# NOTE(review): dropna() is applied to all columns, not only the ones used
# for modelling — confirm that this is intended.
df = df.assign(
    FL_DATE=pd.to_datetime(df["FL_DATE"], errors="coerce")
).dropna()

# Binary target: 1 when the arrival delay is greater than 15, otherwise 0.
# (ARR_DELAY is presumably expressed in minutes — verify against the ETL.)
df["DELAYED"] = df["ARR_DELAY"].gt(15).astype(int)

### 📌 Conversión de horarios a minutos

In [None]:
def time_to_total_minutes(time_str):
    """Convert a clock-time value to whole minutes since midnight.

    Accepts "HH:MM:SS" and "HH:MM". Seconds, when present, must be a valid
    integer but do not contribute to the result.

    Args:
        time_str: Value to convert; it is passed through ``str()`` first, so
            non-string inputs (``None``, NaN, numbers) are tolerated.

    Returns:
        ``hours * 60 + minutes`` as an int, or 0 when the value cannot be
        parsed. Note that the 0 fallback is indistinguishable from a genuine
        midnight time ("00:00:00").
    """
    parts = str(time_str).split(":")

    # Only "HH:MM" and "HH:MM:SS" shapes are meaningful clock times.
    if len(parts) not in (2, 3):
        return 0

    try:
        # Convert every field so a malformed seconds part is still rejected.
        numbers = [int(part) for part in parts]
    except ValueError:
        # Best-effort conversion: malformed input maps to 0 instead of
        # aborting the whole column transformation.
        return 0

    hours, minutes = numbers[0], numbers[1]
    return hours * 60 + minutes


# Clock-time columns that should be expressed as minutes since midnight.
time_cols = [
    "CRS_DEP_TIME", "CRS_ARR_TIME",
    "DEP_TIME", "ARR_TIME",
    "WHEELS_OFF", "WHEELS_ON",
]

# Convert each listed column in place; columns absent from df are skipped.
for col in time_cols:
    if col not in df.columns:
        continue
    df[col] = df[col].apply(time_to_total_minutes)

### 📌 Selección de variables predictoras

In [None]:
# Predictor columns: three categorical identifiers followed by three
# numeric values (distance and the two scheduled times).
features = [
    "AIRLINE_CODE", "ORIGIN", "DEST",
    "DISTANCE", "CRS_DEP_TIME", "CRS_ARR_TIME",
]

# X holds the predictors, y the binary DELAYED target.
X, y = df[features], df["DELAYED"]

### 📌 División Train / Test

In [None]:
# Hold out 20% of the rows for testing. The split is stratified on y so both
# parts keep the same class proportions, and seeded for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

### 📌 Definir features

In [None]:
# Columns handled by the numeric branch of the preprocessing.
numeric_features = ["DISTANCE", "CRS_DEP_TIME", "CRS_ARR_TIME"]

# Columns handled by the categorical branch of the preprocessing.
categorical_features = ["AIRLINE_CODE", "ORIGIN", "DEST"]

### 📌 Pipeline de entrenamiento

In [None]:
# Numeric branch: standardize each numeric column.
numeric_transformer = Pipeline(steps=[
    ("scaler", StandardScaler())
])

# Categorical branch: one-hot encode, keeping one column per category.
# drop="first" was removed on purpose: together with handle_unknown="ignore"
# a category unseen during fit is encoded as all zeros, which is exactly the
# encoding of the dropped reference category, so at prediction time a new
# airline/airport would silently be treated as the first known one. With
# every category kept, an all-zero row is unambiguous. The scikit-learn docs
# also note that dropping a category can bias penalized linear models.
categorical_transformer = Pipeline(steps=[
    ("onehot", OneHotEncoder(
        handle_unknown="ignore"
    ))
])

In [None]:
# Route each column group to its own transformer: "num" receives the numeric
# columns and "cat" receives the categorical ones.
preprocessor = ColumnTransformer([
    ("num", numeric_transformer, numeric_features),
    ("cat", categorical_transformer, categorical_features),
])


In [None]:
# End-to-end estimator: preprocessing followed by a logistic regression.
# class_weight="balanced" reweights the classes to offset their imbalance.
pipeline = Pipeline([
    ("preprocess", preprocessor),
    (
        "model",
        LogisticRegression(
            solver="lbfgs",
            class_weight="balanced",
            max_iter=1000,
        ),
    ),
])

### 📌 Entrenamiento

In [None]:
# Fit the preprocessing steps and the classifier on the training split only.
pipeline.fit(X_train, y_train)


### 📌 Evaluación del modelo

In [None]:
# Predicted class labels for the held-out test split.
y_pred = pipeline.predict(X_test)

# Each report is printed as a heading line followed by the metric itself.
print("\nConfusion Matrix:", confusion_matrix(y_test, y_pred), sep="\n")
print(
    "\nClassification Report:",
    classification_report(y_test, y_pred),
    sep="\n",
)


Confusion Matrix:
[[284113 196841]
 [ 40879  62179]]

Classification Report:
              precision    recall  f1-score   support

           0       0.87      0.59      0.71    480954
           1       0.24      0.60      0.34    103058

    accuracy                           0.59    584012
   macro avg       0.56      0.60      0.52    584012
weighted avg       0.76      0.59      0.64    584012



### 📌 Evaluación por umbral (threshold)

In [None]:
# Second column of predict_proba; with a 0/1 target this is presumably the
# probability of the positive class (DELAYED = 1).
proba = pipeline.predict_proba(X_test)[:, 1]

# Re-score the test split at several decision thresholds.
for t in (0.2, 0.3, 0.4, 0.5):
    # Label 1 whenever the probability reaches the threshold t.
    y_pred_t = (proba >= t).astype(int)

    print(f"\n===== Threshold = {t} =====")
    print("Confusion Matrix:", confusion_matrix(y_test, y_pred_t), sep="\n")
    print(
        "\nClassification Report:",
        classification_report(y_test, y_pred_t),
        sep="\n",
    )


===== Threshold = 0.2 =====
Confusion Matrix:
[[   222 480732]
 [    31 103027]]

Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.00      0.00    480954
           1       0.18      1.00      0.30    103058

    accuracy                           0.18    584012
   macro avg       0.53      0.50      0.15    584012
weighted avg       0.75      0.18      0.05    584012


===== Threshold = 0.3 =====
Confusion Matrix:
[[ 26025 454929]
 [  2332 100726]]

Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.05      0.10    480954
           1       0.18      0.98      0.31    103058

    accuracy                           0.22    584012
   macro avg       0.55      0.52      0.20    584012
weighted avg       0.79      0.22      0.14    584012


===== Threshold = 0.4 =====
Confusion Matrix:
[[139806 341148]
 [ 15599  87459]]

Classification Report:
              precision    r

### 📌 Guardado del modelo para producción

In [None]:
# Persist the fitted pipeline (preprocessing + model) as a single artifact.
# NOTE(review): joblib files are pickle-based — only load them from trusted
# sources.
joblib.dump(value=pipeline, filename="DEPLOYABLE.pkl")

print("âœ… Deployable model saved successfully")

âœ… Deployable model saved successfully
