In [None]:
import pandas as pd
import matplotlib.pyplot as plt

from modules.model_optimizer import create_ensemble, HyperparameterOptimizer
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix
from modules.smoothed_target_encoder import SmoothedTargetEncoder
from modules.cyclical_encoder import CyclicalEncoder
from modules.flight_preprocessor import FlightPreprocessor
from sklearn.model_selection import train_test_split

In [None]:
# Load the raw flight records from disk.
df_flights = pd.read_csv("./data/flights.csv", low_memory=False)

# Normalise the header names to lower case; every later cell refers to
# lower-case column names.
df_flights.columns = df_flights.columns.map(str.lower)

# Binary target: 1 when the arrival delay is strictly greater than 15, else 0.
# NOTE(review): a missing arrival_delay compares as False and is therefore
# labelled 0 ("not delayed") — confirm this is intended for rows without an
# arrival delay, or that the preprocessing step removes them.
late_arrival = df_flights['arrival_delay'].gt(15)
df_flights['is_delayed'] = late_arrival.astype(int)

In [None]:
# Airport lookup table: lower-case the headers and keep only the join key
# (iata_code) and the state column that is attached to each flight later.
df_airports = (
    pd.read_csv("./data/airports.csv", low_memory=False)
    .rename(columns=str.lower)
    .loc[:, ['iata_code', 'state']]
)

In [None]:
# Run the project's preprocessing step on the flights frame. The frame passed
# in already carries the `is_delayed` target created when the CSV was loaded.
# NOTE(review): FlightPreprocessor is defined in modules.flight_preprocessor;
# which rows/columns it drops or derives is not visible from this notebook.
# `df_flights` is rebound to the result, so re-running this cell feeds
# already-preprocessed data back into preprocess().
processor = FlightPreprocessor()
df_flights = processor.preprocess(df_flights)

In [None]:
def attach_airport_state(
    flights: pd.DataFrame,
    airports: pd.DataFrame,
    airport_col: str,
    state_col: str,
) -> pd.DataFrame:
    """Left-join the airport's state onto ``flights``.

    Parameters
    ----------
    flights : frame holding the airport code column ``airport_col``.
    airports : lookup frame with ``iata_code`` and ``state`` columns.
    airport_col : column in ``flights`` containing the airport code to look up.
    state_col : name of the column that receives the looked-up state.

    Returns
    -------
    A new frame with the columns of ``flights`` followed by ``state_col``.
    Flights whose code is absent from ``airports`` get a missing value.

    Raises
    ------
    pandas.errors.MergeError
        If ``airports`` contains the same ``iata_code`` more than once; a
        plain left join would silently duplicate the matching flights.
    """
    # Rename the lookup columns up front so the join needs no post-merge
    # rename/drop of helper columns.
    lookup = airports[['iata_code', 'state']].rename(
        columns={'iata_code': airport_col, 'state': state_col}
    )

    # Make the cell safe to re-run: replace a previously attached column
    # instead of producing a duplicate column name.
    if state_col in flights.columns:
        flights = flights.drop(columns=[state_col])

    return flights.merge(lookup, how='left', on=airport_col, validate='many_to_one')


df_flights = attach_airport_state(df_flights, df_airports, 'origin_airport', 'state_origin')
df_flights = attach_airport_state(df_flights, df_airports, 'destination_airport', 'state_dest')

In [None]:
# Columns carried forward into feature engineering. All but the last are
# intended to be known before the flight; `is_delayed` is the target label
# (derived from arrival_delay) and is kept only so the encoders and the model
# can use it as the label.
known_variables = [
    # carrier and route
    'airline', 'origin_airport', 'destination_airport',
    'state_origin', 'state_dest',
    'distance',
    # schedule
    'scheduled_departure', 'scheduled_arrival',
    'day_of_week', 'month', 'year',
    # target label
    'is_delayed',
]

# Independent copy, so the feature-engineering cells do not modify df_flights.
df_features = df_flights.loc[:, known_variables].copy()

In [None]:
# Preview the first rows of the selected feature frame.
df_features.head()

In [None]:
# One SmoothedTargetEncoder per column, each constructed with 'is_delayed' as
# its second argument.
# NOTE(review): SmoothedTargetEncoder is defined in
# modules.smoothed_target_encoder; presumably it maps each distinct value to a
# smoothed mean of is_delayed — confirm.
(
    enc_distance,
    enc_origin,
    enc_destination,
    enc_airline,
    enc_state_origin,
    enc_state_dest,
) = (
    SmoothedTargetEncoder(source_col, 'is_delayed')
    for source_col in (
        'distance',
        'origin_airport',
        'destination_airport',
        'airline',
        'state_origin',
        'state_dest',
    )
)

# NOTE(review): every encoder is fitted on ALL rows of df_features, and the
# train/test split only happens in a later cell. If the encoding uses the
# labels, test-set labels leak into the features — consider fitting on the
# training rows only.
# Note the naming asymmetry: distance / state_origin / state_dest are
# overwritten in place, the other three get a new `*_encoded` column. Because
# of the in-place overwrite, re-running this cell encodes already-encoded
# values.
for output_col, target_encoder in (
    ('distance', enc_distance),
    ('origin_airport_encoded', enc_origin),
    ('destination_airport_encoded', enc_destination),
    ('airline_encoded', enc_airline),
    ('state_origin', enc_state_origin),
    ('state_dest', enc_state_dest),
):
    df_features[output_col] = target_encoder.fit_transform(df_features)

In [None]:
# Parse both schedule columns as datetimes.
# (The next cell repeats this conversion before using the `.dt` accessor.)
for timestamp_col in ('scheduled_departure', 'scheduled_arrival'):
    df_features[timestamp_col] = pd.to_datetime(df_features[timestamp_col])

In [None]:
# Cycle lengths handed to the cyclical encoders.
# NOTE(review): dayofyear reaches 366 in leap years while the period is 365 —
# confirm the data contains no leap-year dates or that this is acceptable.
DAYS_PER_CYCLE, HOURS_PER_CYCLE = 365, 24

# For departure and arrival alike: make sure the column is datetime, then
# derive the fractional hour of day (hour + minute / 60) and the day of year.
for timestamp_col, hour_col, day_col in (
    ('scheduled_departure', 'dep_hour', 'dep_dayofyear'),
    ('scheduled_arrival', 'arr_hour', 'arr_dayofyear'),
):
    timestamps = pd.to_datetime(df_features[timestamp_col])
    df_features[timestamp_col] = timestamps
    df_features[hour_col] = timestamps.dt.hour + timestamps.dt.minute / 60
    df_features[day_col] = timestamps.dt.dayofyear

dep_enc_day = CyclicalEncoder('dep_dayofyear', period=DAYS_PER_CYCLE)
dep_enc_hour = CyclicalEncoder('dep_hour', period=HOURS_PER_CYCLE)
arr_enc_day = CyclicalEncoder('arr_dayofyear', period=DAYS_PER_CYCLE)
arr_enc_hour = CyclicalEncoder('arr_hour', period=HOURS_PER_CYCLE)

# Each encoder returns the whole frame, which replaces df_features.
# NOTE(review): CyclicalEncoder is defined in modules.cyclical_encoder;
# presumably it appends `<column>_sin` / `<column>_cos`, the names selected in
# a later cell — confirm.
for cyclical_encoder in (dep_enc_day, dep_enc_hour, arr_enc_day, arr_enc_hour):
    df_features = cyclical_encoder.fit_transform(df_features)

In [None]:
# List the columns present after encoding, to check the engineered feature names.
df_features.columns

In [None]:
# Columns that make up the modelling table, in model input order, with the
# label last.
final_features = [
    # overwritten in place by the target encoders
    'state_origin', 'state_dest', 'distance',
    # target-encoded copies of the categorical identifiers
    'origin_airport_encoded', 'destination_airport_encoded', 'airline_encoded',
    # cyclical encodings of the scheduled departure
    'dep_dayofyear_sin', 'dep_dayofyear_cos',
    'dep_hour_sin', 'dep_hour_cos',
    # cyclical encodings of the scheduled arrival
    'arr_dayofyear_sin', 'arr_dayofyear_cos',
    'arr_hour_sin', 'arr_hour_cos',
    # label
    'is_delayed',
]

# Independent copy restricted to the model inputs and the label.
df_processed = df_features.loc[:, final_features].copy()

In [None]:
# Preview the first rows of the modelling table.
df_processed.head()

In [None]:
# Class balance of the target, shown as proportions rather than counts.
df_processed['is_delayed'].value_counts(normalize=True)

In [None]:
# Separate the label from the model inputs.
y = df_processed['is_delayed']
X = df_processed.drop(columns='is_delayed')

In [None]:
# Hold out 20% of the rows as the test set. Stratifying on y keeps the class
# proportions the same in both parts; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Simple ensemble (XGBoost + LightGBM + CatBoost, per the original note).
# NOTE(review): create_ensemble receives the training data here and the
# returned model is fitted again in the next cell — confirm what
# create_ensemble uses X_train / y_train for.
ensemble_model = create_ensemble(X_train, y_train, random_state=42)

In [None]:
# Train the ensemble on the training split.
ensemble_model.fit(X_train, y_train)

In [None]:
# Score the held-out test split with the fitted ensemble. The second column of
# predict_proba is the score passed to ROC-AUC; predict supplies the hard
# labels used for accuracy and the confusion matrix.
y_pred_proba = ensemble_model.predict_proba(X_test)[:, 1]
y_pred = ensemble_model.predict(X_test)

# Metrics
accuracy = accuracy_score(y_test, y_pred)
auc_score = roc_auc_score(y_test, y_pred_proba)
cm = confusion_matrix(y_test, y_pred)

# Emit the report in one call, one item per line.
report_lines = (
    "\n### Model Evaluation Results (Test Set) ###",
    f"Accuracy: {accuracy:.4f}",
    f"ROC-AUC Score: {auc_score:.4f}",
    "\nConfusion Matrix:",
    cm,
)
print(*report_lines, sep="\n")

In [None]:
# Optimize the XGBoost hyperparameters (original note: takes ~10-15 min).
# The search is given the training split, 30 trials and 3 CV folds.
optimizer = HyperparameterOptimizer(random_state=42)
best_params = optimizer.optimize_xgboost(X_train, y_train, n_trials=30, cv_folds=3)


# Get the optimized model: get_optimized_models returns a mapping from which
# the 'xgboost' entry is taken.
models = optimizer.get_optimized_models(X_train, y_train)
xgb_optimized = models['xgboost']


# Train with validation: carve a stratified 20% validation part out of the
# training split (the test split is not touched here).
X_tr, X_val, y_tr, y_val = train_test_split(
   X_train, y_train, test_size=0.2, random_state=42, stratify=y_train
)


# eval_set order matters for later inspection: the first entry is the training
# part (X_tr, y_tr), the second is the validation part (X_val, y_val).
# NOTE(review): the model is fitted on X_tr only, i.e. 80% of X_train.
xgb_optimized.fit(
   X_tr, y_tr,
   eval_set=[(X_tr, y_tr), (X_val, y_val)],
   verbose=50
)

In [None]:
# 1. Retrieve the per-iteration evaluation history recorded while fitting.
results = xgb_optimized.evals_result()

# 2. X axis: one point per boosting iteration.
epochs = len(results['validation_0']['auc'])
x_axis = range(0, epochs)

# 3. Plot the learning curves. The history keys follow the order of the
#    eval_set passed to fit(): 'validation_0' is the training part
#    (X_tr, y_tr) and 'validation_1' is the validation part (X_val, y_val).
#    The first curve is therefore labelled as training ("Treino"); it was
#    previously mislabelled as a validation/test curve.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(x_axis, results['validation_0']['auc'], label='Treino')
ax.plot(x_axis, results['validation_1']['auc'], label='Validação')
ax.legend()
ax.set_ylabel('AUC')
ax.set_xlabel('Número de Iterações (n_estimators)')
ax.set_title('Performance do XGBoost: Curva de Aprendizado')
ax.grid(True)
plt.show()

In [None]:
# Optimize the LightGBM hyperparameters (original note: takes ~10-15 min).
# A fresh optimizer is created, so `optimizer`, `best_params` and `models`
# from the XGBoost cell are rebound here.
optimizer = HyperparameterOptimizer(random_state=42)
best_params = optimizer.optimize_lightgbm(X_train, y_train, n_trials=30, cv_folds=3)

# Get the optimized model: the 'lightgbm' entry of the returned mapping.
models = optimizer.get_optimized_models(X_train, y_train)
lightgbm_optimized = models['lightgbm']

# Train with validation. This cell does not create its own split: it reuses
# X_tr / X_val / y_tr / y_val produced by the XGBoost tuning cell, so that
# cell must have been run first.


# NOTE(review): recent LightGBM releases no longer accept `verbose` in fit()
# (evaluation logging moved to callbacks) — confirm that the object returned
# under models['lightgbm'] accepts this argument with the installed version.
lightgbm_optimized.fit(
    X_tr, y_tr,
    eval_set=[(X_tr, y_tr), (X_val, y_val)],
    verbose=50
)