# Librairies

In [26]:
import pandas as pd

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.dummy import DummyClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.impute import SimpleImputer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn import set_config
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.under_sampling import RandomUnderSampler

# Data 

In [27]:
data = pd.read_parquet('../data/data_cleaned.parquet')
data

Unnamed: 0,ARR_DELAY_NEW,YEAR,QUARTER,MONTH,DAY_OF_MONTH,DAY_OF_WEEK,UNIQUE_CARRIER,ORIGIN_AIRPORT_ID,ORIGIN_CITY_NAME,ORIGIN_STATE_ABR,DEST_AIRPORT_ID,DEST_CITY_NAME,DEST_STATE_ABR,CRS_DEP_TIME,CRS_ARR_TIME,DISTANCE
0,0.0,2016,1,1,6,3,AA,11298.0,"Dallas/Fort Worth, TX",TX,11433.0,"Detroit, MI",MI,1100.0,1438.0,986.0
1,0.0,2016,1,1,7,4,AA,11298.0,"Dallas/Fort Worth, TX",TX,11433.0,"Detroit, MI",MI,1100.0,1438.0,986.0
2,7.0,2016,1,1,8,5,AA,11298.0,"Dallas/Fort Worth, TX",TX,11433.0,"Detroit, MI",MI,1100.0,1438.0,986.0
3,0.0,2016,1,1,9,6,AA,11298.0,"Dallas/Fort Worth, TX",TX,11433.0,"Detroit, MI",MI,1100.0,1438.0,986.0
4,113.0,2016,1,1,10,7,AA,11298.0,"Dallas/Fort Worth, TX",TX,11433.0,"Detroit, MI",MI,1100.0,1438.0,986.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5556190,0.0,2016,4,12,31,6,WN,15376.0,"Tucson, AZ",AZ,12892.0,"Los Angeles, CA",CA,755.0,830.0,451.0
5556191,9.0,2016,4,12,31,6,WN,15376.0,"Tucson, AZ",AZ,12892.0,"Los Angeles, CA",CA,1320.0,1355.0,451.0
5556192,0.0,2016,4,12,31,6,WN,15376.0,"Tucson, AZ",AZ,13232.0,"Chicago, IL",IL,705.0,1125.0,1440.0
5556193,0.0,2016,4,12,31,6,WN,15376.0,"Tucson, AZ",AZ,14679.0,"San Diego, CA",CA,1220.0,1235.0,368.0


# Pipeline

In [28]:
data['ARR_DELAY_NEW'] = data['ARR_DELAY_NEW'].apply(lambda x: 1 if x != 0 else 0)

X = data[['DAY_OF_MONTH', 'DAY_OF_WEEK', 'CRS_DEP_TIME', 'DEST_AIRPORT_ID', 'CRS_ARR_TIME', 'ORIGIN_AIRPORT_ID']]
y = data['ARR_DELAY_NEW']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=55)

numeric_features = X.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X.select_dtypes(include=['object']).columns

numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ]
)

# Dummy

In [29]:
dummy_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', DummyClassifier(strategy='most_frequent'))
])

In [30]:
dummy_pipeline.fit(X_train, y_train)
set_config(display='diagram')
dummy_pipeline

In [31]:
y_pred = dummy_pipeline.predict(X_test)

print("Classification Report:\n", classification_report(y_test, y_pred))

print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

cv_scores = cross_val_score(dummy_pipeline, X, y, cv=5)
print("Cross-validation scores:", cv_scores)
print("Mean cross-validation score:", cv_scores.mean())

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Classification Report:
               precision    recall  f1-score   support

           0       0.66      1.00      0.80    733551
           1       0.00      0.00      0.00    377688

    accuracy                           0.66   1111239
   macro avg       0.33      0.50      0.40   1111239
weighted avg       0.44      0.66      0.52   1111239

Confusion Matrix:
 [[733551      0]
 [377688      0]]
Accuracy: 0.6601199201971854
Cross-validation scores: [0.65974736 0.65974736 0.65974736 0.65974646 0.65974646]
Mean cross-validation score: 0.6597470031199408


# Logistic regression

In [32]:
L_pipeline = ImbPipeline(steps=[
    ('preprocessor', preprocessor),
    ('undersampler', RandomUnderSampler(random_state=42)),
    ('classifier', LogisticRegression(max_iter=1000, random_state=40))
])

In [33]:
L_pipeline.fit(X_train, y_train)
set_config(display='diagram')
L_pipeline

In [34]:
y_pred = L_pipeline.predict(X_test)
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Classification Report:
               precision    recall  f1-score   support

           0       0.72      0.56      0.63    733551
           1       0.40      0.57      0.47    377688

    accuracy                           0.56   1111239
   macro avg       0.56      0.56      0.55   1111239
weighted avg       0.61      0.56      0.57   1111239

Confusion Matrix:
 [[407701 325850]
 [160978 216710]]
Accuracy: 0.561905224708636


# Random forest

In [38]:
pipeline = ImbPipeline(steps=[
    ('preprocessor', preprocessor),
    ('undersampler', RandomUnderSampler(random_state=42)),
    ('classifier', RandomForestClassifier(n_estimators=150, max_depth=8, random_state=40))
])

In [39]:
pipeline.fit(X_train, y_train)
set_config(display='diagram')
pipeline

In [40]:
y_pred = pipeline.predict(X_test)
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Classification Report:
               precision    recall  f1-score   support

           0       0.73      0.54      0.62    733551
           1       0.41      0.62      0.49    377688

    accuracy                           0.57   1111239
   macro avg       0.57      0.58      0.56   1111239
weighted avg       0.62      0.57      0.58   1111239

Confusion Matrix:
 [[397320 336231]
 [143275 234413]]
Accuracy: 0.5684942663099477


# Raison Choix

In [None]:
Il est ici légitime de se poser la question du choix du modèle.
Nous avons décidé de préviligier Random Forest pour deux raisons.

 - Le recall en classe 1 est supérieur. Cette métrique permet de minimiser le risque de rater un vrai retards.
 - Le taux de Faux Positif est inférieur. Cela est important pour le confort des passagers et leurs confiance
dans le système.

# Joblib

In [47]:
import joblib

joblib.dump(pipeline, '../app/model.joblib')

['../app/model.joblib']