# Librairies

In [1]:
pip install pandas pyarrow

Note: you may need to restart the kernel to use updated packages.


In [6]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.dummy import DummyClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report
from sklearn import set_config
from sklearn import tree
from sklearn.metrics import f1_score, recall_score, roc_auc_score, roc_curve
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
import joblib

# Data 

In [7]:
data = pd.read_parquet('data_cleaned.parquet')

In [14]:
data_subset = data.head(50000)

In [15]:
data_subset.to_parquet('subset_50000.parquet', index=False)

# Pipeline

In [16]:
data['ARR_DELAY_NEW'] = data['ARR_DELAY_NEW'].apply(lambda x: 1 if x != 0 else 0)

X = data.drop(columns='ARR_DELAY_NEW')
y = data['ARR_DELAY_NEW']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=55)

numeric_features = X.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X.select_dtypes(include=['object']).columns

numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ]
)

# Dummy

In [17]:
dummy_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', DummyClassifier(strategy='most_frequent'))
])

In [18]:
dummy_pipeline.fit(X_train, y_train)

In [None]:
set_config(display='diagram')

dummy_pipeline

In [9]:
y_pred = dummy_pipeline.predict(X_test)

print("Classification Report:\n", classification_report(y_test, y_pred))

print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

cv_scores = cross_val_score(dummy_pipeline, X, y, cv=5) 
print("Cross-validation scores:", cv_scores)
print("Mean cross-validation score:", cv_scores.mean())

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Classification Report:
               precision    recall  f1-score   support

           0       0.66      1.00      0.80    733551
           1       0.00      0.00      0.00    377688

    accuracy                           0.66   1111239
   macro avg       0.33      0.50      0.40   1111239
weighted avg       0.44      0.66      0.52   1111239

Confusion Matrix:
 [[733551      0]
 [377688      0]]
Accuracy: 0.6601199201971854
Cross-validation scores: [0.65974736 0.65974736 0.65974736 0.65974646 0.65974646]
Mean cross-validation score: 0.6597470031199408


# Random forest

In [10]:
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(n_estimators=150, max_depth=8, random_state=40))
])

In [11]:
pipeline.fit(X_train, y_train)
set_config(display='diagram')
pipeline

KeyboardInterrupt: 

In [12]:
y_pred = pipeline.predict(X_test)

print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

KeyboardInterrupt: 

# Joblib

In [13]:
joblib.dump(pipeline, 'RF_model.joblib')

['RF_model.joblib']