In [2]:
import pandas as pd
import numpy as np
import time
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix


In [3]:
df_full = pd.read_csv('flights.csv')

  df_full = pd.read_csv('flights.csv')


In [4]:
df_full['ARRIVAL_DELAY'].fillna(0, inplace=True)
df_full['IS_DISRUPTED'] = ((df_full['CANCELLED'] == 1) | (df_full['ARRIVAL_DELAY'] > 15)).astype('int8')

df_full['DEPARTURE_HOUR'] = (df_full['SCHEDULED_DEPARTURE'] // 100).astype('int8')

df_full['AIRPORT_FLIGHTS_PER_DAY'] = df_full.groupby(['ORIGIN_AIRPORT', 'MONTH', 'DAY'])['FLIGHT_NUMBER'].transform('count')



The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_full['ARRIVAL_DELAY'].fillna(0, inplace=True)


In [5]:
sample_size = 150000
df_sample = df_full.sample(n=sample_size, random_state=42)
print(f"Using a sample of {sample_size:,} rows for tuning.")


Using a sample of 150,000 rows for tuning.


In [6]:
train_df, test_df_for_safety = train_test_split(df_sample, test_size=0.2, random_state=42) # we only need train_df


In [7]:
origin_airport_disruption_rate = train_df.groupby('ORIGIN_AIRPORT')['IS_DISRUPTED'].mean().to_dict()
global_disruption_rate = train_df['IS_DISRUPTED'].mean()

train_df['ORIGIN_AIRPORT_DISRUPTION_RATE'] = train_df['ORIGIN_AIRPORT'].map(origin_airport_disruption_rate)
train_df['ORIGIN_AIRPORT_DISRUPTION_RATE'].fillna(global_disruption_rate, inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_df['ORIGIN_AIRPORT_DISRUPTION_RATE'].fillna(global_disruption_rate, inplace=True)


In [9]:
features = [
    'MONTH', 'DAY_OF_WEEK', 'DISTANCE', 'SCHEDULED_TIME', 'DEPARTURE_HOUR',
    'AIRPORT_FLIGHTS_PER_DAY', 'ORIGIN_AIRPORT_DISRUPTION_RATE'
]
target = 'IS_DISRUPTED'

X_train = train_df[features]
y_train = train_df[target]
X_train.dropna(inplace=True)
y_train[X_train.index]



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train.dropna(inplace=True)


3308391    0
1708562    0
1098229    0
522689     0
3499749    0
          ..
1893077    0
2216506    0
2421037    0
2542290    0
2583367    1
Name: IS_DISRUPTED, Length: 120000, dtype: int8

In [10]:

param_grid = {
    'n_estimators': [100, 200, 300, 400],
    'max_depth': [10, 20, 25, 30,35,40],
    'min_samples_leaf': [2, 4, 6, 8],
}

grid_search = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42, class_weight='balanced'),
    param_grid=param_grid,
    cv=3,
    n_jobs=-1,
    scoring='f1_weighted', 
    verbose=2
)

start_time = time.time()
grid_search.fit(X_train, y_train)
end_time = time.time()

print(f"\nGridSearchCV finished in {(end_time - start_time)/60:.2f} minutes.")


print("\n## Best Hyperparameters Found ##")
print(grid_search.best_params_)

print("\nBest F1-Weighted Score on Cross-Validation:")
print(f"{grid_search.best_score_:.4f}")

Fitting 3 folds for each of 96 candidates, totalling 288 fits

GridSearchCV finished in 30.46 minutes.

## Best Hyperparameters Found ##
{'max_depth': 35, 'min_samples_leaf': 4, 'n_estimators': 300}

Best F1-Weighted Score on Cross-Validation:
0.7532


In [11]:
train_df, test_df = train_test_split(df_full, test_size=0.2, random_state=42)

origin_airport_disruption_rate = train_df.groupby('ORIGIN_AIRPORT')['IS_DISRUPTED'].mean().to_dict()
global_disruption_rate = train_df['IS_DISRUPTED'].mean()

train_df['ORIGIN_AIRPORT_DISRUPTION_RATE'] = train_df['ORIGIN_AIRPORT'].map(origin_airport_disruption_rate).fillna(global_disruption_rate)
test_df['ORIGIN_AIRPORT_DISRUPTION_RATE'] = test_df['ORIGIN_AIRPORT'].map(origin_airport_disruption_rate).fillna(global_disruption_rate)


In [12]:
features = [
    'MONTH', 'DAY_OF_WEEK', 'DISTANCE', 'SCHEDULED_TIME', 'DEPARTURE_HOUR',
    'AIRPORT_FLIGHTS_PER_DAY', 'ORIGIN_AIRPORT_DISRUPTION_RATE'
]
target = 'IS_DISRUPTED'

X_train = train_df[features]; 
y_train = train_df[target]
X_test = test_df[features]; 
y_test = test_df[target]

X_train = X_train.dropna()
X_test = X_test.dropna()
y_train = y_train[X_train.index]; y_test = y_test[X_test.index]



In [13]:


final_model = RandomForestClassifier(
    n_estimators=300,
    max_depth=35,
    min_samples_leaf=4,
    random_state=42,
    class_weight='balanced',
    n_jobs=-1
)

start_time = time.time()
final_model.fit(X_train, y_train)
end_time = time.time()
print(f"Final model training finished in {(end_time - start_time)/60:.2f} minutes.")



Final model training finished in 5.64 minutes.


In [14]:
y_pred = final_model.predict(X_test)
print(classification_report(y_test, y_pred, target_names=['On-Time', 'Disrupted']))
print(confusion_matrix(y_test, y_pred))

              precision    recall  f1-score   support

     On-Time       0.87      0.83      0.85    941251
   Disrupted       0.39      0.45      0.42    222565

    accuracy                           0.76   1163816
   macro avg       0.63      0.64      0.63   1163816
weighted avg       0.77      0.76      0.77   1163816

[[783779 157472]
 [121963 100602]]
