In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix



In [None]:
df = pd.read_csv('flights.csv')
print("Data loaded successfully.")


  df = pd.read_csv('flights.csv')


Data loaded successfully.


In [3]:

df['ARRIVAL_DELAY'].fillna(0, inplace=True)

df['IS_DISRUPTED'] = ((df['CANCELLED'] == 1) | (df['ARRIVAL_DELAY'] > 15)).astype('int8')


df['DEPARTURE_HOUR'] = (df['SCHEDULED_DEPARTURE'] // 100).astype('int8')

features = ['MONTH', 'DAY_OF_WEEK', 'AIRLINE', 'DISTANCE', 'SCHEDULED_TIME', 'DEPARTURE_HOUR']
target = 'IS_DISRUPTED'

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['ARRIVAL_DELAY'].fillna(0, inplace=True)


In [None]:

df_model = df[features + [target]].copy()
df_model = pd.get_dummies(df_model, columns=['AIRLINE'], drop_first=True, dtype='int8')
df_model.dropna(inplace=True)

X = df_model.drop(columns=target)
y = df_model[target]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)



In [5]:

print("Training the Random Forest model...")
rf_classifier = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
    class_weight='balanced',
    n_jobs=-1
)
rf_classifier.fit(X_train, y_train)
print("Model training complete.")


Training the Random Forest model...
Model training complete.


In [6]:

print("Evaluating the model...")
y_pred = rf_classifier.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f"\nModel Accuracy: {accuracy:.2%}")

print("\n## Classification Report ##")
target_names = ['On-Time', 'Disrupted']
print(classification_report(y_test, y_pred, target_names=target_names))

print("\n## Confusion Matrix ##")

print(confusion_matrix(y_test, y_pred))
print("\nScript finished.")

Evaluating the model...

Model Accuracy: 69.38%

## Classification Report ##
              precision    recall  f1-score   support

     On-Time       0.83      0.77      0.80    941140
   Disrupted       0.27      0.35      0.31    222675

    accuracy                           0.69   1163815
   macro avg       0.55      0.56      0.55   1163815
weighted avg       0.73      0.69      0.71   1163815


## Confusion Matrix ##
[[728943 212197]
 [144193  78482]]

Script finished.
