In [27]:
#Import base packages for analysis

import pandas as pd
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report

In [28]:
#Import 2015 flight data from CSV file

dtype_options = {'AIRLINE': 'object', 'TAIL_NUMBER': 'object', 'ORIGIN_AIRPORT': 'object', 'DESTINATION_AIRPORT': 'object'}
flights = pd.read_csv('Data/flights.csv', dtype = dtype_options)

In [29]:
#Check shape of flight file

flights.shape

(5819079, 31)

In [30]:
#Check head of file to ensure dates begin on January 01, 2015

flights.head()

Unnamed: 0,YEAR,MONTH,DAY,DAY_OF_WEEK,AIRLINE,FLIGHT_NUMBER,TAIL_NUMBER,ORIGIN_AIRPORT,DESTINATION_AIRPORT,SCHEDULED_DEPARTURE,...,ARRIVAL_TIME,ARRIVAL_DELAY,DIVERTED,CANCELLED,CANCELLATION_REASON,AIR_SYSTEM_DELAY,SECURITY_DELAY,AIRLINE_DELAY,LATE_AIRCRAFT_DELAY,WEATHER_DELAY
0,2015,1,1,4,AS,98,N407AS,ANC,SEA,5,...,408.0,-22.0,0,0,,,,,,
1,2015,1,1,4,AA,2336,N3KUAA,LAX,PBI,10,...,741.0,-9.0,0,0,,,,,,
2,2015,1,1,4,US,840,N171US,SFO,CLT,20,...,811.0,5.0,0,0,,,,,,
3,2015,1,1,4,AA,258,N3HYAA,LAX,MIA,20,...,756.0,-9.0,0,0,,,,,,
4,2015,1,1,4,AS,135,N527AS,SEA,ANC,25,...,259.0,-21.0,0,0,,,,,,


In [31]:
#Check tail of file to ensure dates end on December 31, 2015

flights.tail()

Unnamed: 0,YEAR,MONTH,DAY,DAY_OF_WEEK,AIRLINE,FLIGHT_NUMBER,TAIL_NUMBER,ORIGIN_AIRPORT,DESTINATION_AIRPORT,SCHEDULED_DEPARTURE,...,ARRIVAL_TIME,ARRIVAL_DELAY,DIVERTED,CANCELLED,CANCELLATION_REASON,AIR_SYSTEM_DELAY,SECURITY_DELAY,AIRLINE_DELAY,LATE_AIRCRAFT_DELAY,WEATHER_DELAY
5819074,2015,12,31,4,B6,688,N657JB,LAX,BOS,2359,...,753.0,-26.0,0,0,,,,,,
5819075,2015,12,31,4,B6,745,N828JB,JFK,PSE,2359,...,430.0,-16.0,0,0,,,,,,
5819076,2015,12,31,4,B6,1503,N913JB,JFK,SJU,2359,...,432.0,-8.0,0,0,,,,,,
5819077,2015,12,31,4,B6,333,N527JB,MCO,SJU,2359,...,330.0,-10.0,0,0,,,,,,
5819078,2015,12,31,4,B6,839,N534JB,JFK,BQN,2359,...,442.0,2.0,0,0,,,,,,


In [32]:
#Create sample size for flights DataFrame to speed up model testing and evaluation

flights = flights.sample(frac = 0.1, random_state = 42)

In [33]:
#Confirm new model size has ~10% rows of main dataset

flights.shape

(581908, 31)

In [34]:
#One-hot encode categorical variables airlines, origin airport, and destination airport

flights2 = pd.get_dummies(flights, columns=['AIRLINE', 'ORIGIN_AIRPORT', 'DESTINATION_AIRPORT'])

In [35]:
#Check shape of new DataFrame to confirm one-hot encoding was successfully executed

flights2.shape

(581908, 1292)

In [36]:
#Check head of file to confirm one-hot encoding for categorical variables to be used in the analysis

flights2.head()

Unnamed: 0,YEAR,MONTH,DAY,DAY_OF_WEEK,FLIGHT_NUMBER,TAIL_NUMBER,SCHEDULED_DEPARTURE,DEPARTURE_TIME,DEPARTURE_DELAY,TAXI_OUT,...,DESTINATION_AIRPORT_TYS,DESTINATION_AIRPORT_UST,DESTINATION_AIRPORT_VEL,DESTINATION_AIRPORT_VLD,DESTINATION_AIRPORT_VPS,DESTINATION_AIRPORT_WRG,DESTINATION_AIRPORT_WYS,DESTINATION_AIRPORT_XNA,DESTINATION_AIRPORT_YAK,DESTINATION_AIRPORT_YUM
1508570,2015,4,7,2,4900,N759EV,1340,1335.0,-5.0,13.0,...,False,False,False,False,False,False,False,False,False,False
363270,2015,1,24,6,611,N413AS,1910,1858.0,-12.0,14.0,...,False,False,False,False,False,False,False,False,False,False
3003945,2015,7,8,3,1483,N463WN,630,626.0,-4.0,8.0,...,False,False,False,False,False,False,False,False,False,False
2291425,2015,5,26,2,193,N7745A,810,1043.0,153.0,6.0,...,False,False,False,False,False,False,False,False,False,False
2973084,2015,7,6,1,253,N213UA,1000,1008.0,8.0,16.0,...,False,False,False,False,False,False,False,False,False,False


In [37]:
#Drop irrelevant columns from analysis that do not intuitively impact whether or not a flight will be on time or late 

columns_to_drop = ['YEAR', 'DAY', 'FLIGHT_NUMBER', 'TAIL_NUMBER', 'DEPARTURE_DELAY', 'TAXI_OUT', 'WHEELS_OFF', 'ELAPSED_TIME', 'AIR_TIME', 'WHEELS_ON', 'TAXI_IN', 'ARRIVAL_TIME', 'DIVERTED', 'CANCELLED', 'CANCELLATION_REASON', 'AIR_SYSTEM_DELAY', 'SECURITY_DELAY', 'AIRLINE_DELAY', 'LATE_AIRCRAFT_DELAY', 'WEATHER_DELAY', 'DEPARTURE_TIME']
flights2.drop(columns = columns_to_drop, inplace = True)

In [38]:
#Clean data by dropping all rows with empty data

flights2.dropna(inplace = True)

In [39]:
#Further cleaning by casting the entire DataFrame as an integer type

flights3 = flights2.astype(int)

In [40]:
#Remove "Arrival Delay" column as this will be our dependent variable

flights_x = flights3.drop(columns = ['ARRIVAL_DELAY'])

In [43]:
#Create new DataFrame for dependent variable

flights_y = flights3['ARRIVAL_DELAY']
flights_y = pd.DataFrame(flights_y)

In [44]:
#Classify flights as late if their arrival delay time is > 0 minutes (Value = 1) otherwise if on-time or early (Value = 0)

flights_y['IS_LATE'] = (flights_y['ARRIVAL_DELAY'] > 0).astype(int)

In [47]:
#Check head of file to confirm flights with negative arrival delays are classified as 0 and flights with positive values are classified as 1

flights_y.head()

Unnamed: 0,ARRIVAL_DELAY,IS_LATE
1508570,-13,0
363270,-12,0
3003945,-8,0
2291425,152,1
2973084,-2,0


In [48]:
#Drop "Arrival Delay" column from analysis

flights_y = flights_y['IS_LATE']

In [21]:
#Split the data into training and testing sets to validate approach
X_train, X_test, y_train, y_test = train_test_split(flights_x, flights_y, test_size=0.2, random_state=42)

#Initialize and train an XGBoost classifier
model = XGBClassifier()
model.fit(X_train, y_train)

#Make predictions on the test set
y_pred = model.predict(X_test)

#Evaluate the model accuracy
accuracy = accuracy_score(y_test, y_pred)
classification_report_result = classification_report(y_test, y_pred)

#Display model accuracy and classification report results
print(f"Accuracy: {accuracy:.2f}")
print("Classification Report:")
print(classification_report_result)

#Show feature based on features to determine which features impact prediction results
feature_importance = pd.DataFrame({'Feature': flights_x.columns, 'Importance': model.feature_importances_})
print("\nFeature Importance:")
print(feature_importance.sort_values(by='Importance', ascending=False))

Accuracy: 0.66
Classification Report:
              precision    recall  f1-score   support

           0       0.67      0.92      0.77     72612
           1       0.59      0.20      0.30     41659

    accuracy                           0.66    114271
   macro avg       0.63      0.56      0.54    114271
weighted avg       0.64      0.66      0.60    114271


Feature Importance:
                      Feature  Importance
9                  AIRLINE_DL    0.027583
14                 AIRLINE_NK    0.013598
19                 AIRLINE_WN    0.013580
13                 AIRLINE_MQ    0.011632
11                 AIRLINE_F9    0.010553
...                       ...         ...
499        ORIGIN_AIRPORT_LAW    0.000000
502        ORIGIN_AIRPORT_LBE    0.000000
503        ORIGIN_AIRPORT_LCH    0.000000
504        ORIGIN_AIRPORT_LEX    0.000000
1269  DESTINATION_AIRPORT_YUM    0.000000

[1270 rows x 2 columns]


In [49]:
#Create new feature set based on output of feature importance. In this case we learn that airline, origin airport, and destination airport do not have a significant impact on predicting flight delays

flights_x2 = flights_x[['SCHEDULED_ARRIVAL', 'SCHEDULED_DEPARTURE', 'DISTANCE', 'SCHEDULED_TIME', 'DAY_OF_WEEK', 'MONTH']]

In [23]:
#Split the data into training and testing sets to validate approach
X_train, X_test, y_train, y_test = train_test_split(flights_x2, flights_y, test_size=0.2, random_state=42)

#Initialize and train an XGBoost classifier
model = XGBClassifier()
model.fit(X_train, y_train)

#Make predictions on the test set
y_pred = model.predict(X_test)

#Evaluate the model accuracy
accuracy = accuracy_score(y_test, y_pred)
classification_report_result = classification_report(y_test, y_pred)

#Display model accuracy and classification report results
print(f"Accuracy: {accuracy:.2f}")
print("Classification Report:")
print(classification_report_result)

#Show feature based on features to determine which features impact prediction results
feature_importance = pd.DataFrame({'Feature': flights_x2.columns, 'Importance': model.feature_importances_})
print("\nFeature Importance:")
print(feature_importance.sort_values(by='Importance', ascending=False))

Accuracy: 0.65
Classification Report:
              precision    recall  f1-score   support

           0       0.66      0.92      0.77     72612
           1       0.55      0.17      0.25     41659

    accuracy                           0.65    114271
   macro avg       0.60      0.54      0.51    114271
weighted avg       0.62      0.65      0.58    114271


Feature Importance:
               Feature  Importance
1  SCHEDULED_DEPARTURE    0.297044
5                MONTH    0.222878
4          DAY_OF_WEEK    0.137363
2             DISTANCE    0.121850
3       SCHEDULED_TIME    0.113300
0    SCHEDULED_ARRIVAL    0.107565


In [50]:
#Check correlation of new feature set to ensure there is no multi-colinearity 

flights_x2.corr()

Unnamed: 0,SCHEDULED_ARRIVAL,SCHEDULED_DEPARTURE,DISTANCE,SCHEDULED_TIME,DAY_OF_WEEK,MONTH
SCHEDULED_ARRIVAL,1.0,0.70467,0.027449,0.03228,0.004508,-0.009825
SCHEDULED_DEPARTURE,0.70467,1.0,-0.008144,-0.015471,0.004343,0.001248
DISTANCE,0.027449,-0.008144,1.0,0.984394,0.015906,0.009673
SCHEDULED_TIME,0.03228,-0.015471,0.984394,1.0,0.014689,0.009515
DAY_OF_WEEK,0.004508,0.004343,0.015906,0.014689,1.0,-0.010111
MONTH,-0.009825,0.001248,0.009673,0.009515,-0.010111,1.0


In [54]:
#Create feature set removing "Scheduled Time" as it is highly correlated with "Distance"

flights_x3 = flights_x[['SCHEDULED_DEPARTURE', 'SCHEDULED_ARRIVAL', 'DISTANCE', 'DAY_OF_WEEK', 'MONTH']]

In [55]:
#Split the data into training and testing sets to validate approach
X_train, X_test, y_train, y_test = train_test_split(flights_x3, flights_y, test_size=0.2, random_state=42)

#Initialize and train an XGBoost classifier
model = XGBClassifier()
model.fit(X_train, y_train)

#Make predictions on the test set
y_pred = model.predict(X_test)

#Evaluate the model accuracy
accuracy = accuracy_score(y_test, y_pred)
classification_report_result = classification_report(y_test, y_pred)

#Display model accuracy and classification report results
print(f"Accuracy: {accuracy:.2f}")
print("Classification Report:")
print(classification_report_result)

#Show feature based on features to determine which features impact prediction results
feature_importance = pd.DataFrame({'Feature': flights_x3.columns, 'Importance': model.feature_importances_})
print("\nFeature Importance:")
print(feature_importance.sort_values(by='Importance', ascending=False))

Accuracy: 0.64
Classification Report:
              precision    recall  f1-score   support

           0       0.65      0.93      0.77     72612
           1       0.54      0.14      0.23     41659

    accuracy                           0.64    114271
   macro avg       0.60      0.54      0.50    114271
weighted avg       0.61      0.64      0.57    114271


Feature Importance:
               Feature  Importance
0  SCHEDULED_DEPARTURE    0.327178
4                MONTH    0.259893
3          DAY_OF_WEEK    0.169173
1    SCHEDULED_ARRIVAL    0.123690
2             DISTANCE    0.120065


In [56]:
flights_x3.corr()

Unnamed: 0,SCHEDULED_DEPARTURE,SCHEDULED_ARRIVAL,DISTANCE,DAY_OF_WEEK,MONTH
SCHEDULED_DEPARTURE,1.0,0.70467,-0.008144,0.004343,0.001248
SCHEDULED_ARRIVAL,0.70467,1.0,0.027449,0.004508,-0.009825
DISTANCE,-0.008144,0.027449,1.0,0.015906,0.009673
DAY_OF_WEEK,0.004343,0.004508,0.015906,1.0,-0.010111
MONTH,0.001248,-0.009825,0.009673,-0.010111,1.0


In [58]:
#Create two more feature sets to test whether "Scheduled Arrival" or "Scheduled Departure" time has a greater effect on model prediction

flights_x4 = flights_x[['SCHEDULED_ARRIVAL', 'DISTANCE', 'DAY_OF_WEEK', 'MONTH']]

flights_x5 = flights_x[['SCHEDULED_DEPARTURE', 'DISTANCE', 'DAY_OF_WEEK', 'MONTH']]

In [59]:
#Split the data into training and testing sets to validate approach
X_train, X_test, y_train, y_test = train_test_split(flights_x4, flights_y, test_size=0.2, random_state=42)

#Initialize and train an XGBoost classifier
model = XGBClassifier()
model.fit(X_train, y_train)

#Make predictions on the test set
y_pred = model.predict(X_test)

#Evaluate the model accuracy
accuracy = accuracy_score(y_test, y_pred)
classification_report_result = classification_report(y_test, y_pred)

#Display model accuracy and classification report results
print(f"Accuracy: {accuracy:.2f}")
print("Classification Report:")
print(classification_report_result)

#Show feature based on features to determine which features impact prediction results
feature_importance = pd.DataFrame({'Feature': flights_x4.columns, 'Importance': model.feature_importances_})
print("\nFeature Importance:")
print(feature_importance.sort_values(by='Importance', ascending=False))

Accuracy: 0.64
Classification Report:
              precision    recall  f1-score   support

           0       0.65      0.93      0.77     72612
           1       0.53      0.14      0.22     41659

    accuracy                           0.64    114271
   macro avg       0.59      0.53      0.49    114271
weighted avg       0.61      0.64      0.57    114271


Feature Importance:
             Feature  Importance
0  SCHEDULED_ARRIVAL    0.321754
3              MONTH    0.309801
2        DAY_OF_WEEK    0.208673
1           DISTANCE    0.159773


In [60]:
#Split the data into training and testing sets to validate approach
X_train, X_test, y_train, y_test = train_test_split(flights_x5, flights_y, test_size=0.2, random_state=42)

#Initialize and train an XGBoost classifier
model = XGBClassifier()
model.fit(X_train, y_train)

#Make predictions on the test set
y_pred = model.predict(X_test)

#Evaluate the model accuracy
accuracy = accuracy_score(y_test, y_pred)
classification_report_result = classification_report(y_test, y_pred)

#Display model accuracy and classification report results
print(f"Accuracy: {accuracy:.2f}")
print("Classification Report:")
print(classification_report_result)

#Show feature based on features to determine which features impact prediction results
feature_importance = pd.DataFrame({'Feature': flights_x5.columns, 'Importance': model.feature_importances_})
print("\nFeature Importance:")
print(feature_importance.sort_values(by='Importance', ascending=False))

Accuracy: 0.64
Classification Report:
              precision    recall  f1-score   support

           0       0.65      0.93      0.77     72612
           1       0.54      0.14      0.22     41659

    accuracy                           0.64    114271
   macro avg       0.60      0.54      0.49    114271
weighted avg       0.61      0.64      0.57    114271


Feature Importance:
               Feature  Importance
0  SCHEDULED_DEPARTURE    0.342488
3                MONTH    0.313034
2          DAY_OF_WEEK    0.198500
1             DISTANCE    0.145977


In [61]:
flights_x4.corr()

Unnamed: 0,SCHEDULED_ARRIVAL,DISTANCE,DAY_OF_WEEK,MONTH
SCHEDULED_ARRIVAL,1.0,0.027449,0.004508,-0.009825
DISTANCE,0.027449,1.0,0.015906,0.009673
DAY_OF_WEEK,0.004508,0.015906,1.0,-0.010111
MONTH,-0.009825,0.009673,-0.010111,1.0


In [62]:
flights_x5.corr()

Unnamed: 0,SCHEDULED_DEPARTURE,DISTANCE,DAY_OF_WEEK,MONTH
SCHEDULED_DEPARTURE,1.0,-0.008144,0.004343,0.001248
DISTANCE,-0.008144,1.0,0.015906,0.009673
DAY_OF_WEEK,0.004343,0.015906,1.0,-0.010111
MONTH,0.001248,0.009673,-0.010111,1.0


Based on the results we see above there is no significant benefit between variables on either model accuracy or prediction. Based on this we will go with Feature Set #5 (Scheduled Departure) as the final model. The next step is to tune the hyperparameters to see if we can improve model accuracy

In [63]:
#Experimenting with hyperparameter tuning of the model

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(flights_x5, flights_y, test_size=0.2, random_state=42)

# Initialize and train an XGBoost classifier
model2 = XGBClassifier(learning_rate = 0.001, max_depth = 10, n_estimators = 10000)
model2.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model2.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
classification_report_result = classification_report(y_test, y_pred)

# Display results
print(f"Accuracy: {accuracy:.2f}")
print("Classification Report:")
print(classification_report_result)

# Feature Importance
feature_importance = pd.DataFrame({'Feature': flights_x5.columns, 'Importance': model.feature_importances_})
print("\nFeature Importance:")
print(feature_importance.sort_values(by='Importance', ascending=False))

Accuracy: 0.64
Classification Report:
              precision    recall  f1-score   support

           0       0.66      0.92      0.77     72612
           1       0.54      0.15      0.24     41659

    accuracy                           0.64    114271
   macro avg       0.60      0.54      0.50    114271
weighted avg       0.61      0.64      0.58    114271


Feature Importance:
               Feature  Importance
0  SCHEDULED_DEPARTURE    0.342488
3                MONTH    0.313034
2          DAY_OF_WEEK    0.198500
1             DISTANCE    0.145977


Based on the results of the model that had its hyperparameters tuned we can see that it does not improve overall accuracy, giving confidence in the accuracy results of our overall model