In [33]:
#Import base packages for analysis

import pandas as pd
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report

In [43]:
#Import 2015 flight data from CSV file

dtype_options = {'AIRLINE': 'object', 'TAIL_NUMBER': 'object', 'ORIGIN_AIRPORT': 'object', 'DESTINATION_AIRPORT': 'object'}
flights = pd.read_csv('Data/flights.csv', dtype = dtype_options)

In [44]:
#Check shape of flight file

flights.shape

(5819079, 31)

In [45]:
#Check head of file to ensure dates begin on January 01, 2015

flights.head()

Unnamed: 0,YEAR,MONTH,DAY,DAY_OF_WEEK,AIRLINE,FLIGHT_NUMBER,TAIL_NUMBER,ORIGIN_AIRPORT,DESTINATION_AIRPORT,SCHEDULED_DEPARTURE,...,ARRIVAL_TIME,ARRIVAL_DELAY,DIVERTED,CANCELLED,CANCELLATION_REASON,AIR_SYSTEM_DELAY,SECURITY_DELAY,AIRLINE_DELAY,LATE_AIRCRAFT_DELAY,WEATHER_DELAY
0,2015,1,1,4,AS,98,N407AS,ANC,SEA,5,...,408.0,-22.0,0,0,,,,,,
1,2015,1,1,4,AA,2336,N3KUAA,LAX,PBI,10,...,741.0,-9.0,0,0,,,,,,
2,2015,1,1,4,US,840,N171US,SFO,CLT,20,...,811.0,5.0,0,0,,,,,,
3,2015,1,1,4,AA,258,N3HYAA,LAX,MIA,20,...,756.0,-9.0,0,0,,,,,,
4,2015,1,1,4,AS,135,N527AS,SEA,ANC,25,...,259.0,-21.0,0,0,,,,,,


In [46]:
#Check tail of file to ensure dates end on December 31, 2015

flights.tail()

Unnamed: 0,YEAR,MONTH,DAY,DAY_OF_WEEK,AIRLINE,FLIGHT_NUMBER,TAIL_NUMBER,ORIGIN_AIRPORT,DESTINATION_AIRPORT,SCHEDULED_DEPARTURE,...,ARRIVAL_TIME,ARRIVAL_DELAY,DIVERTED,CANCELLED,CANCELLATION_REASON,AIR_SYSTEM_DELAY,SECURITY_DELAY,AIRLINE_DELAY,LATE_AIRCRAFT_DELAY,WEATHER_DELAY
5819074,2015,12,31,4,B6,688,N657JB,LAX,BOS,2359,...,753.0,-26.0,0,0,,,,,,
5819075,2015,12,31,4,B6,745,N828JB,JFK,PSE,2359,...,430.0,-16.0,0,0,,,,,,
5819076,2015,12,31,4,B6,1503,N913JB,JFK,SJU,2359,...,432.0,-8.0,0,0,,,,,,
5819077,2015,12,31,4,B6,333,N527JB,MCO,SJU,2359,...,330.0,-10.0,0,0,,,,,,
5819078,2015,12,31,4,B6,839,N534JB,JFK,BQN,2359,...,442.0,2.0,0,0,,,,,,


### Conducting EDA ####

The next phase in the project is to conduct EDA on the overall dataset to inform which columns can be removed prior to model building and feature analysis

In [47]:
#Check which columns have too many rows of empty data

flights.isna().sum()

YEAR                         0
MONTH                        0
DAY                          0
DAY_OF_WEEK                  0
AIRLINE                      0
FLIGHT_NUMBER                0
TAIL_NUMBER              14721
ORIGIN_AIRPORT               0
DESTINATION_AIRPORT          0
SCHEDULED_DEPARTURE          0
DEPARTURE_TIME           86153
DEPARTURE_DELAY          86153
TAXI_OUT                 89047
WHEELS_OFF               89047
SCHEDULED_TIME               6
ELAPSED_TIME            105071
AIR_TIME                105071
DISTANCE                     0
WHEELS_ON                92513
TAXI_IN                  92513
SCHEDULED_ARRIVAL            0
ARRIVAL_TIME             92513
ARRIVAL_DELAY           105071
DIVERTED                     0
CANCELLED                    0
CANCELLATION_REASON    5729195
AIR_SYSTEM_DELAY       4755640
SECURITY_DELAY         4755640
AIRLINE_DELAY          4755640
LATE_AIRCRAFT_DELAY    4755640
WEATHER_DELAY          4755640
dtype: int64

In [51]:
#Based on analysis above, we can remove the columns below as they are mostly blank and will delete the entire dataset of we clean all columns without data. Also removing columns such as "Airline", "Origin Airport", "Destination Airport", and "Tail Number" such those aren't numerical and cannot be part of EDA

columns_to_drop = ['AIRLINE', 'ORIGIN_AIRPORT', 'DESTINATION_AIRPORT', 'TAIL_NUMBER', 'CANCELLATION_REASON', 'AIR_SYSTEM_DELAY', 'SECURITY_DELAY', 'AIRLINE_DELAY', 'LATE_AIRCRAFT_DELAY', 'WEATHER_DELAY', 'DEPARTURE_TIME']
flights.drop(columns = columns_to_drop, inplace = True)

KeyError: "['AIRLINE', 'ORIGIN_AIRPORT', 'DESTINATION_AIRPORT', 'TAIL_NUMBER', 'CANCELLATION_REASON', 'AIR_SYSTEM_DELAY', 'SECURITY_DELAY', 'AIRLINE_DELAY', 'LATE_AIRCRAFT_DELAY', 'WEATHER_DELAY', 'DEPARTURE_TIME'] not found in axis"

In [49]:
#Clean data by dropping all rows with empty data

flights.dropna(inplace = True)

In [50]:
flights = flights.astype(int)

In [52]:
flights.corr()

Unnamed: 0,YEAR,MONTH,DAY,DAY_OF_WEEK,FLIGHT_NUMBER,SCHEDULED_DEPARTURE,DEPARTURE_DELAY,TAXI_OUT,WHEELS_OFF,SCHEDULED_TIME,ELAPSED_TIME,AIR_TIME,DISTANCE,WHEELS_ON,TAXI_IN,SCHEDULED_ARRIVAL,ARRIVAL_TIME,ARRIVAL_DELAY,DIVERTED,CANCELLED
YEAR,,,,,,,,,,,,,,,,,,,,
MONTH,,1.0,0.005315,-0.009427,-0.019988,-0.000214,-0.021827,-0.013069,-0.004913,0.010337,0.001871,0.003379,0.010612,-0.00895,0.001937,-0.010361,-0.009164,-0.036793,,
DAY,,0.005315,1.0,0.002099,0.002916,-0.001793,-0.000257,-0.002594,-0.003322,0.002888,0.001419,0.001923,0.002977,-0.00432,-0.001868,-0.003878,-0.004335,-0.003097,,
DAY_OF_WEEK,,-0.009427,0.002099,1.0,0.015153,0.007737,-0.01145,-0.020648,0.004048,0.014611,0.01151,0.014284,0.015931,0.005617,0.001034,0.006874,0.005621,-0.017027,,
FLIGHT_NUMBER,,-0.019988,0.002916,0.015153,1.0,-0.006971,-0.008889,0.050418,0.005302,-0.316079,-0.306611,-0.319681,-0.329824,-0.006199,-0.019736,-0.015119,-0.002896,0.018419,,
SCHEDULED_DEPARTURE,,-0.000214,-0.001793,0.007737,-0.006971,1.0,0.110419,0.006551,0.938134,-0.015846,-0.017935,-0.015756,-0.008854,0.657801,-0.044686,0.705042,0.631086,0.10022,,
DEPARTURE_DELAY,,-0.021827,-0.000257,-0.01145,-0.008889,0.110419,1.0,0.058423,0.162664,0.027582,0.030805,0.023495,0.023957,0.059091,0.012479,0.097689,0.049652,0.944672,,
TAXI_OUT,,-0.013069,-0.002594,-0.020648,0.050418,0.006551,0.058423,1.0,0.039007,0.112274,0.205195,0.087608,0.072284,0.032684,0.003065,0.024307,0.029863,0.227319,,
WHEELS_OFF,,-0.004913,-0.003322,0.004048,0.005302,0.938134,0.162664,0.039007,1.0,-0.028287,-0.027133,-0.029654,-0.027562,0.697496,-0.038824,0.72184,0.669799,0.155773,,
SCHEDULED_TIME,,0.010337,0.002888,0.014611,-0.316079,-0.015846,0.027582,0.112274,-0.028287,1.0,0.98526,0.990749,0.984438,0.02351,0.09919,0.032322,0.020561,-0.030029,,


Based on the correlation matrix above, it supports dropping highly correlated parameters such as "Taxi Out", "Wheels Off", "Elapsed Time", "Air Time", "Wheels On", "Taxi In", "Arrival Time" along with the additional parameters that were previously dropped. The next step of the project is to proceed with this and build a sample model for testing

### Sample Model Building ####

In this step we aim to create a sample test model and explore what features are best for predicting which flights are delayed and/or on time

In [53]:
#Import 2015 flight data from CSV file

dtype_options = {'AIRLINE': 'object', 'TAIL_NUMBER': 'object', 'ORIGIN_AIRPORT': 'object', 'DESTINATION_AIRPORT': 'object'}
flights = pd.read_csv('Data/flights.csv', dtype = dtype_options)

In [54]:
#Create sample size for flights DataFrame to speed up model testing and evaluation

flights = flights.sample(frac = 0.1, random_state = 42)

In [55]:
#Confirm new model size has ~10% rows of main dataset

flights.shape

(581908, 31)

In [56]:
#One-hot encode categorical variables airlines, origin airport, and destination airport

flights2 = pd.get_dummies(flights, columns=['AIRLINE', 'ORIGIN_AIRPORT', 'DESTINATION_AIRPORT'])

In [57]:
#Check shape of new DataFrame to confirm one-hot encoding was successfully executed

flights2.shape

(581908, 1292)

In [58]:
#Check head of file to confirm one-hot encoding for categorical variables to be used in the analysis

flights2.head()

Unnamed: 0,YEAR,MONTH,DAY,DAY_OF_WEEK,FLIGHT_NUMBER,TAIL_NUMBER,SCHEDULED_DEPARTURE,DEPARTURE_TIME,DEPARTURE_DELAY,TAXI_OUT,...,DESTINATION_AIRPORT_TYS,DESTINATION_AIRPORT_UST,DESTINATION_AIRPORT_VEL,DESTINATION_AIRPORT_VLD,DESTINATION_AIRPORT_VPS,DESTINATION_AIRPORT_WRG,DESTINATION_AIRPORT_WYS,DESTINATION_AIRPORT_XNA,DESTINATION_AIRPORT_YAK,DESTINATION_AIRPORT_YUM
1508570,2015,4,7,2,4900,N759EV,1340,1335.0,-5.0,13.0,...,False,False,False,False,False,False,False,False,False,False
363270,2015,1,24,6,611,N413AS,1910,1858.0,-12.0,14.0,...,False,False,False,False,False,False,False,False,False,False
3003945,2015,7,8,3,1483,N463WN,630,626.0,-4.0,8.0,...,False,False,False,False,False,False,False,False,False,False
2291425,2015,5,26,2,193,N7745A,810,1043.0,153.0,6.0,...,False,False,False,False,False,False,False,False,False,False
2973084,2015,7,6,1,253,N213UA,1000,1008.0,8.0,16.0,...,False,False,False,False,False,False,False,False,False,False


In [59]:
#Drop irrelevant columns from analysis that do not intuitively impact whether or not a flight will be on time or late 

columns_to_drop = ['YEAR', 'DAY', 'FLIGHT_NUMBER', 'TAIL_NUMBER', 'DEPARTURE_DELAY', 'TAXI_OUT', 'WHEELS_OFF', 'ELAPSED_TIME', 'AIR_TIME', 'WHEELS_ON', 'TAXI_IN', 'ARRIVAL_TIME', 'DIVERTED', 'CANCELLED', 'CANCELLATION_REASON', 'AIR_SYSTEM_DELAY', 'SECURITY_DELAY', 'AIRLINE_DELAY', 'LATE_AIRCRAFT_DELAY', 'WEATHER_DELAY', 'DEPARTURE_TIME']
flights2.drop(columns = columns_to_drop, inplace = True)

In [60]:
#Clean data by dropping all rows with empty data

flights2.dropna(inplace = True)

In [61]:
#Further cleaning by casting the entire DataFrame as an integer type

flights3 = flights2.astype(int)

In [62]:
#Remove "Arrival Delay" column as this will be our dependent variable

flights_x = flights3.drop(columns = ['ARRIVAL_DELAY'])

In [63]:
#Create new DataFrame for dependent variable

flights_y = flights3['ARRIVAL_DELAY']
flights_y = pd.DataFrame(flights_y)

In [64]:
#Classify flights as late if their arrival delay time is > 0 minutes (Value = 1) otherwise if on-time or early (Value = 0)

flights_y['IS_LATE'] = (flights_y['ARRIVAL_DELAY'] > 0).astype(int)

In [65]:
#Check head of file to confirm flights with negative arrival delays are classified as 0 and flights with positive values are classified as 1

flights_y.head()

Unnamed: 0,ARRIVAL_DELAY,IS_LATE
1508570,-13,0
363270,-12,0
3003945,-8,0
2291425,152,1
2973084,-2,0


In [66]:
#Drop "Arrival Delay" column from analysis

flights_y = flights_y['IS_LATE']

In [67]:
#Split the data into training and testing sets to validate approach
X_train, X_test, y_train, y_test = train_test_split(flights_x, flights_y, test_size=0.2, random_state=42)

#Initialize and train an XGBoost classifier
model = XGBClassifier()
model.fit(X_train, y_train)

#Make predictions on the test set
y_pred = model.predict(X_test)

#Evaluate the model accuracy
accuracy = accuracy_score(y_test, y_pred)
classification_report_result = classification_report(y_test, y_pred)

#Display model accuracy and classification report results
print(f"Accuracy: {accuracy:.2f}")
print("Classification Report:")
print(classification_report_result)

#Show feature based on features to determine which features impact prediction results
feature_importance = pd.DataFrame({'Feature': flights_x.columns, 'Importance': model.feature_importances_})
print("\nFeature Importance:")
print(feature_importance.sort_values(by='Importance', ascending=False))

Accuracy: 0.66
Classification Report:
              precision    recall  f1-score   support

           0       0.67      0.92      0.77     72612
           1       0.59      0.20      0.30     41659

    accuracy                           0.66    114271
   macro avg       0.63      0.56      0.54    114271
weighted avg       0.64      0.66      0.60    114271


Feature Importance:
                      Feature  Importance
9                  AIRLINE_DL    0.027583
14                 AIRLINE_NK    0.013598
19                 AIRLINE_WN    0.013580
13                 AIRLINE_MQ    0.011632
11                 AIRLINE_F9    0.010553
...                       ...         ...
499        ORIGIN_AIRPORT_LAW    0.000000
502        ORIGIN_AIRPORT_LBE    0.000000
503        ORIGIN_AIRPORT_LCH    0.000000
504        ORIGIN_AIRPORT_LEX    0.000000
1269  DESTINATION_AIRPORT_YUM    0.000000

[1270 rows x 2 columns]


In [68]:
#Create new feature set based on output of feature importance. In this case we learn that airline, origin airport, and destination airport do not have a significant impact on predicting flight delays

flights_x2 = flights_x[['SCHEDULED_ARRIVAL', 'SCHEDULED_DEPARTURE', 'DISTANCE', 'SCHEDULED_TIME', 'DAY_OF_WEEK', 'MONTH']]

In [69]:
#Split the data into training and testing sets to validate approach
X_train, X_test, y_train, y_test = train_test_split(flights_x2, flights_y, test_size=0.2, random_state=42)

#Initialize and train an XGBoost classifier
model = XGBClassifier()
model.fit(X_train, y_train)

#Make predictions on the test set
y_pred = model.predict(X_test)

#Evaluate the model accuracy
accuracy = accuracy_score(y_test, y_pred)
classification_report_result = classification_report(y_test, y_pred)

#Display model accuracy and classification report results
print(f"Accuracy: {accuracy:.2f}")
print("Classification Report:")
print(classification_report_result)

#Show feature based on features to determine which features impact prediction results
feature_importance = pd.DataFrame({'Feature': flights_x2.columns, 'Importance': model.feature_importances_})
print("\nFeature Importance:")
print(feature_importance.sort_values(by='Importance', ascending=False))

Accuracy: 0.65
Classification Report:
              precision    recall  f1-score   support

           0       0.66      0.92      0.77     72612
           1       0.55      0.17      0.25     41659

    accuracy                           0.65    114271
   macro avg       0.60      0.54      0.51    114271
weighted avg       0.62      0.65      0.58    114271


Feature Importance:
               Feature  Importance
1  SCHEDULED_DEPARTURE    0.297044
5                MONTH    0.222878
4          DAY_OF_WEEK    0.137363
2             DISTANCE    0.121850
3       SCHEDULED_TIME    0.113300
0    SCHEDULED_ARRIVAL    0.107565


In [70]:
#Check correlation of new feature set to ensure there is no multi-colinearity 

flights_x2.corr()

Unnamed: 0,SCHEDULED_ARRIVAL,SCHEDULED_DEPARTURE,DISTANCE,SCHEDULED_TIME,DAY_OF_WEEK,MONTH
SCHEDULED_ARRIVAL,1.0,0.70467,0.027449,0.03228,0.004508,-0.009825
SCHEDULED_DEPARTURE,0.70467,1.0,-0.008144,-0.015471,0.004343,0.001248
DISTANCE,0.027449,-0.008144,1.0,0.984394,0.015906,0.009673
SCHEDULED_TIME,0.03228,-0.015471,0.984394,1.0,0.014689,0.009515
DAY_OF_WEEK,0.004508,0.004343,0.015906,0.014689,1.0,-0.010111
MONTH,-0.009825,0.001248,0.009673,0.009515,-0.010111,1.0


In [71]:
#Create feature set removing "Scheduled Time" as it is highly correlated with "Distance"

flights_x3 = flights_x[['SCHEDULED_DEPARTURE', 'SCHEDULED_ARRIVAL', 'DISTANCE', 'DAY_OF_WEEK', 'MONTH']]

In [72]:
#Split the data into training and testing sets to validate approach
X_train, X_test, y_train, y_test = train_test_split(flights_x3, flights_y, test_size=0.2, random_state=42)

#Initialize and train an XGBoost classifier
model = XGBClassifier()
model.fit(X_train, y_train)

#Make predictions on the test set
y_pred = model.predict(X_test)

#Evaluate the model accuracy
accuracy = accuracy_score(y_test, y_pred)
classification_report_result = classification_report(y_test, y_pred)

#Display model accuracy and classification report results
print(f"Accuracy: {accuracy:.2f}")
print("Classification Report:")
print(classification_report_result)

#Show feature based on features to determine which features impact prediction results
feature_importance = pd.DataFrame({'Feature': flights_x3.columns, 'Importance': model.feature_importances_})
print("\nFeature Importance:")
print(feature_importance.sort_values(by='Importance', ascending=False))

Accuracy: 0.64
Classification Report:
              precision    recall  f1-score   support

           0       0.65      0.93      0.77     72612
           1       0.54      0.14      0.23     41659

    accuracy                           0.64    114271
   macro avg       0.60      0.54      0.50    114271
weighted avg       0.61      0.64      0.57    114271


Feature Importance:
               Feature  Importance
0  SCHEDULED_DEPARTURE    0.327178
4                MONTH    0.259893
3          DAY_OF_WEEK    0.169173
1    SCHEDULED_ARRIVAL    0.123690
2             DISTANCE    0.120065


In [73]:
flights_x3.corr()

Unnamed: 0,SCHEDULED_DEPARTURE,SCHEDULED_ARRIVAL,DISTANCE,DAY_OF_WEEK,MONTH
SCHEDULED_DEPARTURE,1.0,0.70467,-0.008144,0.004343,0.001248
SCHEDULED_ARRIVAL,0.70467,1.0,0.027449,0.004508,-0.009825
DISTANCE,-0.008144,0.027449,1.0,0.015906,0.009673
DAY_OF_WEEK,0.004343,0.004508,0.015906,1.0,-0.010111
MONTH,0.001248,-0.009825,0.009673,-0.010111,1.0


In [74]:
#Create two more feature sets to test whether "Scheduled Arrival" or "Scheduled Departure" time has a greater effect on model prediction

flights_x4 = flights_x[['SCHEDULED_ARRIVAL', 'DISTANCE', 'DAY_OF_WEEK', 'MONTH']]

flights_x5 = flights_x[['SCHEDULED_DEPARTURE', 'DISTANCE', 'DAY_OF_WEEK', 'MONTH']]

In [75]:
#Split the data into training and testing sets to validate approach
X_train, X_test, y_train, y_test = train_test_split(flights_x4, flights_y, test_size=0.2, random_state=42)

#Initialize and train an XGBoost classifier
model = XGBClassifier()
model.fit(X_train, y_train)

#Make predictions on the test set
y_pred = model.predict(X_test)

#Evaluate the model accuracy
accuracy = accuracy_score(y_test, y_pred)
classification_report_result = classification_report(y_test, y_pred)

#Display model accuracy and classification report results
print(f"Accuracy: {accuracy:.2f}")
print("Classification Report:")
print(classification_report_result)

#Show feature based on features to determine which features impact prediction results
feature_importance = pd.DataFrame({'Feature': flights_x4.columns, 'Importance': model.feature_importances_})
print("\nFeature Importance:")
print(feature_importance.sort_values(by='Importance', ascending=False))

Accuracy: 0.64
Classification Report:
              precision    recall  f1-score   support

           0       0.65      0.93      0.77     72612
           1       0.53      0.14      0.22     41659

    accuracy                           0.64    114271
   macro avg       0.59      0.53      0.49    114271
weighted avg       0.61      0.64      0.57    114271


Feature Importance:
             Feature  Importance
0  SCHEDULED_ARRIVAL    0.321754
3              MONTH    0.309801
2        DAY_OF_WEEK    0.208673
1           DISTANCE    0.159773


In [76]:
#Split the data into training and testing sets to validate approach
X_train, X_test, y_train, y_test = train_test_split(flights_x5, flights_y, test_size=0.2, random_state=42)

#Initialize and train an XGBoost classifier
model = XGBClassifier()
model.fit(X_train, y_train)

#Make predictions on the test set
y_pred = model.predict(X_test)

#Evaluate the model accuracy
accuracy = accuracy_score(y_test, y_pred)
classification_report_result = classification_report(y_test, y_pred)

#Display model accuracy and classification report results
print(f"Accuracy: {accuracy:.2f}")
print("Classification Report:")
print(classification_report_result)

#Show feature based on features to determine which features impact prediction results
feature_importance = pd.DataFrame({'Feature': flights_x5.columns, 'Importance': model.feature_importances_})
print("\nFeature Importance:")
print(feature_importance.sort_values(by='Importance', ascending=False))

Accuracy: 0.64
Classification Report:
              precision    recall  f1-score   support

           0       0.65      0.93      0.77     72612
           1       0.54      0.14      0.22     41659

    accuracy                           0.64    114271
   macro avg       0.60      0.54      0.49    114271
weighted avg       0.61      0.64      0.57    114271


Feature Importance:
               Feature  Importance
0  SCHEDULED_DEPARTURE    0.342488
3                MONTH    0.313034
2          DAY_OF_WEEK    0.198500
1             DISTANCE    0.145977


In [77]:
flights_x4.corr()

Unnamed: 0,SCHEDULED_ARRIVAL,DISTANCE,DAY_OF_WEEK,MONTH
SCHEDULED_ARRIVAL,1.0,0.027449,0.004508,-0.009825
DISTANCE,0.027449,1.0,0.015906,0.009673
DAY_OF_WEEK,0.004508,0.015906,1.0,-0.010111
MONTH,-0.009825,0.009673,-0.010111,1.0


In [78]:
flights_x5.corr()

Unnamed: 0,SCHEDULED_DEPARTURE,DISTANCE,DAY_OF_WEEK,MONTH
SCHEDULED_DEPARTURE,1.0,-0.008144,0.004343,0.001248
DISTANCE,-0.008144,1.0,0.015906,0.009673
DAY_OF_WEEK,0.004343,0.015906,1.0,-0.010111
MONTH,0.001248,0.009673,-0.010111,1.0


Based on the results we see above there is no significant benefit between variables on either model accuracy or prediction. Based on this we will go with Feature Set #5 (Scheduled Departure) as the final model. The next step is to tune the hyperparameters to see if we can improve model accuracy

In [79]:
#Experimenting with hyperparameter tuning of the model

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(flights_x5, flights_y, test_size=0.2, random_state=42)

# Initialize and train an XGBoost classifier
model2 = XGBClassifier(learning_rate = 0.001, max_depth = 10, n_estimators = 10000)
model2.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model2.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
classification_report_result = classification_report(y_test, y_pred)

# Display results
print(f"Accuracy: {accuracy:.2f}")
print("Classification Report:")
print(classification_report_result)

# Feature Importance
feature_importance = pd.DataFrame({'Feature': flights_x5.columns, 'Importance': model.feature_importances_})
print("\nFeature Importance:")
print(feature_importance.sort_values(by='Importance', ascending=False))

Accuracy: 0.64
Classification Report:
              precision    recall  f1-score   support

           0       0.66      0.92      0.77     72612
           1       0.54      0.15      0.24     41659

    accuracy                           0.64    114271
   macro avg       0.60      0.54      0.50    114271
weighted avg       0.61      0.64      0.58    114271


Feature Importance:
               Feature  Importance
0  SCHEDULED_DEPARTURE    0.342488
3                MONTH    0.313034
2          DAY_OF_WEEK    0.198500
1             DISTANCE    0.145977


Based on the results of the model that had its hyperparameters tuned we can see that it does not improve overall accuracy, giving confidence in the accuracy results of our overall model

In [103]:
#Validate classification accuracy with manual test

test_df = pd.DataFrame({'Y_Pred': y_pred, 'Y_Test': y_test})

In [104]:
test_df['Accuracy'] = test_df['Y_Pred'] == test_df['Y_Test']
test_df['Accuracy'] = test_df['Accuracy'].astype(int)
test_df['Accuracy'].mean()

0.6434878490605666

Based on the output, we can see that we are getting approximately the same accuracy of 0.64 or 64% prediction accuracy

### Final Model Creation

In [105]:
#Import 2015 flight data from CSV file

dtype_options = {'AIRLINE': 'object', 'TAIL_NUMBER': 'object', 'ORIGIN_AIRPORT': 'object', 'DESTINATION_AIRPORT': 'object'}
flights = pd.read_csv('Data/flights.csv', dtype = dtype_options)

In [120]:
#Create final feature set based on testing parameters for full model evaluation

flights_final = flights[['ARRIVAL_DELAY', 'SCHEDULED_DEPARTURE', 'DISTANCE', 'DAY_OF_WEEK', 'MONTH']]

In [121]:
#Confirm no missing values or datatype errors are present

flights_final.isna().sum()

ARRIVAL_DELAY          105071
SCHEDULED_DEPARTURE         0
DISTANCE                    0
DAY_OF_WEEK                 0
MONTH                       0
dtype: int64

In [131]:
#Create feature set for independent variable

ffx = flights_final[['SCHEDULED_DEPARTURE', 'DISTANCE', 'DAY_OF_WEEK', 'MONTH']]

In [134]:
#Create output set for dependent variable

ffy = flights_final['ARRIVAL_DELAY']

In [135]:
#Create new DataFrame for dependent variable

ffy = pd.DataFrame(ffy)

In [136]:
#Classify flights as late if their arrival delay time is > 0 minutes (Value = 1) otherwise if on-time or early (Value = 0)

ffy['IS_LATE'] = (ffy['ARRIVAL_DELAY'] > 0).astype(int)

In [137]:
#Check head of file to confirm flights with negative arrival delays are classified as 0 and flights with positive values are classified as 1

ffy.head()

Unnamed: 0,ARRIVAL_DELAY,IS_LATE
0,-22.0,0
1,-9.0,0
2,5.0,1
3,-9.0,0
4,-21.0,0


In [138]:
#Drop "Arrival Delay" column from analysis

ffy = ffy['IS_LATE']

In [139]:
#Split the data into training and testing sets to validate approach
X_train, X_test, y_train, y_test = train_test_split(ffx, ffy, test_size=0.2, random_state=42)

#Initialize and train an XGBoost classifier
model = XGBClassifier()
model.fit(X_train, y_train)

#Make predictions on the test set
y_pred = model.predict(X_test)

#Evaluate the model accuracy
accuracy = accuracy_score(y_test, y_pred)
classification_report_result = classification_report(y_test, y_pred)

#Display model accuracy and classification report results
print(f"Accuracy: {accuracy:.2f}")
print("Classification Report:")
print(classification_report_result)

#Show feature based on features to determine which features impact prediction results
feature_importance = pd.DataFrame({'Feature': ffx.columns, 'Importance': model.feature_importances_})
print("\nFeature Importance:")
print(feature_importance.sort_values(by='Importance', ascending=False))

Accuracy: 0.65
Classification Report:
              precision    recall  f1-score   support

           0       0.66      0.95      0.78    746293
           1       0.55      0.11      0.18    417523

    accuracy                           0.65   1163816
   macro avg       0.60      0.53      0.48   1163816
weighted avg       0.62      0.65      0.56   1163816


Feature Importance:
               Feature  Importance
0  SCHEDULED_DEPARTURE    0.357510
3                MONTH    0.293418
2          DAY_OF_WEEK    0.240834
1             DISTANCE    0.108238
