In [7]:
import pandas as pd

In [20]:
data = pd.read_csv(r'D:\Business Analytics\Capstone Project\Datasets\flight_data.csv')

In [21]:
data.head()

Unnamed: 0,sched_dep_time,carrier,dep_time,dest,distance,Date,flight,origin,Weather,Weekday,tailnum,Flight status
0,515,UA,517,IAH,1400,01-01-2013,1545,EWR,1,2,N14228,Delayed
1,529,UA,533,IAH,1416,01-01-2013,1714,LGA,1,2,N24211,Delayed
2,540,AA,542,MIA,1089,01-01-2013,1141,JFK,0,2,N619AA,Delayed
3,545,B6,544,BQN,1576,01-01-2013,725,JFK,0,2,N804JB,Delayed
4,600,DL,554,ATL,762,01-01-2013,461,LGA,1,2,N668DN,Delayed


In [22]:
data.describe()

Unnamed: 0,sched_dep_time,dep_time,distance,flight,Weather,Weekday
count,328521.0,328521.0,328521.0,328521.0,328521.0,328521.0
mean,1340.535762,1349.109947,1048.5706,1944.502893,0.500196,3.895492
std,467.2843,488.281791,735.908891,1621.914343,0.500001,1.988307
min,500.0,1.0,80.0,1.0,0.0,1.0
25%,905.0,907.0,509.0,544.0,0.0,2.0
50%,1355.0,1401.0,888.0,1471.0,1.0,4.0
75%,1729.0,1744.0,1389.0,3416.0,1.0,6.0
max,2359.0,2400.0,4983.0,8500.0,1.0,7.0


In [23]:
## Import required packages
%matplotlib inline

from pathlib import Path

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
import matplotlib.pylab as plt
from dmba import classificationSummary, gainsChart


# Creating Dummies & Partitioning Data

In [25]:

# convert to categorical
data.Weekday = data.Weekday.astype('category')
data['Flight status'] = data['Flight status'].astype('category')

# create hourly bins departure time 
data.dep_time= [round(t / 100) for t in data.dep_time]
data.dep_time = data.dep_time.astype('category')

predictors = ['Weekday', 'dep_time', 'origin', 'dest', 'carrier']
outcome = 'Flight status'

X = pd.get_dummies(data[predictors])
y = data['Flight status']
classes = list(y.cat.categories)

# split into training and validation
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.40, random_state=1)

# run naive Bayes
data_nb = MultinomialNB(alpha=0.01)
data_nb.fit(X_train, y_train)

# predict probabilities
predProb_train = data_nb.predict_proba(X_train)
predProb_valid = data_nb.predict_proba(X_valid)

# predict class membership
y_valid_pred = data_nb.predict(X_valid)
y_train_pred = data_nb.predict(X_train)

# Conditional Probabilities

In [33]:
# split the original data frame into a train and test using the same random_state
train_df, valid_df = train_test_split(data, test_size=0.4, random_state=1)

pd.set_option('precision', 4)
# probability of flight status
print(train_df['Flight status'].value_counts() / len(train_df))
print()

for predictor in predictors:
    # construct the frequency table
    df = train_df[['Flight status', predictor]]
    freqTable = df.pivot_table(index='Flight status', columns=predictor, aggfunc=len)

    # divide each row by the sum of the row to get conditional probabilities
    propTable = freqTable.apply(lambda x: x / sum(x), axis=1)
    print(propTable)
    print()
pd.reset_option('precision')

Delayed    0.9495
Ontime     0.0505
Name: Flight status, dtype: float64

Weekday             1       2       3       4       5       6       7
Flight status                                                        
Delayed        0.1508  0.1505  0.1488  0.1472  0.1485  0.1152  0.1391
Ontime         0.1442  0.1498  0.1476  0.1483  0.1452  0.1260  0.1388

dep_time         0
Flight status     
Delayed        1.0
Ontime         1.0

origin            EWR     JFK     LGA
Flight status                        
Delayed        0.3596  0.3303  0.3101
Ontime         0.3379  0.3777  0.2844

dest              ABQ     ACK     ALB         ANC     ATL     AUS     AVL  \
Flight status                                                               
Delayed        0.0008  0.0008  0.0013  1.0686e-05  0.0508  0.0075  0.0008   
Ontime            NaN     NaN     NaN         NaN     NaN     NaN     NaN   

dest              BDL     BGR     BHM  ...     SNA     SRQ     STL     STT  \
Flight status                

# Naive Bayes Probabilities 

In [34]:
# P(delayed | Carrier = DL, Day_Week = 7, Dep_Time = 10, Dest = LGA, Origin = DCA)
P_hat_Delayed = 0.0958 * 0.1609 * 0.0307 * 0.4215 * 0.5211 * 0.1977
# P(ontime | Carrier = DL, Day_Week = 7, Dep_Time = 10, Dest = LGA, Origin = DCA)
P_hat_Ontime = 0.2040 * 0.1048 * 0.0519 * 0.5779 * 0.6478 * 0.8023
print('P_hat_Delayed ~ ', P_hat_Delayed)
print('P_hat_Ontime ~ ', P_hat_Ontime)

print('P(Delayed|...) = ', P_hat_Delayed / (P_hat_Delayed + P_hat_Ontime))
print('P(Ontime|...) = ', P_hat_Ontime / (P_hat_Delayed + P_hat_Ontime))

P_hat_Delayed ~  2.0548742506526157e-05
P_hat_Ontime ~  0.00033326464123921066
P(Delayed|...) =  0.05807791183301656
P(Ontime|...) =  0.9419220881669834


# Classification Summary

In [35]:
# Training Sample
classificationSummary(y_train, y_train_pred, class_names=classes) 

# Validation Sample
classificationSummary(y_valid, y_valid_pred, class_names=classes) 

Confusion Matrix (Accuracy 0.9495)

        Prediction
 Actual Delayed  Ontime
Delayed  187154       0
 Ontime    9958       0
Confusion Matrix (Accuracy 0.9501)

        Prediction
 Actual Delayed  Ontime
Delayed  124853       0
 Ontime    6556       0
