In [78]:
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt # plotting
import numpy as np
import pandas as pd
from IPython.display import display

In [79]:
def distribution(data, continous_feats ,transformed = False):
    """
    Visualization code for displaying skewed distributions of features
    """
    
    # Create figure
    fig = plt.figure(figsize = (11,5))

    # Skewed feature plotting
    for i, feature in enumerate(continous_feats):
        ax = fig.add_subplot(1, 4, i+1)
        ax.hist(data[feature], bins = 25, color = '#00A0A0')
        ax.set_title("'%s' Feature Distribution"%(feature), fontsize = 14)
        ax.set_xlabel("Value")
        ax.set_ylabel("Number of Records")
        ax.set_ylim((0, 2000))
        ax.set_yticks([0, 500, 1000, 1500, 2000])
        ax.set_yticklabels([0, 500, 1000, 1500, ">2000"])

    # Plot aesthetics
    if transformed:
        fig.suptitle("Log-transformed Distributions of Continuous Census Data Features", \
            fontsize = 16, y = 1.03)
    else:
        fig.suptitle("Skewed Distributions of Continuous Census Data Features", \
            fontsize = 16, y = 1.03)

    fig.tight_layout()
    fig.show()

In [80]:
dataset_17 = pd.read_csv("./Dataset/2017.csv")
dataset_18 = pd.read_csv("./Dataset/2018.csv")

dataset = dataset_17.append(dataset_18, ignore_index=True)


In [81]:
print("shape :",dataset.shape[0],dataset.shape[1])

shape : 12888067 28


In [82]:

display(dataset.head(n=10))

Unnamed: 0,FL_DATE,OP_CARRIER,OP_CARRIER_FL_NUM,ORIGIN,DEST,CRS_DEP_TIME,DEP_TIME,DEP_DELAY,TAXI_OUT,WHEELS_OFF,...,CRS_ELAPSED_TIME,ACTUAL_ELAPSED_TIME,AIR_TIME,DISTANCE,CARRIER_DELAY,WEATHER_DELAY,NAS_DELAY,SECURITY_DELAY,LATE_AIRCRAFT_DELAY,Unnamed: 27
0,2017-01-01,AA,1,JFK,LAX,800,831.0,31.0,25.0,856.0,...,402.0,398.0,347.0,2475.0,27.0,0.0,0.0,0.0,0.0,
1,2017-01-01,AA,2,LAX,JFK,900,934.0,34.0,34.0,1008.0,...,327.0,335.0,289.0,2475.0,34.0,0.0,8.0,0.0,0.0,
2,2017-01-01,AA,4,LAX,JFK,1130,1221.0,51.0,20.0,1241.0,...,328.0,319.0,284.0,2475.0,7.0,0.0,0.0,0.0,35.0,
3,2017-01-01,AA,5,DFW,HNL,1135,1252.0,77.0,19.0,1311.0,...,517.0,537.0,513.0,3784.0,77.0,0.0,20.0,0.0,0.0,
4,2017-01-01,AA,6,OGG,DFW,1855,1855.0,0.0,16.0,1911.0,...,425.0,467.0,440.0,3711.0,0.0,0.0,42.0,0.0,0.0,
5,2017-01-01,AA,7,DFW,OGG,940,1619.0,399.0,12.0,1631.0,...,503.0,498.0,480.0,3711.0,394.0,0.0,0.0,0.0,0.0,
6,2017-01-01,AA,8,HNL,DFW,1838,1903.0,25.0,19.0,1922.0,...,432.0,465.0,434.0,3784.0,25.0,0.0,33.0,0.0,0.0,
7,2017-01-01,AA,9,JFK,SFO,700,656.0,-4.0,22.0,718.0,...,405.0,387.0,362.0,2586.0,,,,,,
8,2017-01-01,AA,10,LAX,JFK,2100,2100.0,0.0,15.0,2115.0,...,327.0,297.0,272.0,2475.0,,,,,,
9,2017-01-01,AA,12,SFO,JFK,1135,1130.0,-5.0,27.0,1157.0,...,343.0,324.0,280.0,2586.0,,,,,,


In [83]:
#drop columns that all have NA values
dataset.dropna(axis='columns',how="all",inplace=True)


In [84]:
display(dataset.head(n=10))

Unnamed: 0,FL_DATE,OP_CARRIER,OP_CARRIER_FL_NUM,ORIGIN,DEST,CRS_DEP_TIME,DEP_TIME,DEP_DELAY,TAXI_OUT,WHEELS_OFF,...,DIVERTED,CRS_ELAPSED_TIME,ACTUAL_ELAPSED_TIME,AIR_TIME,DISTANCE,CARRIER_DELAY,WEATHER_DELAY,NAS_DELAY,SECURITY_DELAY,LATE_AIRCRAFT_DELAY
0,2017-01-01,AA,1,JFK,LAX,800,831.0,31.0,25.0,856.0,...,0.0,402.0,398.0,347.0,2475.0,27.0,0.0,0.0,0.0,0.0
1,2017-01-01,AA,2,LAX,JFK,900,934.0,34.0,34.0,1008.0,...,0.0,327.0,335.0,289.0,2475.0,34.0,0.0,8.0,0.0,0.0
2,2017-01-01,AA,4,LAX,JFK,1130,1221.0,51.0,20.0,1241.0,...,0.0,328.0,319.0,284.0,2475.0,7.0,0.0,0.0,0.0,35.0
3,2017-01-01,AA,5,DFW,HNL,1135,1252.0,77.0,19.0,1311.0,...,0.0,517.0,537.0,513.0,3784.0,77.0,0.0,20.0,0.0,0.0
4,2017-01-01,AA,6,OGG,DFW,1855,1855.0,0.0,16.0,1911.0,...,0.0,425.0,467.0,440.0,3711.0,0.0,0.0,42.0,0.0,0.0
5,2017-01-01,AA,7,DFW,OGG,940,1619.0,399.0,12.0,1631.0,...,0.0,503.0,498.0,480.0,3711.0,394.0,0.0,0.0,0.0,0.0
6,2017-01-01,AA,8,HNL,DFW,1838,1903.0,25.0,19.0,1922.0,...,0.0,432.0,465.0,434.0,3784.0,25.0,0.0,33.0,0.0,0.0
7,2017-01-01,AA,9,JFK,SFO,700,656.0,-4.0,22.0,718.0,...,0.0,405.0,387.0,362.0,2586.0,,,,,
8,2017-01-01,AA,10,LAX,JFK,2100,2100.0,0.0,15.0,2115.0,...,0.0,327.0,297.0,272.0,2475.0,,,,,
9,2017-01-01,AA,12,SFO,JFK,1135,1130.0,-5.0,27.0,1157.0,...,0.0,343.0,324.0,280.0,2586.0,,,,,


In [85]:
#print columns of dataset and their datatype
print(dataset.dtypes)

FL_DATE                 object
OP_CARRIER              object
OP_CARRIER_FL_NUM        int64
ORIGIN                  object
DEST                    object
CRS_DEP_TIME             int64
DEP_TIME               float64
DEP_DELAY              float64
TAXI_OUT               float64
WHEELS_OFF             float64
WHEELS_ON              float64
TAXI_IN                float64
CRS_ARR_TIME             int64
ARR_TIME               float64
ARR_DELAY              float64
CANCELLED              float64
CANCELLATION_CODE       object
DIVERTED               float64
CRS_ELAPSED_TIME       float64
ACTUAL_ELAPSED_TIME    float64
AIR_TIME               float64
DISTANCE               float64
CARRIER_DELAY          float64
WEATHER_DELAY          float64
NAS_DELAY              float64
SECURITY_DELAY         float64
LATE_AIRCRAFT_DELAY    float64
dtype: object


In [86]:
#print percantage of NA values for each feature
for col in dataset.columns:
   print(col,(sum(pd.isna(dataset[col])/dataset.shape[0])*100),"%")

FL_DATE 0.0 %
OP_CARRIER 0.0 %
OP_CARRIER_FL_NUM 0.0 %
ORIGIN 0.0 %
DEST 0.0 %
CRS_DEP_TIME 0.0 %
DEP_TIME 1.4945996168338376 %
DEP_DELAY 1.5330227566262846 %
TAXI_OUT 1.5361108845821396 %
WHEELS_OFF 1.5360720890048047 %
WHEELS_ON 1.5822388260337314 %
TAXI_IN 1.5822388260337314 %
CRS_ARR_TIME 0.0 %
ARR_TIME 1.5822310669182642 %
ARR_DELAY 1.8020623263334392 %
CANCELLED 0.0 %
CANCELLATION_CODE 98.45378677874118 %
DIVERTED 0.0 %
CRS_ELAPSED_TIME 0.0001319049629397489 %
ACTUAL_ELAPSED_TIME 1.7819041443497945 %
AIR_TIME 1.7819041443497945 %
DISTANCE 0.0 %
CARRIER_DELAY 81.51636705945558 %
WEATHER_DELAY 81.51636705945558 %
NAS_DELAY 81.51636705945558 %
SECURITY_DELAY 81.51636705945558 %
LATE_AIRCRAFT_DELAY 81.51636705945558 %


In [87]:
dataset.describe()

Unnamed: 0,OP_CARRIER_FL_NUM,CRS_DEP_TIME,DEP_TIME,DEP_DELAY,TAXI_OUT,WHEELS_OFF,WHEELS_ON,TAXI_IN,CRS_ARR_TIME,ARR_TIME,...,DIVERTED,CRS_ELAPSED_TIME,ACTUAL_ELAPSED_TIME,AIR_TIME,DISTANCE,CARRIER_DELAY,WEATHER_DELAY,NAS_DELAY,SECURITY_DELAY,LATE_AIRCRAFT_DELAY
count,12888070.0,12888070.0,12695440.0,12690490.0,12690090.0,12690100.0,12684150.0,12684150.0,12888070.0,12684150.0,...,12888070.0,12888050.0,12658410.0,12658410.0,12888070.0,2382183.0,2382183.0,2382183.0,2382183.0,2382183.0
mean,2403.196,1329.811,1333.788,9.862242,17.13268,1356.915,1463.227,7.55998,1487.647,1467.708,...,0.002357918,143.7146,138.8195,114.1394,824.9544,19.65999,3.240236,15.94723,0.09264653,25.43639
std,1821.767,490.9344,504.4585,44.40754,9.684797,506.0836,533.7175,5.998619,518.6251,538.0139,...,0.04850111,75.0871,74.75932,72.58502,610.5515,60.38707,27.6987,35.89763,3.111946,48.90449
min,1.0,1.0,1.0,-234.0,0.0,1.0,1.0,0.0,1.0,1.0,...,0.0,-99.0,14.0,7.0,31.0,0.0,0.0,0.0,0.0,0.0
25%,860.0,914.0,915.0,-5.0,11.0,931.0,1045.0,4.0,1101.0,1049.0,...,0.0,89.0,84.0,61.0,371.0,0.0,0.0,0.0,0.0,0.0
50%,1899.0,1321.0,1327.0,-2.0,15.0,1340.0,1504.0,6.0,1517.0,1508.0,...,0.0,125.0,120.0,95.0,651.0,0.0,0.0,3.0,0.0,3.0
75%,3772.0,1735.0,1744.0,7.0,20.0,1758.0,1912.0,9.0,1920.0,1917.0,...,0.0,175.0,170.0,144.0,1065.0,17.0,0.0,19.0,0.0,31.0
max,8402.0,2359.0,2400.0,2755.0,196.0,2400.0,2400.0,414.0,2400.0,2400.0,...,1.0,718.0,784.0,712.0,4983.0,2109.0,2692.0,1848.0,987.0,2454.0


In [88]:
delays = ["CARRIER_DELAY", "WEATHER_DELAY", "NAS_DELAY", "SECURITY_DELAY" ,"LATE_AIRCRAFT_DELAY"]

In [89]:
#ARR_Delay is the total delay on arrival in minutes , only 1.8% is missing 
# on the other hand ,  [CARRIER_DELAY WEATHER_DELAY NAS_DELAY SECURITY_DELAY LATE_AIRCRAFT_DELAY] values have 81.5% NA 
# removing all records with NA values will severely shrink dataset 
# before we remove all these records , we can try to fill NA values of delay features
# we compare number of zeros of a row with total number of records without NA
for delay in delays:
    print("type of delay :",delay)
    print("# of zeros :",sum(dataset[delay] == 0.0))
    print("# of zeros percent :",sum(dataset[delay] == 0.0)/((18.5/100)*dataset.shape[0]))
    print("mean : ",dataset[delay].mean())
    print("median : ",dataset[delay].median())
    print("--------")

type of delay : CARRIER_DELAY
# of zeros : 1204800
# of zeros percent : 0.5053071521456579
mean :  19.659985399946184
--------
type of delay : WEATHER_DELAY
# of zeros : 2247720
# of zeros percent : 0.942719946896446
mean :  3.2402355318630014
--------
type of delay : NAS_DELAY
# of zeros : 1084689
# of zeros percent : 0.4549311998287861
mean :  15.947230754312326
--------
type of delay : SECURITY_DELAY
# of zeros : 2374675
# of zeros percent : 0.9959663525244772
mean :  0.09264653471206873
--------
type of delay : LATE_AIRCRAFT_DELAY
# of zeros : 1139721
# of zeros percent : 0.47801226157918436
mean :  25.4363867091655
--------


In [90]:
# we can deduce that number of zeros is dominant in these features , so filling NAs with mean,ffill,backfill,mode or interpolation are not sensible (outliers will increase mean) 
# best options are zero filling or median(which will probably be 0)
for delay in delays:
    dataset[delay]=dataset[delay].fillna(dataset[delay].median()) 

In [37]:
# drop cancellation code - 98% missing and irrelevant
dataset.drop('CANCELLATION_CODE', inplace=True, axis=1)

In [38]:
#drop flight number - not needed
dataset.drop('OP_CARRIER_FL_NUM', inplace=True, axis=1)

#drop flight date - irrelevant
dataset.drop('FL_DATE', inplace=True, axis=1)

# drop diverted - feature irrelevant to problem 
dataset.drop('DIVERTED', inplace=True, axis=1)

In [47]:
dataset.shape

19.659985399946184

In [40]:
#remove all rows having NA values
dataset1 = dataset.dropna()

In [41]:
#rows dropped from 12 millions to 2.3 millions
dataset1.shape

(2381848, 23)

In [22]:
display(dataset1.head(n=10))

Unnamed: 0,OP_CARRIER,ORIGIN,DEST,CRS_DEP_TIME,DEP_TIME,DEP_DELAY,TAXI_OUT,WHEELS_OFF,WHEELS_ON,TAXI_IN,...,CANCELLED,CRS_ELAPSED_TIME,ACTUAL_ELAPSED_TIME,AIR_TIME,DISTANCE,CARRIER_DELAY,WEATHER_DELAY,NAS_DELAY,SECURITY_DELAY,LATE_AIRCRAFT_DELAY
0,AA,JFK,LAX,800,831.0,31.0,25.0,856.0,1143.0,26.0,...,0.0,402.0,398.0,347.0,2475.0,27.0,0.0,0.0,0.0,0.0
1,AA,LAX,JFK,900,934.0,34.0,34.0,1008.0,1757.0,12.0,...,0.0,327.0,335.0,289.0,2475.0,34.0,0.0,8.0,0.0,0.0
2,AA,LAX,JFK,1130,1221.0,51.0,20.0,1241.0,2025.0,15.0,...,0.0,328.0,319.0,284.0,2475.0,7.0,0.0,0.0,0.0,35.0
3,AA,DFW,HNL,1135,1252.0,77.0,19.0,1311.0,1744.0,5.0,...,0.0,517.0,537.0,513.0,3784.0,77.0,0.0,20.0,0.0,0.0
4,AA,OGG,DFW,1855,1855.0,0.0,16.0,1911.0,631.0,11.0,...,0.0,425.0,467.0,440.0,3711.0,0.0,0.0,42.0,0.0,0.0
5,AA,DFW,OGG,940,1619.0,399.0,12.0,1631.0,2031.0,6.0,...,0.0,503.0,498.0,480.0,3711.0,394.0,0.0,0.0,0.0,0.0
6,AA,HNL,DFW,1838,1903.0,25.0,19.0,1922.0,636.0,12.0,...,0.0,432.0,465.0,434.0,3784.0,25.0,0.0,33.0,0.0,0.0
15,AA,JFK,LAX,1900,1922.0,22.0,20.0,1942.0,2241.0,37.0,...,0.0,407.0,416.0,359.0,2475.0,22.0,0.0,9.0,0.0,0.0
19,AA,LAX,JFK,2200,2202.0,2.0,22.0,2224.0,557.0,55.0,...,0.0,327.0,350.0,273.0,2475.0,2.0,0.0,23.0,0.0,0.0
24,AA,JFK,LAX,700,656.0,-4.0,26.0,722.0,1025.0,57.0,...,0.0,400.0,446.0,363.0,2475.0,0.0,0.0,42.0,0.0,0.0


In [27]:
#print insights on categorical data 
categorical = ["OP_CARRIER","ORIGIN","DEST"] 
print("Number of carriers : ",len(dataset1[categorical[0]].unique()))
print("Number of departure/destination locations : ",len(dataset1[categorical[1]].unique()))

Number of carriers :  18
Number of departure/destination locations :  359
