In [1]:
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt # plotting
import numpy as np
import pandas as pd
from IPython.display import display
import pyarrow

In [2]:
def distribution(data, continous_feats ,transformed = False):
    """
    Visualization code for displaying skewed distributions of features
    """
    
    # Create figure
    fig = plt.figure(figsize = (11,5))

    # Skewed feature plotting
    for i, feature in enumerate(continous_feats):
        ax = fig.add_subplot(1, 4, i+1)
        ax.hist(data[feature], bins = 25, color = '#00A0A0')
        ax.set_title("'%s' Feature Distribution"%(feature), fontsize = 14)
        ax.set_xlabel("Value")
        ax.set_ylabel("Number of Records")
        ax.set_ylim((0, 2000))
        ax.set_yticks([0, 500, 1000, 1500, 2000])
        ax.set_yticklabels([0, 500, 1000, 1500, ">2000"])

    # Plot aesthetics
    if transformed:
        fig.suptitle("Log-transformed Distributions of Continuous Census Data Features", \
            fontsize = 16, y = 1.03)
    else:
        fig.suptitle("Skewed Distributions of Continuous Census Data Features", \
            fontsize = 16, y = 1.03)

    fig.tight_layout()
    fig.show()

## reading data and general cleaning

In [3]:
dataset_17 = pd.read_csv("./Dataset/2017.csv")
dataset_18 = pd.read_csv("./Dataset/2018.csv")

dataset = dataset_17.append(dataset_18, ignore_index=True)


In [4]:
print("shape :",dataset.shape[0],dataset.shape[1])

shape : 12888067 28


In [5]:

display(dataset.head(n=10))

Unnamed: 0,FL_DATE,OP_CARRIER,OP_CARRIER_FL_NUM,ORIGIN,DEST,CRS_DEP_TIME,DEP_TIME,DEP_DELAY,TAXI_OUT,WHEELS_OFF,...,CRS_ELAPSED_TIME,ACTUAL_ELAPSED_TIME,AIR_TIME,DISTANCE,CARRIER_DELAY,WEATHER_DELAY,NAS_DELAY,SECURITY_DELAY,LATE_AIRCRAFT_DELAY,Unnamed: 27
0,2017-01-01,AA,1,JFK,LAX,800,831.0,31.0,25.0,856.0,...,402.0,398.0,347.0,2475.0,27.0,0.0,0.0,0.0,0.0,
1,2017-01-01,AA,2,LAX,JFK,900,934.0,34.0,34.0,1008.0,...,327.0,335.0,289.0,2475.0,34.0,0.0,8.0,0.0,0.0,
2,2017-01-01,AA,4,LAX,JFK,1130,1221.0,51.0,20.0,1241.0,...,328.0,319.0,284.0,2475.0,7.0,0.0,0.0,0.0,35.0,
3,2017-01-01,AA,5,DFW,HNL,1135,1252.0,77.0,19.0,1311.0,...,517.0,537.0,513.0,3784.0,77.0,0.0,20.0,0.0,0.0,
4,2017-01-01,AA,6,OGG,DFW,1855,1855.0,0.0,16.0,1911.0,...,425.0,467.0,440.0,3711.0,0.0,0.0,42.0,0.0,0.0,
5,2017-01-01,AA,7,DFW,OGG,940,1619.0,399.0,12.0,1631.0,...,503.0,498.0,480.0,3711.0,394.0,0.0,0.0,0.0,0.0,
6,2017-01-01,AA,8,HNL,DFW,1838,1903.0,25.0,19.0,1922.0,...,432.0,465.0,434.0,3784.0,25.0,0.0,33.0,0.0,0.0,
7,2017-01-01,AA,9,JFK,SFO,700,656.0,-4.0,22.0,718.0,...,405.0,387.0,362.0,2586.0,,,,,,
8,2017-01-01,AA,10,LAX,JFK,2100,2100.0,0.0,15.0,2115.0,...,327.0,297.0,272.0,2475.0,,,,,,
9,2017-01-01,AA,12,SFO,JFK,1135,1130.0,-5.0,27.0,1157.0,...,343.0,324.0,280.0,2586.0,,,,,,


In [6]:
#drop columns that all have NA values
dataset.dropna(axis='columns',how="all",inplace=True)


In [7]:
display(dataset.head(n=10))

Unnamed: 0,FL_DATE,OP_CARRIER,OP_CARRIER_FL_NUM,ORIGIN,DEST,CRS_DEP_TIME,DEP_TIME,DEP_DELAY,TAXI_OUT,WHEELS_OFF,...,DIVERTED,CRS_ELAPSED_TIME,ACTUAL_ELAPSED_TIME,AIR_TIME,DISTANCE,CARRIER_DELAY,WEATHER_DELAY,NAS_DELAY,SECURITY_DELAY,LATE_AIRCRAFT_DELAY
0,2017-01-01,AA,1,JFK,LAX,800,831.0,31.0,25.0,856.0,...,0.0,402.0,398.0,347.0,2475.0,27.0,0.0,0.0,0.0,0.0
1,2017-01-01,AA,2,LAX,JFK,900,934.0,34.0,34.0,1008.0,...,0.0,327.0,335.0,289.0,2475.0,34.0,0.0,8.0,0.0,0.0
2,2017-01-01,AA,4,LAX,JFK,1130,1221.0,51.0,20.0,1241.0,...,0.0,328.0,319.0,284.0,2475.0,7.0,0.0,0.0,0.0,35.0
3,2017-01-01,AA,5,DFW,HNL,1135,1252.0,77.0,19.0,1311.0,...,0.0,517.0,537.0,513.0,3784.0,77.0,0.0,20.0,0.0,0.0
4,2017-01-01,AA,6,OGG,DFW,1855,1855.0,0.0,16.0,1911.0,...,0.0,425.0,467.0,440.0,3711.0,0.0,0.0,42.0,0.0,0.0
5,2017-01-01,AA,7,DFW,OGG,940,1619.0,399.0,12.0,1631.0,...,0.0,503.0,498.0,480.0,3711.0,394.0,0.0,0.0,0.0,0.0
6,2017-01-01,AA,8,HNL,DFW,1838,1903.0,25.0,19.0,1922.0,...,0.0,432.0,465.0,434.0,3784.0,25.0,0.0,33.0,0.0,0.0
7,2017-01-01,AA,9,JFK,SFO,700,656.0,-4.0,22.0,718.0,...,0.0,405.0,387.0,362.0,2586.0,,,,,
8,2017-01-01,AA,10,LAX,JFK,2100,2100.0,0.0,15.0,2115.0,...,0.0,327.0,297.0,272.0,2475.0,,,,,
9,2017-01-01,AA,12,SFO,JFK,1135,1130.0,-5.0,27.0,1157.0,...,0.0,343.0,324.0,280.0,2586.0,,,,,


In [8]:
#print columns of dataset and their datatype
print(dataset.dtypes)

FL_DATE                 object
OP_CARRIER              object
OP_CARRIER_FL_NUM        int64
ORIGIN                  object
DEST                    object
CRS_DEP_TIME             int64
DEP_TIME               float64
DEP_DELAY              float64
TAXI_OUT               float64
WHEELS_OFF             float64
WHEELS_ON              float64
TAXI_IN                float64
CRS_ARR_TIME             int64
ARR_TIME               float64
ARR_DELAY              float64
CANCELLED              float64
CANCELLATION_CODE       object
DIVERTED               float64
CRS_ELAPSED_TIME       float64
ACTUAL_ELAPSED_TIME    float64
AIR_TIME               float64
DISTANCE               float64
CARRIER_DELAY          float64
WEATHER_DELAY          float64
NAS_DELAY              float64
SECURITY_DELAY         float64
LATE_AIRCRAFT_DELAY    float64
dtype: object


In [9]:
#print percantage of NA values for each feature
for col in dataset.columns:
   print(col,(sum(pd.isna(dataset[col])/dataset.shape[0])*100),"%")

FL_DATE 0.0 %
OP_CARRIER 0.0 %
OP_CARRIER_FL_NUM 0.0 %
ORIGIN 0.0 %
DEST 0.0 %
CRS_DEP_TIME 0.0 %
DEP_TIME 1.4945996168338376 %
DEP_DELAY 1.5330227566262846 %
TAXI_OUT 1.5361108845821396 %
WHEELS_OFF 1.5360720890048047 %
WHEELS_ON 1.5822388260337314 %
TAXI_IN 1.5822388260337314 %
CRS_ARR_TIME 0.0 %
ARR_TIME 1.5822310669182642 %
ARR_DELAY 1.8020623263334392 %
CANCELLED 0.0 %
CANCELLATION_CODE 98.45378677874118 %
DIVERTED 0.0 %
CRS_ELAPSED_TIME 0.0001319049629397489 %
ACTUAL_ELAPSED_TIME 1.7819041443497945 %
AIR_TIME 1.7819041443497945 %
DISTANCE 0.0 %
CARRIER_DELAY 81.51636705945558 %
WEATHER_DELAY 81.51636705945558 %
NAS_DELAY 81.51636705945558 %
SECURITY_DELAY 81.51636705945558 %
LATE_AIRCRAFT_DELAY 81.51636705945558 %


In [10]:
dataset.describe()

Unnamed: 0,OP_CARRIER_FL_NUM,CRS_DEP_TIME,DEP_TIME,DEP_DELAY,TAXI_OUT,WHEELS_OFF,WHEELS_ON,TAXI_IN,CRS_ARR_TIME,ARR_TIME,...,DIVERTED,CRS_ELAPSED_TIME,ACTUAL_ELAPSED_TIME,AIR_TIME,DISTANCE,CARRIER_DELAY,WEATHER_DELAY,NAS_DELAY,SECURITY_DELAY,LATE_AIRCRAFT_DELAY
count,12888070.0,12888070.0,12695440.0,12690490.0,12690090.0,12690100.0,12684150.0,12684150.0,12888070.0,12684150.0,...,12888070.0,12888050.0,12658410.0,12658410.0,12888070.0,2382183.0,2382183.0,2382183.0,2382183.0,2382183.0
mean,2403.196,1329.811,1333.788,9.862242,17.13268,1356.915,1463.227,7.55998,1487.647,1467.708,...,0.002357918,143.7146,138.8195,114.1394,824.9544,19.65999,3.240236,15.94723,0.09264653,25.43639
std,1821.767,490.9344,504.4585,44.40754,9.684797,506.0836,533.7175,5.998619,518.6251,538.0139,...,0.04850111,75.0871,74.75932,72.58502,610.5515,60.38707,27.6987,35.89763,3.111946,48.90449
min,1.0,1.0,1.0,-234.0,0.0,1.0,1.0,0.0,1.0,1.0,...,0.0,-99.0,14.0,7.0,31.0,0.0,0.0,0.0,0.0,0.0
25%,860.0,914.0,915.0,-5.0,11.0,931.0,1045.0,4.0,1101.0,1049.0,...,0.0,89.0,84.0,61.0,371.0,0.0,0.0,0.0,0.0,0.0
50%,1899.0,1321.0,1327.0,-2.0,15.0,1340.0,1504.0,6.0,1517.0,1508.0,...,0.0,125.0,120.0,95.0,651.0,0.0,0.0,3.0,0.0,3.0
75%,3772.0,1735.0,1744.0,7.0,20.0,1758.0,1912.0,9.0,1920.0,1917.0,...,0.0,175.0,170.0,144.0,1065.0,17.0,0.0,19.0,0.0,31.0
max,8402.0,2359.0,2400.0,2755.0,196.0,2400.0,2400.0,414.0,2400.0,2400.0,...,1.0,718.0,784.0,712.0,4983.0,2109.0,2692.0,1848.0,987.0,2454.0


In [11]:
delays = ["CARRIER_DELAY", "WEATHER_DELAY", "NAS_DELAY", "SECURITY_DELAY" ,"LATE_AIRCRAFT_DELAY"]

In [12]:
# ARR_Delay is the total delay on arrival in minutes , only 1.8% is missing 
# on the other hand ,  [CARRIER_DELAY WEATHER_DELAY NAS_DELAY SECURITY_DELAY LATE_AIRCRAFT_DELAY] values have 81.5% NA 
# removing all records with NA values will severely shrink dataset 
# before we remove all these records , we can try to fill NA values of delay features
# we compare number of zeros of a row with total number of records without NA
for delay in delays:
    print("type of delay :",delay)
    print("# of zeros :",sum(dataset[delay] == 0.0))
    print("# of zeros percent :",sum(dataset[delay] == 0.0)/((18.5/100)*dataset.shape[0]))
    print("mean : ",dataset[delay].mean())
    print("median : ",dataset[delay].median())
    print("--------")

type of delay : CARRIER_DELAY
# of zeros : 1204800
# of zeros percent : 0.5053071521456579
mean :  19.659985399946184
median :  0.0
--------
type of delay : WEATHER_DELAY
# of zeros : 2247720
# of zeros percent : 0.942719946896446
mean :  3.2402355318630014
median :  0.0
--------
type of delay : NAS_DELAY
# of zeros : 1084689
# of zeros percent : 0.4549311998287861
mean :  15.947230754312326
median :  3.0
--------
type of delay : SECURITY_DELAY
# of zeros : 2374675
# of zeros percent : 0.9959663525244772
mean :  0.09264653471206873
median :  0.0
--------
type of delay : LATE_AIRCRAFT_DELAY
# of zeros : 1139721
# of zeros percent : 0.47801226157918436
mean :  25.4363867091655
median :  3.0
--------


In [13]:
# we can deduce that number of zeros is dominant in these features , so filling NAs with mean,ffill,backfill,mode or interpolation are not sensible (outliers will increase mean) 
# best options are zero filling or median(which will probably be 0)
for delay in delays:
    dataset[delay]=dataset[delay].fillna(dataset[delay].median()) 

In [14]:
display(dataset.head(n=10))

Unnamed: 0,FL_DATE,OP_CARRIER,OP_CARRIER_FL_NUM,ORIGIN,DEST,CRS_DEP_TIME,DEP_TIME,DEP_DELAY,TAXI_OUT,WHEELS_OFF,...,DIVERTED,CRS_ELAPSED_TIME,ACTUAL_ELAPSED_TIME,AIR_TIME,DISTANCE,CARRIER_DELAY,WEATHER_DELAY,NAS_DELAY,SECURITY_DELAY,LATE_AIRCRAFT_DELAY
0,2017-01-01,AA,1,JFK,LAX,800,831.0,31.0,25.0,856.0,...,0.0,402.0,398.0,347.0,2475.0,27.0,0.0,0.0,0.0,0.0
1,2017-01-01,AA,2,LAX,JFK,900,934.0,34.0,34.0,1008.0,...,0.0,327.0,335.0,289.0,2475.0,34.0,0.0,8.0,0.0,0.0
2,2017-01-01,AA,4,LAX,JFK,1130,1221.0,51.0,20.0,1241.0,...,0.0,328.0,319.0,284.0,2475.0,7.0,0.0,0.0,0.0,35.0
3,2017-01-01,AA,5,DFW,HNL,1135,1252.0,77.0,19.0,1311.0,...,0.0,517.0,537.0,513.0,3784.0,77.0,0.0,20.0,0.0,0.0
4,2017-01-01,AA,6,OGG,DFW,1855,1855.0,0.0,16.0,1911.0,...,0.0,425.0,467.0,440.0,3711.0,0.0,0.0,42.0,0.0,0.0
5,2017-01-01,AA,7,DFW,OGG,940,1619.0,399.0,12.0,1631.0,...,0.0,503.0,498.0,480.0,3711.0,394.0,0.0,0.0,0.0,0.0
6,2017-01-01,AA,8,HNL,DFW,1838,1903.0,25.0,19.0,1922.0,...,0.0,432.0,465.0,434.0,3784.0,25.0,0.0,33.0,0.0,0.0
7,2017-01-01,AA,9,JFK,SFO,700,656.0,-4.0,22.0,718.0,...,0.0,405.0,387.0,362.0,2586.0,0.0,0.0,3.0,0.0,3.0
8,2017-01-01,AA,10,LAX,JFK,2100,2100.0,0.0,15.0,2115.0,...,0.0,327.0,297.0,272.0,2475.0,0.0,0.0,3.0,0.0,3.0
9,2017-01-01,AA,12,SFO,JFK,1135,1130.0,-5.0,27.0,1157.0,...,0.0,343.0,324.0,280.0,2586.0,0.0,0.0,3.0,0.0,3.0


In [15]:
# drop cancellation code - 98% missing and irrelevant
dataset.drop('CANCELLATION_CODE', inplace=True, axis=1)


In [16]:
#drop NAs
dataset_visualization = dataset.dropna() 

In [17]:
#removed around 1.8% of data only
dataset_visualization.shape

(12651227, 26)

In [18]:
print("carriers : ",dataset_visualization["OP_CARRIER"].unique())

carriers :  ['AA' 'B6' 'EV' 'HA' 'NK' 'OO' 'UA' 'VX' 'AS' 'WN' 'DL' 'F9' '9E' 'G4'
 'MQ' 'OH' 'YV' 'YX']


In [19]:
#after inspection of op_carriers feature levels , we figured actual carrier names 
dataset_visualization['OP_CARRIER'].replace({
    'UA':'United Airlines',
    'AS':'Alaska Airlines',
    '9E':'Endeavor Air',
    'B6':'JetBlue Airways',
    'EV':'ExpressJet',
    'F9':'Frontier Airlines',
    'G4':'Allegiant Air',
    'HA':'Hawaiian Airlines',
    'MQ':'Envoy Air',
    'NK':'Spirit Airlines',
    'OH':'PSA Airlines',
    'OO':'SkyWest Airlines',
    'VX':'Virgin America',
    'WN':'Southwest Airlines',
    'YV':'Mesa Airline',
    'YX':'Republic Airways',
    'AA':'American Airlines',
    'DL':'Delta Airlines'
}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return self._update_inplace(result)


In [20]:
print("carriers : ",dataset_visualization["OP_CARRIER"].unique())

carriers :  ['American Airlines' 'JetBlue Airways' 'ExpressJet' 'Hawaiian Airlines'
 'Spirit Airlines' 'SkyWest Airlines' 'United Airlines' 'Virgin America'
 'Alaska Airlines' 'Southwest Airlines' 'Delta Airlines'
 'Frontier Airlines' 'Endeavor Air' 'Allegiant Air' 'Envoy Air'
 'PSA Airlines' 'Mesa Airline' 'Republic Airways']


In [21]:
#seperate combined date format into datetime object 
dataset_visualization["FL_DATE"] = pd.to_datetime(dataset_visualization.FL_DATE)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset_visualization["FL_DATE"] = pd.to_datetime(dataset_visualization.FL_DATE)


In [22]:
dataset_visualization.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 12651227 entries, 0 to 12888066
Data columns (total 26 columns):
 #   Column               Dtype         
---  ------               -----         
 0   FL_DATE              datetime64[ns]
 1   OP_CARRIER           object        
 2   OP_CARRIER_FL_NUM    int64         
 3   ORIGIN               object        
 4   DEST                 object        
 5   CRS_DEP_TIME         int64         
 6   DEP_TIME             float64       
 7   DEP_DELAY            float64       
 8   TAXI_OUT             float64       
 9   WHEELS_OFF           float64       
 10  WHEELS_ON            float64       
 11  TAXI_IN              float64       
 12  CRS_ARR_TIME         int64         
 13  ARR_TIME             float64       
 14  ARR_DELAY            float64       
 15  CANCELLED            float64       
 16  DIVERTED             float64       
 17  CRS_ELAPSED_TIME     float64       
 18  ACTUAL_ELAPSED_TIME  float64       
 19  AIR_TIME           

In [23]:
dataset_visualization['Month'] = pd.DatetimeIndex(dataset_visualization['FL_DATE']).month
dataset_visualization['Day'] = pd.DatetimeIndex(dataset_visualization['FL_DATE']).day
dataset_visualization['Year'] = pd.DatetimeIndex(dataset_visualization['FL_DATE']).year

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset_visualization['Month'] = pd.DatetimeIndex(dataset_visualization['FL_DATE']).month
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset_visualization['Day'] = pd.DatetimeIndex(dataset_visualization['FL_DATE']).day
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset_visualization['Year']

In [24]:
display(dataset_visualization.tail(n=1))

Unnamed: 0,FL_DATE,OP_CARRIER,OP_CARRIER_FL_NUM,ORIGIN,DEST,CRS_DEP_TIME,DEP_TIME,DEP_DELAY,TAXI_OUT,WHEELS_OFF,...,AIR_TIME,DISTANCE,CARRIER_DELAY,WEATHER_DELAY,NAS_DELAY,SECURITY_DELAY,LATE_AIRCRAFT_DELAY,Month,Day,Year
12888066,2018-12-31,American Airlines,1818,RDU,CLT,1435,1443.0,8.0,8.0,1451.0,...,44.0,130.0,0.0,0.0,3.0,0.0,3.0,12,31,2018


In [25]:
#write the dataset for visualization using parquet dataformat , fast and low memory usage
dataset_visualization.to_parquet("./Dataset/dataset_visualization.parquet",index=False)

## After preparing a dataset for visualization , prepare the data for ML 

In [26]:
#drop flight number - not needed
dataset_visualization.drop('OP_CARRIER_FL_NUM', inplace=True, axis=1)

#drop flight date - irrelevant
dataset_visualization.drop('FL_DATE', inplace=True, axis=1)

# drop diverted - feature irrelevant to problem 
dataset_visualization.drop('DIVERTED', inplace=True, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [27]:
dataset_visualization.shape

(12651227, 26)

In [28]:
#remove all rows having NA values
dataset_ML = dataset_visualization.dropna()

In [29]:
dataset_ML.shape

(12651227, 26)

In [30]:
display(dataset_ML.head(n=10))

Unnamed: 0,OP_CARRIER,ORIGIN,DEST,CRS_DEP_TIME,DEP_TIME,DEP_DELAY,TAXI_OUT,WHEELS_OFF,WHEELS_ON,TAXI_IN,...,AIR_TIME,DISTANCE,CARRIER_DELAY,WEATHER_DELAY,NAS_DELAY,SECURITY_DELAY,LATE_AIRCRAFT_DELAY,Month,Day,Year
0,American Airlines,JFK,LAX,800,831.0,31.0,25.0,856.0,1143.0,26.0,...,347.0,2475.0,27.0,0.0,0.0,0.0,0.0,1,1,2017
1,American Airlines,LAX,JFK,900,934.0,34.0,34.0,1008.0,1757.0,12.0,...,289.0,2475.0,34.0,0.0,8.0,0.0,0.0,1,1,2017
2,American Airlines,LAX,JFK,1130,1221.0,51.0,20.0,1241.0,2025.0,15.0,...,284.0,2475.0,7.0,0.0,0.0,0.0,35.0,1,1,2017
3,American Airlines,DFW,HNL,1135,1252.0,77.0,19.0,1311.0,1744.0,5.0,...,513.0,3784.0,77.0,0.0,20.0,0.0,0.0,1,1,2017
4,American Airlines,OGG,DFW,1855,1855.0,0.0,16.0,1911.0,631.0,11.0,...,440.0,3711.0,0.0,0.0,42.0,0.0,0.0,1,1,2017
5,American Airlines,DFW,OGG,940,1619.0,399.0,12.0,1631.0,2031.0,6.0,...,480.0,3711.0,394.0,0.0,0.0,0.0,0.0,1,1,2017
6,American Airlines,HNL,DFW,1838,1903.0,25.0,19.0,1922.0,636.0,12.0,...,434.0,3784.0,25.0,0.0,33.0,0.0,0.0,1,1,2017
7,American Airlines,JFK,SFO,700,656.0,-4.0,22.0,718.0,1020.0,3.0,...,362.0,2586.0,0.0,0.0,3.0,0.0,3.0,1,1,2017
8,American Airlines,LAX,JFK,2100,2100.0,0.0,15.0,2115.0,447.0,10.0,...,272.0,2475.0,0.0,0.0,3.0,0.0,3.0,1,1,2017
9,American Airlines,SFO,JFK,1135,1130.0,-5.0,27.0,1157.0,1937.0,17.0,...,280.0,2586.0,0.0,0.0,3.0,0.0,3.0,1,1,2017


In [31]:
#print insights on number of origin and dest 
categorical = ["ORIGIN","DEST"] 
print("Number of departure locations : ",len(dataset_ML[categorical[0]].unique()))
print("Number of destination locations : ",len(dataset_ML[categorical[1]].unique()))

Number of departure locations :  359
Number of destination locations :  360


### Looking at the ORIGIN and DEST features , they have huge number of unique values , this means that :
- using hot encoding for converting to numerical features will yield huge number of features which will lead to the curse of dimensionality 
- using label encoding will yield to only one feature , but the feature values will have great std deviation and labels will have different priority
- so it would be better if we drop both

### for the op_carrier :
- different carriers may differ in plane services but irrelevant to our problem so it will be removed

### FL_Date is only used for visualization so it will be removed , same for month day and year features
- we could have made use of month feature because of its relation to a season in a year , but the data is only based on domestic flights of the US so it will biased to seasonality in the US only , model needs to generalize regardless

In [32]:
dataset_ML=dataset_ML.drop('OP_CARRIER', axis=1)

In [33]:
dataset_ML=dataset_ML.drop('ORIGIN',axis=1)

In [34]:
dataset_ML=dataset_ML.drop('DEST',axis=1)

In [35]:
dataset_ML=dataset_ML.drop('Month',axis=1)

In [36]:
dataset_ML=dataset_ML.drop('Day',axis=1)

In [37]:
dataset_ML=dataset_ML.drop('Year',axis=1)

In [38]:
display(dataset_ML.head(n=5))

Unnamed: 0,CRS_DEP_TIME,DEP_TIME,DEP_DELAY,TAXI_OUT,WHEELS_OFF,WHEELS_ON,TAXI_IN,CRS_ARR_TIME,ARR_TIME,ARR_DELAY,CANCELLED,CRS_ELAPSED_TIME,ACTUAL_ELAPSED_TIME,AIR_TIME,DISTANCE,CARRIER_DELAY,WEATHER_DELAY,NAS_DELAY,SECURITY_DELAY,LATE_AIRCRAFT_DELAY
0,800,831.0,31.0,25.0,856.0,1143.0,26.0,1142,1209.0,27.0,0.0,402.0,398.0,347.0,2475.0,27.0,0.0,0.0,0.0,0.0
1,900,934.0,34.0,34.0,1008.0,1757.0,12.0,1727,1809.0,42.0,0.0,327.0,335.0,289.0,2475.0,34.0,0.0,8.0,0.0,0.0
2,1130,1221.0,51.0,20.0,1241.0,2025.0,15.0,1958,2040.0,42.0,0.0,328.0,319.0,284.0,2475.0,7.0,0.0,0.0,0.0,35.0
3,1135,1252.0,77.0,19.0,1311.0,1744.0,5.0,1612,1749.0,97.0,0.0,517.0,537.0,513.0,3784.0,77.0,0.0,20.0,0.0,0.0
4,1855,1855.0,0.0,16.0,1911.0,631.0,11.0,600,642.0,42.0,0.0,425.0,467.0,440.0,3711.0,0.0,0.0,42.0,0.0,0.0


In [39]:
#write the dataset for visualization using parquet dataformat , fast and low memory usage
dataset_ML.to_parquet("./Dataset/dataset_ML.parquet",index=False)