In [1]:
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt # plotting
import numpy as np
import pandas as pd
from IPython.display import display
import pyarrow
from random import randint

In [2]:
def distribution(data, continous_feats ,transformed = False):
    """
    Visualization code for displaying skewed distributions of features
    """
    
    # Create figure
    fig = plt.figure(figsize = (11,5))

    # Skewed feature plotting
    for i, feature in enumerate(continous_feats):
        ax = fig.add_subplot(1, 4, i+1)
        ax.hist(data[feature], bins = 25, color = '#00A0A0')
        ax.set_title("'%s' Feature Distribution"%(feature), fontsize = 14)
        ax.set_xlabel("Value")
        ax.set_ylabel("Number of Records")
        ax.set_ylim((0, 2000))
        ax.set_yticks([0, 500, 1000, 1500, 2000])
        ax.set_yticklabels([0, 500, 1000, 1500, ">2000"])

    # Plot aesthetics
    if transformed:
        fig.suptitle("Log-transformed Distributions of Continuous Census Data Features", \
            fontsize = 16, y = 1.03)
    else:
        fig.suptitle("Skewed Distributions of Continuous Census Data Features", \
            fontsize = 16, y = 1.03)

    fig.tight_layout()
    fig.show()

## reading data and general cleaning

In [3]:
dataset_17 = pd.read_csv("./Dataset/2017.csv")
dataset_18 = pd.read_csv("./Dataset/2018.csv")

length_needed = 100000
value = randint(0, dataset_17.shape[0]-length_needed)
print(value)
print(dataset_17.shape)
dataset_17 = dataset_17.iloc[value:value+length_needed,:]
print(dataset_17.shape)

####
value = randint(0, dataset_18.shape[0]-length_needed)
print(value)
print(dataset_18.shape)
dataset_18 = dataset_18.iloc[value:value+length_needed,:]
print(dataset_18.shape)




2941057
(5674621, 28)
(100000, 28)
4420199
(7213446, 28)
(100000, 28)


In [4]:
dataset = dataset_17.append(dataset_18, ignore_index=True)

  dataset = dataset_17.append(dataset_18, ignore_index=True)


In [5]:
print("shape :",dataset.shape[0],dataset.shape[1])

shape : 200000 28


In [6]:

display(dataset.head(n=10))

Unnamed: 0,FL_DATE,OP_CARRIER,OP_CARRIER_FL_NUM,ORIGIN,DEST,CRS_DEP_TIME,DEP_TIME,DEP_DELAY,TAXI_OUT,WHEELS_OFF,...,CRS_ELAPSED_TIME,ACTUAL_ELAPSED_TIME,AIR_TIME,DISTANCE,CARRIER_DELAY,WEATHER_DELAY,NAS_DELAY,SECURITY_DELAY,LATE_AIRCRAFT_DELAY,Unnamed: 27
0,2017-07-10,AA,1289,MCO,DFW,730,724.0,-6.0,14.0,738.0,...,166.0,174.0,137.0,985.0,,,,,,
1,2017-07-10,AA,1291,MKE,DFW,700,655.0,-5.0,11.0,706.0,...,152.0,154.0,124.0,853.0,,,,,,
2,2017-07-10,AA,1292,DFW,TUL,1245,1240.0,-5.0,12.0,1252.0,...,64.0,56.0,42.0,237.0,,,,,,
3,2017-07-10,AA,1292,TUL,DFW,1430,1429.0,-1.0,20.0,1449.0,...,73.0,68.0,38.0,237.0,,,,,,
4,2017-07-10,AA,1293,MIA,STX,1205,1208.0,3.0,23.0,1231.0,...,165.0,168.0,142.0,1139.0,,,,,,
5,2017-07-10,AA,1293,STX,MIA,1539,1530.0,-9.0,21.0,1551.0,...,184.0,176.0,146.0,1139.0,,,,,,
6,2017-07-10,AA,1294,DFW,JFK,1215,1224.0,9.0,18.0,1242.0,...,227.0,209.0,184.0,1391.0,,,,,,
7,2017-07-10,AA,1295,ORD,SJC,1715,1713.0,-2.0,25.0,1738.0,...,272.0,280.0,240.0,1829.0,,,,,,
8,2017-07-10,AA,1296,ORD,DFW,2140,2301.0,81.0,25.0,2326.0,...,135.0,141.0,108.0,802.0,10.0,0.0,6.0,0.0,71.0,
9,2017-07-10,AA,1296,SAN,ORD,1420,,,,,...,250.0,,,1723.0,,,,,,


In [7]:
#drop columns that all have NA values
dataset.dropna(axis='columns',how="all",inplace=True)


In [8]:
display(dataset.head(n=10))

Unnamed: 0,FL_DATE,OP_CARRIER,OP_CARRIER_FL_NUM,ORIGIN,DEST,CRS_DEP_TIME,DEP_TIME,DEP_DELAY,TAXI_OUT,WHEELS_OFF,...,DIVERTED,CRS_ELAPSED_TIME,ACTUAL_ELAPSED_TIME,AIR_TIME,DISTANCE,CARRIER_DELAY,WEATHER_DELAY,NAS_DELAY,SECURITY_DELAY,LATE_AIRCRAFT_DELAY
0,2017-07-10,AA,1289,MCO,DFW,730,724.0,-6.0,14.0,738.0,...,0.0,166.0,174.0,137.0,985.0,,,,,
1,2017-07-10,AA,1291,MKE,DFW,700,655.0,-5.0,11.0,706.0,...,0.0,152.0,154.0,124.0,853.0,,,,,
2,2017-07-10,AA,1292,DFW,TUL,1245,1240.0,-5.0,12.0,1252.0,...,0.0,64.0,56.0,42.0,237.0,,,,,
3,2017-07-10,AA,1292,TUL,DFW,1430,1429.0,-1.0,20.0,1449.0,...,0.0,73.0,68.0,38.0,237.0,,,,,
4,2017-07-10,AA,1293,MIA,STX,1205,1208.0,3.0,23.0,1231.0,...,0.0,165.0,168.0,142.0,1139.0,,,,,
5,2017-07-10,AA,1293,STX,MIA,1539,1530.0,-9.0,21.0,1551.0,...,0.0,184.0,176.0,146.0,1139.0,,,,,
6,2017-07-10,AA,1294,DFW,JFK,1215,1224.0,9.0,18.0,1242.0,...,0.0,227.0,209.0,184.0,1391.0,,,,,
7,2017-07-10,AA,1295,ORD,SJC,1715,1713.0,-2.0,25.0,1738.0,...,0.0,272.0,280.0,240.0,1829.0,,,,,
8,2017-07-10,AA,1296,ORD,DFW,2140,2301.0,81.0,25.0,2326.0,...,0.0,135.0,141.0,108.0,802.0,10.0,0.0,6.0,0.0,71.0
9,2017-07-10,AA,1296,SAN,ORD,1420,,,,,...,0.0,250.0,,,1723.0,,,,,


In [9]:
#print columns of dataset and their datatype
print(dataset.dtypes)

FL_DATE                 object
OP_CARRIER              object
OP_CARRIER_FL_NUM        int64
ORIGIN                  object
DEST                    object
CRS_DEP_TIME             int64
DEP_TIME               float64
DEP_DELAY              float64
TAXI_OUT               float64
WHEELS_OFF             float64
WHEELS_ON              float64
TAXI_IN                float64
CRS_ARR_TIME             int64
ARR_TIME               float64
ARR_DELAY              float64
CANCELLED              float64
CANCELLATION_CODE       object
DIVERTED               float64
CRS_ELAPSED_TIME       float64
ACTUAL_ELAPSED_TIME    float64
AIR_TIME               float64
DISTANCE               float64
CARRIER_DELAY          float64
WEATHER_DELAY          float64
NAS_DELAY              float64
SECURITY_DELAY         float64
LATE_AIRCRAFT_DELAY    float64
dtype: object


In [10]:
#print percantage of NA values for each feature
for col in dataset.columns:
   print(col,(sum(pd.isna(dataset[col])/dataset.shape[0])*100),"%")

FL_DATE 0.0 %
OP_CARRIER 0.0 %
OP_CARRIER_FL_NUM 0.0 %
ORIGIN 0.0 %
DEST 0.0 %
CRS_DEP_TIME 0.0 %
DEP_TIME 2.379500000000195 %
DEP_DELAY 2.42800000000021 %
TAXI_OUT 2.474000000000224 %
WHEELS_OFF 2.473500000000224 %
WHEELS_ON 2.5150000000002364 %
TAXI_IN 2.5150000000002364 %
CRS_ARR_TIME 0.0 %
ARR_TIME 2.5150000000002364 %
ARR_DELAY 2.8490000000003386 %
CANCELLED 0.0 %
CANCELLATION_CODE 97.5190000002155 %
DIVERTED 0.0 %
CRS_ELAPSED_TIME 0.0 %
ACTUAL_ELAPSED_TIME 2.830000000000333 %
AIR_TIME 2.830000000000333 %
DISTANCE 0.0 %
CARRIER_DELAY 73.90850000006081 %
WEATHER_DELAY 73.90850000006081 %
NAS_DELAY 73.90850000006081 %
SECURITY_DELAY 73.90850000006081 %
LATE_AIRCRAFT_DELAY 73.90850000006081 %


In [11]:
dataset.describe()

Unnamed: 0,OP_CARRIER_FL_NUM,CRS_DEP_TIME,DEP_TIME,DEP_DELAY,TAXI_OUT,WHEELS_OFF,WHEELS_ON,TAXI_IN,CRS_ARR_TIME,ARR_TIME,...,DIVERTED,CRS_ELAPSED_TIME,ACTUAL_ELAPSED_TIME,AIR_TIME,DISTANCE,CARRIER_DELAY,WEATHER_DELAY,NAS_DELAY,SECURITY_DELAY,LATE_AIRCRAFT_DELAY
count,200000.0,200000.0,195241.0,195144.0,195052.0,195053.0,194970.0,194970.0,200000.0,194970.0,...,200000.0,200000.0,194340.0,194340.0,200000.0,52183.0,52183.0,52183.0,52183.0,52183.0
mean,2312.74056,1332.71919,1339.812749,19.064281,17.816295,1361.632192,1438.93041,8.023414,1481.438425,1441.921255,...,0.003495,144.25196,140.660754,114.838711,839.019995,19.809804,3.649407,21.711074,0.056053,31.872296
std,1773.135527,497.800634,521.595534,57.959852,11.056775,525.202286,562.030806,6.85807,528.155201,567.23316,...,0.059015,75.799108,76.185611,73.117224,625.069927,60.090252,25.608988,49.835107,1.584287,56.411885
min,1.0,1.0,1.0,-48.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.0,18.0,15.0,8.0,31.0,0.0,0.0,0.0,0.0,0.0
25%,815.0,910.0,911.0,-4.0,11.0,927.0,1025.0,4.0,1052.0,1029.0,...,0.0,89.0,84.0,61.0,373.0,0.0,0.0,0.0,0.0,0.0
50%,1876.0,1325.0,1328.0,-1.0,15.0,1341.0,1449.0,6.0,1514.0,1452.0,...,0.0,125.0,121.0,95.0,664.0,0.0,0.0,3.0,0.0,8.0
75%,3540.0,1741.0,1756.0,16.0,21.0,1812.0,1913.0,9.0,1921.0,1917.0,...,0.0,175.0,173.0,145.0,1072.0,17.0,0.0,22.0,0.0,41.0
max,7439.0,2359.0,2400.0,1400.0,176.0,2400.0,2400.0,165.0,2359.0,2400.0,...,1.0,656.0,664.0,633.0,4983.0,1393.0,1074.0,1276.0,137.0,1146.0


In [12]:
delays = ["CARRIER_DELAY", "WEATHER_DELAY", "NAS_DELAY", "SECURITY_DELAY" ,"LATE_AIRCRAFT_DELAY"]

In [13]:
# ARR_Delay is the total delay on arrival in minutes , only 1.8% is missing 
# on the other hand ,  [CARRIER_DELAY WEATHER_DELAY NAS_DELAY SECURITY_DELAY LATE_AIRCRAFT_DELAY] values have 81.5% NA 
# removing all records with NA values will severely shrink dataset 
# before we remove all these records , we can try to fill NA values of delay features
# we compare number of zeros of a row with total number of records without NA
for delay in delays:
    print("type of delay :",delay)
    print("# of zeros :",sum(dataset[delay] == 0.0))
    print("# of zeros percent :",sum(dataset[delay] == 0.0)/((18.5/100)*dataset.shape[0]))
    print("mean : ",dataset[delay].mean())
    print("median : ",dataset[delay].median())
    print("--------")

type of delay : CARRIER_DELAY
# of zeros : 27281
# of zeros percent : 0.7373243243243244
mean :  19.809803959143782
median :  0.0
--------
type of delay : WEATHER_DELAY
# of zeros : 48736
# of zeros percent : 1.3171891891891891
mean :  3.6494068949657934
median :  0.0
--------
type of delay : NAS_DELAY
# of zeros : 23677
# of zeros percent : 0.6399189189189189
mean :  21.71107448786003
median :  3.0
--------
type of delay : SECURITY_DELAY
# of zeros : 52058
# of zeros percent : 1.406972972972973
mean :  0.056052737481555294
median :  0.0
--------
type of delay : LATE_AIRCRAFT_DELAY
# of zeros : 23128
# of zeros percent : 0.6250810810810811
mean :  31.872295575187323
median :  8.0
--------


In [14]:
# we can deduce that number of zeros is dominant in these features , so filling NAs with mean,ffill,backfill,mode or interpolation are not sensible (outliers will increase mean) 
# best options are zero filling or median(which will probably be 0)
for delay in delays:
    dataset[delay]=dataset[delay].fillna(dataset[delay].median()) 

In [15]:
display(dataset.head(n=10))

Unnamed: 0,FL_DATE,OP_CARRIER,OP_CARRIER_FL_NUM,ORIGIN,DEST,CRS_DEP_TIME,DEP_TIME,DEP_DELAY,TAXI_OUT,WHEELS_OFF,...,DIVERTED,CRS_ELAPSED_TIME,ACTUAL_ELAPSED_TIME,AIR_TIME,DISTANCE,CARRIER_DELAY,WEATHER_DELAY,NAS_DELAY,SECURITY_DELAY,LATE_AIRCRAFT_DELAY
0,2017-07-10,AA,1289,MCO,DFW,730,724.0,-6.0,14.0,738.0,...,0.0,166.0,174.0,137.0,985.0,0.0,0.0,3.0,0.0,8.0
1,2017-07-10,AA,1291,MKE,DFW,700,655.0,-5.0,11.0,706.0,...,0.0,152.0,154.0,124.0,853.0,0.0,0.0,3.0,0.0,8.0
2,2017-07-10,AA,1292,DFW,TUL,1245,1240.0,-5.0,12.0,1252.0,...,0.0,64.0,56.0,42.0,237.0,0.0,0.0,3.0,0.0,8.0
3,2017-07-10,AA,1292,TUL,DFW,1430,1429.0,-1.0,20.0,1449.0,...,0.0,73.0,68.0,38.0,237.0,0.0,0.0,3.0,0.0,8.0
4,2017-07-10,AA,1293,MIA,STX,1205,1208.0,3.0,23.0,1231.0,...,0.0,165.0,168.0,142.0,1139.0,0.0,0.0,3.0,0.0,8.0
5,2017-07-10,AA,1293,STX,MIA,1539,1530.0,-9.0,21.0,1551.0,...,0.0,184.0,176.0,146.0,1139.0,0.0,0.0,3.0,0.0,8.0
6,2017-07-10,AA,1294,DFW,JFK,1215,1224.0,9.0,18.0,1242.0,...,0.0,227.0,209.0,184.0,1391.0,0.0,0.0,3.0,0.0,8.0
7,2017-07-10,AA,1295,ORD,SJC,1715,1713.0,-2.0,25.0,1738.0,...,0.0,272.0,280.0,240.0,1829.0,0.0,0.0,3.0,0.0,8.0
8,2017-07-10,AA,1296,ORD,DFW,2140,2301.0,81.0,25.0,2326.0,...,0.0,135.0,141.0,108.0,802.0,10.0,0.0,6.0,0.0,71.0
9,2017-07-10,AA,1296,SAN,ORD,1420,,,,,...,0.0,250.0,,,1723.0,0.0,0.0,3.0,0.0,8.0


In [16]:
# drop cancellation code - 98% missing and irrelevant
dataset.drop('CANCELLATION_CODE', inplace=True, axis=1)


In [17]:
#drop NAs
dataset_visualization = dataset.dropna() 

In [18]:
#removed around 1.8% of data only
dataset_visualization.shape

(194225, 26)

In [19]:
print("carriers : ",dataset_visualization["OP_CARRIER"].unique())

carriers :  ['AA' 'DL' 'EV' 'NK' 'F9' 'HA' 'VX' 'B6' 'UA' 'WN' 'AS' 'OO' 'G4' 'MQ'
 'OH' 'YV' 'YX' '9E']


In [20]:
#after inspection of op_carriers feature levels , we figured actual carrier names 
dataset_visualization['OP_CARRIER'].replace({
    'UA':'United Airlines',
    'AS':'Alaska Airlines',
    '9E':'Endeavor Air',
    'B6':'JetBlue Airways',
    'EV':'ExpressJet',
    'F9':'Frontier Airlines',
    'G4':'Allegiant Air',
    'HA':'Hawaiian Airlines',
    'MQ':'Envoy Air',
    'NK':'Spirit Airlines',
    'OH':'PSA Airlines',
    'OO':'SkyWest Airlines',
    'VX':'Virgin America',
    'WN':'Southwest Airlines',
    'YV':'Mesa Airline',
    'YX':'Republic Airways',
    'AA':'American Airlines',
    'DL':'Delta Airlines'
}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset_visualization['OP_CARRIER'].replace({


In [21]:
print("carriers : ",dataset_visualization["OP_CARRIER"].unique())

carriers :  ['American Airlines' 'Delta Airlines' 'ExpressJet' 'Spirit Airlines'
 'Frontier Airlines' 'Hawaiian Airlines' 'Virgin America'
 'JetBlue Airways' 'United Airlines' 'Southwest Airlines'
 'Alaska Airlines' 'SkyWest Airlines' 'Allegiant Air' 'Envoy Air'
 'PSA Airlines' 'Mesa Airline' 'Republic Airways' 'Endeavor Air']


In [22]:
#seperate combined date format into datetime object 
dataset_visualization["FL_DATE"] = pd.to_datetime(dataset_visualization.FL_DATE)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset_visualization["FL_DATE"] = pd.to_datetime(dataset_visualization.FL_DATE)


In [23]:
dataset_visualization.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 194225 entries, 0 to 199999
Data columns (total 26 columns):
 #   Column               Non-Null Count   Dtype         
---  ------               --------------   -----         
 0   FL_DATE              194225 non-null  datetime64[ns]
 1   OP_CARRIER           194225 non-null  object        
 2   OP_CARRIER_FL_NUM    194225 non-null  int64         
 3   ORIGIN               194225 non-null  object        
 4   DEST                 194225 non-null  object        
 5   CRS_DEP_TIME         194225 non-null  int64         
 6   DEP_TIME             194225 non-null  float64       
 7   DEP_DELAY            194225 non-null  float64       
 8   TAXI_OUT             194225 non-null  float64       
 9   WHEELS_OFF           194225 non-null  float64       
 10  WHEELS_ON            194225 non-null  float64       
 11  TAXI_IN              194225 non-null  float64       
 12  CRS_ARR_TIME         194225 non-null  int64         
 13  ARR_TIME      

In [24]:
dataset_visualization['Month'] = pd.DatetimeIndex(dataset_visualization['FL_DATE']).month
dataset_visualization['Day'] = pd.DatetimeIndex(dataset_visualization['FL_DATE']).day
dataset_visualization['Year'] = pd.DatetimeIndex(dataset_visualization['FL_DATE']).year

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset_visualization['Month'] = pd.DatetimeIndex(dataset_visualization['FL_DATE']).month
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset_visualization['Day'] = pd.DatetimeIndex(dataset_visualization['FL_DATE']).day
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset_visualization['Year']

In [25]:
display(dataset_visualization.tail(n=1))

Unnamed: 0,FL_DATE,OP_CARRIER,OP_CARRIER_FL_NUM,ORIGIN,DEST,CRS_DEP_TIME,DEP_TIME,DEP_DELAY,TAXI_OUT,WHEELS_OFF,...,AIR_TIME,DISTANCE,CARRIER_DELAY,WEATHER_DELAY,NAS_DELAY,SECURITY_DELAY,LATE_AIRCRAFT_DELAY,Month,Day,Year
199999,2018-08-16,American Airlines,883,CLT,PHX,1750,1840.0,50.0,16.0,1856.0,...,226.0,1773.0,40.0,0.0,0.0,0.0,0.0,8,16,2018


In [26]:
#write the dataset for visualization using parquet dataformat , fast and low memory usage
dataset_visualization.to_parquet("./Dataset/dataset_visualization.parquet",index=False)

## After preparing a dataset for visualization , prepare the data for ML 

In [27]:
#drop flight number - not needed
dataset_visualization.drop('OP_CARRIER_FL_NUM', inplace=True, axis=1)

#drop flight date - irrelevant
dataset_visualization.drop('FL_DATE', inplace=True, axis=1)

# drop diverted - feature irrelevant to problem 
dataset_visualization.drop('DIVERTED', inplace=True, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset_visualization.drop('OP_CARRIER_FL_NUM', inplace=True, axis=1)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset_visualization.drop('FL_DATE', inplace=True, axis=1)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset_visualization.drop('DIVERTED', inplace=True, axis=1)


In [28]:
dataset_visualization.shape

(194225, 26)

In [29]:
#remove all rows having NA values
dataset_ML = dataset_visualization.dropna()

In [30]:
dataset_ML.shape

(194225, 26)

In [31]:
display(dataset_ML.head(n=10))

Unnamed: 0,OP_CARRIER,ORIGIN,DEST,CRS_DEP_TIME,DEP_TIME,DEP_DELAY,TAXI_OUT,WHEELS_OFF,WHEELS_ON,TAXI_IN,...,AIR_TIME,DISTANCE,CARRIER_DELAY,WEATHER_DELAY,NAS_DELAY,SECURITY_DELAY,LATE_AIRCRAFT_DELAY,Month,Day,Year
0,American Airlines,MCO,DFW,730,724.0,-6.0,14.0,738.0,855.0,23.0,...,137.0,985.0,0.0,0.0,3.0,0.0,8.0,7,10,2017
1,American Airlines,MKE,DFW,700,655.0,-5.0,11.0,706.0,910.0,19.0,...,124.0,853.0,0.0,0.0,3.0,0.0,8.0,7,10,2017
2,American Airlines,DFW,TUL,1245,1240.0,-5.0,12.0,1252.0,1334.0,2.0,...,42.0,237.0,0.0,0.0,3.0,0.0,8.0,7,10,2017
3,American Airlines,TUL,DFW,1430,1429.0,-1.0,20.0,1449.0,1527.0,10.0,...,38.0,237.0,0.0,0.0,3.0,0.0,8.0,7,10,2017
4,American Airlines,MIA,STX,1205,1208.0,3.0,23.0,1231.0,1453.0,3.0,...,142.0,1139.0,0.0,0.0,3.0,0.0,8.0,7,10,2017
5,American Airlines,STX,MIA,1539,1530.0,-9.0,21.0,1551.0,1817.0,9.0,...,146.0,1139.0,0.0,0.0,3.0,0.0,8.0,7,10,2017
6,American Airlines,DFW,JFK,1215,1224.0,9.0,18.0,1242.0,1646.0,7.0,...,184.0,1391.0,0.0,0.0,3.0,0.0,8.0,7,10,2017
7,American Airlines,ORD,SJC,1715,1713.0,-2.0,25.0,1738.0,1938.0,15.0,...,240.0,1829.0,0.0,0.0,3.0,0.0,8.0,7,10,2017
8,American Airlines,ORD,DFW,2140,2301.0,81.0,25.0,2326.0,114.0,8.0,...,108.0,802.0,10.0,0.0,6.0,0.0,71.0,7,10,2017
10,American Airlines,LAX,MIA,600,556.0,-4.0,24.0,620.0,1420.0,9.0,...,300.0,2342.0,0.0,0.0,3.0,0.0,8.0,7,10,2017


In [32]:
#print insights on number of origin and dest 
categorical = ["ORIGIN","DEST"] 
print("Number of departure locations : ",len(dataset_ML[categorical[0]].unique()))
print("Number of destination locations : ",len(dataset_ML[categorical[1]].unique()))

Number of departure locations :  349
Number of destination locations :  349


### Looking at the ORIGIN and DEST features , they have huge number of unique values , this means that :
- using hot encoding for converting to numerical features will yield huge number of features which will lead to the curse of dimensionality 
- using label encoding will yield to only one feature , but the feature values will have great std deviation and labels will have different priority
- so it would be better if we drop both

### for the op_carrier :
- different carriers may differ in plane services but irrelevant to our problem so it will be removed

### FL_Date is only used for visualization so it will be removed , same for month day and year features
- we could have made use of month feature because of its relation to a season in a year , but the data is only based on domestic flights of the US so it will biased to seasonality in the US only , model needs to generalize regardless

In [33]:
dataset_ML=dataset_ML.drop('OP_CARRIER', axis=1)

In [34]:
dataset_ML=dataset_ML.drop('ORIGIN',axis=1)

In [35]:
dataset_ML=dataset_ML.drop('DEST',axis=1)

In [36]:
dataset_ML=dataset_ML.drop('Month',axis=1)

In [37]:
dataset_ML=dataset_ML.drop('Day',axis=1)

In [38]:
dataset_ML=dataset_ML.drop('Year',axis=1)

In [39]:
display(dataset_ML.head(n=5))

Unnamed: 0,CRS_DEP_TIME,DEP_TIME,DEP_DELAY,TAXI_OUT,WHEELS_OFF,WHEELS_ON,TAXI_IN,CRS_ARR_TIME,ARR_TIME,ARR_DELAY,CANCELLED,CRS_ELAPSED_TIME,ACTUAL_ELAPSED_TIME,AIR_TIME,DISTANCE,CARRIER_DELAY,WEATHER_DELAY,NAS_DELAY,SECURITY_DELAY,LATE_AIRCRAFT_DELAY
0,730,724.0,-6.0,14.0,738.0,855.0,23.0,916,918.0,2.0,0.0,166.0,174.0,137.0,985.0,0.0,0.0,3.0,0.0,8.0
1,700,655.0,-5.0,11.0,706.0,910.0,19.0,932,929.0,-3.0,0.0,152.0,154.0,124.0,853.0,0.0,0.0,3.0,0.0,8.0
2,1245,1240.0,-5.0,12.0,1252.0,1334.0,2.0,1349,1336.0,-13.0,0.0,64.0,56.0,42.0,237.0,0.0,0.0,3.0,0.0,8.0
3,1430,1429.0,-1.0,20.0,1449.0,1527.0,10.0,1543,1537.0,-6.0,0.0,73.0,68.0,38.0,237.0,0.0,0.0,3.0,0.0,8.0
4,1205,1208.0,3.0,23.0,1231.0,1453.0,3.0,1450,1456.0,6.0,0.0,165.0,168.0,142.0,1139.0,0.0,0.0,3.0,0.0,8.0


In [39]:
#write the dataset for visualization using parquet dataformat , fast and low memory usage
dataset_ML.to_parquet("./Dataset/dataset_ML.parquet",index=False)