In [2]:
import pandas as pd
import numpy as np

In [3]:
df = pd.read_csv('../data/samples/sample.csv')
df

Unnamed: 0,FlightDate,Airline,Origin,Dest,Cancelled,Diverted,CRSDepTime,DepTime,DepDelayMinutes,DepDelay,...,WheelsOff,WheelsOn,TaxiIn,CRSArrTime,ArrDelay,ArrDel15,ArrivalDelayGroups,ArrTimeBlk,DistanceGroup,DivAirportLandings
0,2022-07-17,Alaska Airlines Inc.,JFK,PDX,False,False,1145,1139.0,0.0,-6.0,...,1210.0,1434.0,8.0,1450,-8.0,0.0,-1.0,1400-1459,10,0
1,2022-05-24,Southwest Airlines Co.,AUS,FLL,False,False,720,738.0,18.0,18.0,...,749.0,1104.0,3.0,1100,7.0,0.0,0.0,1100-1159,5,0
2,2022-06-09,SkyWest Airlines Inc.,LAX,RNO,False,False,949,947.0,0.0,-2.0,...,957.0,1102.0,7.0,1120,-11.0,0.0,-1.0,1100-1159,2,0
3,2022-05-08,American Airlines Inc.,DCA,CLT,False,False,650,640.0,0.0,-10.0,...,710.0,814.0,7.0,823,-2.0,0.0,-1.0,0800-0859,2,0
4,2022-06-09,JetBlue Airways,DCA,RSW,False,False,1304,1321.0,17.0,17.0,...,1337.0,1558.0,6.0,1547,17.0,1.0,1.0,1500-1559,4,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
652526,2022-06-29,Delta Air Lines Inc.,OMA,ATL,False,False,541,534.0,0.0,-7.0,...,552.0,833.0,12.0,854,-9.0,0.0,-1.0,0800-0859,4,0
652527,2022-07-05,American Airlines Inc.,DCA,MIA,False,False,1155,1203.0,8.0,8.0,...,1222.0,1422.0,7.0,1434,-5.0,0.0,-1.0,1400-1459,4,0
652528,2022-05-31,American Airlines Inc.,CLT,LGA,False,False,550,546.0,0.0,-4.0,...,558.0,722.0,9.0,729,2.0,0.0,0.0,0700-0759,3,0
652529,2022-05-21,Endeavor Air Inc.,LGA,GSO,False,False,825,910.0,45.0,45.0,...,932.0,1039.0,7.0,1020,26.0,1.0,1.0,1000-1059,2,0


In [4]:
df.columns

Index(['FlightDate', 'Airline', 'Origin', 'Dest', 'Cancelled', 'Diverted',
       'CRSDepTime', 'DepTime', 'DepDelayMinutes', 'DepDelay', 'ArrTime',
       'ArrDelayMinutes', 'AirTime', 'CRSElapsedTime', 'ActualElapsedTime',
       'Distance', 'Year', 'Quarter', 'Month', 'DayofMonth', 'DayOfWeek',
       'Marketing_Airline_Network', 'Operated_or_Branded_Code_Share_Partners',
       'DOT_ID_Marketing_Airline', 'IATA_Code_Marketing_Airline',
       'Flight_Number_Marketing_Airline', 'Operating_Airline',
       'DOT_ID_Operating_Airline', 'IATA_Code_Operating_Airline',
       'Tail_Number', 'Flight_Number_Operating_Airline', 'OriginAirportID',
       'OriginAirportSeqID', 'OriginCityMarketID', 'OriginCityName',
       'OriginState', 'OriginStateFips', 'OriginStateName', 'OriginWac',
       'DestAirportID', 'DestAirportSeqID', 'DestCityMarketID', 'DestCityName',
       'DestState', 'DestStateFips', 'DestStateName', 'DestWac', 'DepDel15',
       'DepartureDelayGroups', 'DepTimeBlk', 'TaxiOu

# Preprocessing

In [5]:
dest_col_drop = list(filter(lambda c: c.startswith('Dest') and c not in ['Dest', 'DestCityName', 'DestStateName', 'DestWac'], df.columns))
dest_col_drop

['DestAirportID',
 'DestAirportSeqID',
 'DestCityMarketID',
 'DestState',
 'DestStateFips']

In [6]:
origin_col_drop = list(filter(lambda c: c.startswith('Origin') and c not in ['Origin', 'OriginCityName', 'OriginStateName', 'OriginWac'], df.columns))
origin_col_drop

['OriginAirportID',
 'OriginAirportSeqID',
 'OriginCityMarketID',
 'OriginState',
 'OriginStateFips']

In [7]:
dep_col_drop = list(filter(lambda c: c.startswith('Dep') and c not in ['DepTime', 'DepDelay'], df.columns))
dep_col_drop

['DepDelayMinutes', 'DepDel15', 'DepartureDelayGroups', 'DepTimeBlk']

In [8]:
arr_col_drop = list(filter(lambda c: c.startswith('Arr') and c not in ['ArrTime', 'ArrDelay'], df.columns))
arr_col_drop

['ArrDelayMinutes', 'ArrDel15', 'ArrivalDelayGroups', 'ArrTimeBlk']

In [9]:
from datetime import datetime
flight_date_formatted = df['FlightDate'].map(lambda d: datetime.strptime(d, '%Y-%m-%d'))
time_col_drop = ['Year', 'Quarter', 'Month', 'DayofMonth', 'DayOfWeek']
flight_date_formatted

0        2022-07-17
1        2022-05-24
2        2022-06-09
3        2022-05-08
4        2022-06-09
            ...    
652526   2022-06-29
652527   2022-07-05
652528   2022-05-31
652529   2022-05-21
652530   2022-07-09
Name: FlightDate, Length: 652531, dtype: datetime64[ns]

In [10]:
misc_col_drop = ['DistanceGroup']

In [11]:
df.drop(
    [
        *origin_col_drop,
        *dest_col_drop,
        *dep_col_drop,
        *arr_col_drop,
        *time_col_drop,
        *misc_col_drop,
    ],
    axis=1,
    inplace=True
)

In [12]:
df['Delayed'] = np.where(df['DepDelay'] > 0, True, False)
df['On-time'] = np.where(df['DepDelay'] <= 0, True, False)
df

Unnamed: 0,FlightDate,Airline,Origin,Dest,Cancelled,Diverted,CRSDepTime,DepTime,DepDelay,ArrTime,...,DestWac,TaxiOut,WheelsOff,WheelsOn,TaxiIn,CRSArrTime,ArrDelay,DivAirportLandings,Delayed,On-time
0,2022-07-17,Alaska Airlines Inc.,JFK,PDX,False,False,1145,1139.0,-6.0,1442.0,...,92,31.0,1210.0,1434.0,8.0,1450,-8.0,0,False,True
1,2022-05-24,Southwest Airlines Co.,AUS,FLL,False,False,720,738.0,18.0,1107.0,...,33,11.0,749.0,1104.0,3.0,1100,7.0,0,True,False
2,2022-06-09,SkyWest Airlines Inc.,LAX,RNO,False,False,949,947.0,-2.0,1109.0,...,85,10.0,957.0,1102.0,7.0,1120,-11.0,0,False,True
3,2022-05-08,American Airlines Inc.,DCA,CLT,False,False,650,640.0,-10.0,821.0,...,36,30.0,710.0,814.0,7.0,823,-2.0,0,False,True
4,2022-06-09,JetBlue Airways,DCA,RSW,False,False,1304,1321.0,17.0,1604.0,...,33,16.0,1337.0,1558.0,6.0,1547,17.0,0,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
652526,2022-06-29,Delta Air Lines Inc.,OMA,ATL,False,False,541,534.0,-7.0,845.0,...,34,18.0,552.0,833.0,12.0,854,-9.0,0,False,True
652527,2022-07-05,American Airlines Inc.,DCA,MIA,False,False,1155,1203.0,8.0,1429.0,...,33,19.0,1222.0,1422.0,7.0,1434,-5.0,0,True,False
652528,2022-05-31,American Airlines Inc.,CLT,LGA,False,False,550,546.0,-4.0,731.0,...,22,12.0,558.0,722.0,9.0,729,2.0,0,False,True
652529,2022-05-21,Endeavor Air Inc.,LGA,GSO,False,False,825,910.0,45.0,1046.0,...,36,22.0,932.0,1039.0,7.0,1020,26.0,0,True,False


In [13]:
conditions = [df['Cancelled'], df['Diverted'], df['Delayed'], df['On-time']]
values = [0, 1, 2, 3]

df['Status'] = np.select(conditions, values)
df.drop(['Cancelled', 'Diverted', 'Delayed', 'On-time'], axis=1, inplace=True)

In [28]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 652531 entries, 0 to 652530
Data columns (total 36 columns):
 #   Column                                   Non-Null Count   Dtype  
---  ------                                   --------------   -----  
 0   FlightDate                               652531 non-null  object 
 1   Airline                                  652531 non-null  object 
 2   Origin                                   652531 non-null  object 
 3   Dest                                     652531 non-null  object 
 4   CRSDepTime                               652531 non-null  int64  
 5   DepTime                                  633490 non-null  float64
 6   DepDelay                                 633476 non-null  float64
 7   ArrTime                                  632841 non-null  float64
 8   AirTime                                  631354 non-null  float64
 9   CRSElapsedTime                           652531 non-null  float64
 10  ActualElapsedTime               

In [34]:
from sklearn.model_selection import train_test_split

X = df.drop(['Status'], axis=1)
y = df['Status']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [38]:
from sklearn.feature_extraction import FeatureHasher


hasher = FeatureHasher(input_type='string')
X_trans = hasher.fit_transform(X)
X_trans


ValueError: Samples can not be a single string. The input must be an iterable over iterables of strings.

# EDA
Trying to find any relations within data

In [14]:
import matplotlib.pyplot as plt
import seaborn as sn

In [16]:
df['Airline'] = df['Airline'].astype('str')