In [34]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pyarrow.dataset as ds

from sklearn.linear_model import LogisticRegression,LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline


# Loading and inspecting the data

In [5]:
# Only the columns listed here are read from the parquet file; the order of
# this list is the column order of the resulting frame.
cols = (
    ['Airline', 'Origin', 'Dest', 'CRSDepTime']            # carrier, route, scheduled departure
    + ['Month', 'DayOfWeek', 'Distance', 'DistanceGroup']  # calendar and route length
    + ['DepDel15', 'DepDelayMinutes']                      # departure-delay columns
    + ['ArrDel15', 'ArrDelayMinutes']                      # arrival-delay columns
)

df = pd.read_parquet('./data/2021_delay_ds.parquet', columns=cols)

In [None]:
# NOTE(review): `data` is not assigned in any cell visible in this notebook and
# this cell has no execution count, so on a fresh kernel it would presumably
# raise NameError. It looks like a leftover from loading a wider table -- confirm,
# then either restore that load or remove this cell.
data.shape

(4078318, 61)

In [6]:
# (rows, columns) of the column subset loaded into `df`.
df.shape


(6311871, 12)

In [None]:
# Summary statistics for the numeric columns of `df`; the `count` row also
# shows which columns contain missing values.
df.describe()

Unnamed: 0,CRSDepTime,Month,DayOfWeek,Distance,DistanceGroup,DepDel15,DepDelayMinutes,ArrDel15,ArrDelayMinutes
count,6311871.0,6311871.0,6311871.0,6311871.0,6311871.0,6203458.0,6203458.0,6185870.0,6185870.0
mean,1323.953,6.969905,4.013676,795.5762,3.653994,0.1731723,12.76132,0.1726611,12.52928
std,474.1972,3.300914,2.006264,583.2931,2.281563,0.3783962,47.36319,0.377954,46.7477
min,1.0,1.0,1.0,31.0,1.0,0.0,0.0,0.0,0.0
25%,920.0,4.0,2.0,370.0,2.0,0.0,0.0,0.0,0.0
50%,1315.0,7.0,4.0,646.0,3.0,0.0,0.0,0.0,0.0
75%,1723.0,10.0,6.0,1033.0,5.0,0.0,6.0,0.0,6.0
max,2359.0,12.0,7.0,5812.0,11.0,1.0,3095.0,1.0,3089.0


In [8]:
# Missing-value count per column, largest first (at most 20 columns shown).
df.isna().sum().sort_values(ascending=False).iloc[:20]

ArrDelayMinutes    126001
ArrDel15           126001
DepDelayMinutes    108413
DepDel15           108413
CRSDepTime              0
Dest                    0
Origin                  0
Airline                 0
DistanceGroup           0
Distance                0
DayOfWeek               0
Month                   0
dtype: int64

# Tasks
- Four prediction tasks
    - Classification
        1. ArrivalDelay -> binary classification
            - Will the flight reach its destination late?
        2. DepartureDelay -> binary classification
            - Will the flight take off late?
    - Regression
        1. ArrivalDelay
        2. DepartureDelay


## TASK 1 : DepartureDelay (Binary Classification)
### Does the flight take off late?

In [9]:
# Keep only the flights that carry a DepDel15 label; a row without the label
# cannot be used to train or evaluate the departure-delay classifier.
df_departed = df.loc[~df['DepDel15'].isna()].copy()

In [11]:
# (rows, columns) left after dropping the flights with no DepDel15 label.
df_departed.shape

(6203458, 12)

In [15]:
# Column names available in the departure-delay subset.
df_departed.columns

Index(['Airline', 'Origin', 'Dest', 'CRSDepTime', 'Month', 'DayOfWeek',
       'Distance', 'DistanceGroup', 'DepDel15', 'DepDelayMinutes', 'ArrDel15',
       'ArrDelayMinutes'],
      dtype='str')

In [14]:
# Per-column dtypes and the overall memory footprint of the subset.
df_departed.info()

<class 'pandas.DataFrame'>
Index: 6203458 entries, 0 to 573778
Data columns (total 12 columns):
 #   Column           Dtype  
---  ------           -----  
 0   Airline          str    
 1   Origin           str    
 2   Dest             str    
 3   CRSDepTime       int64  
 4   Month            int64  
 5   DayOfWeek        int64  
 6   Distance         float64
 7   DistanceGroup    int64  
 8   DepDel15         float64
 9   DepDelayMinutes  float64
 10  ArrDel15         float64
 11  ArrDelayMinutes  float64
dtypes: float64(5), int64(4), str(3)
memory usage: 770.1 MB


In [25]:
# Peek at the first few flights flagged as delayed at departure (DepDel15 == 1).
df_departed.loc[df_departed['DepDel15'].eq(1)].head()

Unnamed: 0,Airline,Origin,Dest,CRSDepTime,Month,DayOfWeek,Distance,DistanceGroup,DepDel15,DepDelayMinutes,ArrDel15,ArrDelayMinutes
36,SkyWest Airlines Inc.,FSM,DFW,617,3,3,227.0,1,1.0,22.0,0.0,14.0
51,SkyWest Airlines Inc.,DFW,FLG,2045,3,3,853.0,4,1.0,148.0,1.0,105.0
59,SkyWest Airlines Inc.,DFW,ASE,840,3,3,701.0,3,1.0,18.0,0.0,4.0
62,SkyWest Airlines Inc.,PHX,IAH,2005,3,3,1009.0,5,1.0,35.0,0.0,1.0
66,SkyWest Airlines Inc.,PHX,SLC,1843,3,3,507.0,3,1.0,109.0,1.0,89.0


In [26]:
# Predictors for the departure-delay classifier.
#
# The label itself (DepDel15) must not appear among the features, otherwise any
# model fed this frame directly would simply read the answer back. DepDelayMinutes
# is excluded for the same reason: it is the observed delay of the same departure
# (presumably DepDel15 is a threshold on it -- confirm against the data
# dictionary), so it leaks the label as well. Leaving both out also makes `x`
# and the train/test frames derived from it two float64 columns smaller.
features_departed_classification = [
    'Airline', 'Origin', 'Dest', 'CRSDepTime', 'Month', 'DayOfWeek',
    'Distance', 'DistanceGroup'
]

# Feature frame and binary target (1.0 = delayed, 0.0 = not delayed).
x = df_departed[features_departed_classification]
y = df_departed['DepDel15']

In [37]:
# Hold out 20% of the rows for testing. Stratifying on the label keeps the
# delayed / not-delayed ratio the same in both parts, and the fixed seed makes
# the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=8,
    stratify=y,
)

In [38]:
# String-valued columns, to be one-hot encoded by the preprocessing step.
categorical_cols = [
    'Airline',
    'Origin',
    'Dest',
]

# Integer / float columns, to be passed to the model as they are.
numeric_cols = ['CRSDepTime', 'Month', 'DayOfWeek', 'Distance', 'DistanceGroup']


In [39]:
# One-hot encode the categorical columns and forward the numeric columns
# untouched. With handle_unknown='ignore', a category that was not seen during
# fit is encoded as all zeros instead of raising at transform time.
# remainder='drop' is the library default, spelled out here on purpose: every
# input column that is not named in one of the two lists is discarded.
preprocessor = ColumnTransformer(
    [
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
        ('num', 'passthrough', numeric_cols),
    ],
    remainder='drop',
)


In [40]:
# Learn the one-hot categories from the training split only, then apply the
# already-fitted transformer to the test split so both matrices have the same
# column layout.
# NOTE(review): the recorded output of this cell is a MemoryError (failed to
# allocate a 151 MiB int32 array), so the saved run did not complete here.
# Mitigations worth evaluating: prototype on a row sample, store the string
# columns as `category`, and release frames that are no longer needed before
# encoding.
X_train_encoded = preprocessor.fit_transform(x_train)
X_test_encoded = preprocessor.transform(x_test)



MemoryError: Unable to allocate 151. MiB for an array with shape (39702128,) and data type int32

## TASK 2 : ArrivalDelay (Binary Classification) 
### Does the flight reach its destination late?