# Predicting arrival delay using Linear Regression

In [1]:
# importing necessary packages
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

In [2]:
# Reading dataset into pandas' dataframe
flights_df = pd.read_csv("../data/outdated/monthly_data/AllMonths_RAW.csv", header=0)

In [3]:
flights_df.head()

Unnamed: 0,YEAR,MONTH,DAY_OF_WEEK,FL_DATE,OP_UNIQUE_CARRIER,TAIL_NUM,OP_CARRIER_FL_NUM,ORIGIN,DEST,CRS_DEP_TIME,...,DIVERTED,CRS_ELAPSED_TIME,ACTUAL_ELAPSED_TIME,AIR_TIME,DISTANCE,CARRIER_DELAY,WEATHER_DELAY,NAS_DELAY,SECURITY_DELAY,LATE_AIRCRAFT_DELAY
0,2022,1,3,1/12/2022 12:00:00 AM,YX,N420YX,4904,LGA,BNA,620,...,0.0,167.0,152.0,126.0,764.0,,,,,
1,2022,1,4,1/13/2022 12:00:00 AM,YX,N124HQ,4904,LGA,BNA,620,...,0.0,167.0,151.0,124.0,764.0,,,,,
2,2022,1,5,1/14/2022 12:00:00 AM,YX,N425YX,4904,LGA,BNA,620,...,0.0,167.0,131.0,112.0,764.0,,,,,
3,2022,1,7,1/16/2022 12:00:00 AM,YX,N106HQ,4904,LGA,BNA,620,...,0.0,167.0,157.0,133.0,764.0,,,,,
4,2022,1,1,1/17/2022 12:00:00 AM,YX,N450YX,4904,LGA,BNA,620,...,0.0,167.0,143.0,120.0,764.0,,,,,


In [4]:
flights_dropped_df = flights_df.drop([
    'ACTUAL_ELAPSED_TIME',
 'AIR_TIME',
 'ARR_DEL15',
 'ARR_DELAY',
 'ARR_TIME',
 'ARR_TIME_BLK',
 'CANCELLATION_CODE',
 'CANCELLED',
 'CARRIER_DELAY',
#  'CRS_ARR_TIME',
#  'CRS_DEP_TIME',
#  'CRS_ELAPSED_TIME',
#  'DAY_OF_WEEK',
 'DEP_DEL15',
#  'DEP_DELAY',
 'DEP_TIME',
 'DEP_TIME_BLK',
#  'DEST',
#  'DISTANCE',
 'DIVERTED',
 'FL_DATE',
 'LATE_AIRCRAFT_DELAY',
#  'MONTH',
 'NAS_DELAY',
#  'OP_CARRIER_FL_NUM',
#  'OP_UNIQUE_CARRIER',
#  'ORIGIN',
 'SECURITY_DELAY',
 'TAIL_NUM',
 'TAXI_IN',
 'TAXI_OUT',
 'WEATHER_DELAY',
 'WHEELS_OFF',
 'WHEELS_ON',
#  'YEAR'
],axis=1)

In [5]:
flights_dropped_df.isna().value_counts()

YEAR   MONTH  DAY_OF_WEEK  OP_UNIQUE_CARRIER  OP_CARRIER_FL_NUM  ORIGIN  DEST   CRS_DEP_TIME  DEP_DELAY  CRS_ARR_TIME  CRS_ELAPSED_TIME  DISTANCE
False  False  False        False              False              False   False  False         False      False         False             False       291604
                                                                                              True       False         False             False        15886
dtype: int64

In [6]:
flights_dropped_df.shape

(307490, 12)

In [7]:
flights_df["CANCELLED"][flights_df["DEP_DELAY"].isna() == False]

0         0.0
1         0.0
2         0.0
3         0.0
4         0.0
         ... 
307485    0.0
307486    0.0
307487    0.0
307488    0.0
307489    0.0
Name: CANCELLED, Length: 291604, dtype: float64

In [8]:
flights_dropped_df.drop(flights_dropped_df[flights_dropped_df.DEP_DELAY.isna() == True].index, inplace=True)

In [9]:
flights_dropped_df.shape

(291604, 12)

In [10]:

# boxplot = flights_dropped_df.boxplot(column=['DEP_DELAY'])

In [11]:
# flights_dropped_df["DEP_DELAY"][flights_dropped_df['DEP_DELAY'] == 299].describe()

In [12]:
#Function that classifies delay into  sections
def classify(num):
    # if (num < 0):
    #     return ('No Delay') 
    if (num < 0):
        if (num < -30):
            return ('Early: > 30 mins') 
        else:
            return ('Early: < 30 mins')
    else:
        # if (num < 15):
        #     return ('Late: < 15 mins')
        # elif (num < 30):
        #     return ('Late: < 30 mins')
        # elif (num < 60):
        #     return ('Late: < 1 hours')
        # elif (num < 90):
        #     return ('Late: < 1.5 hours')
        # elif (num < 120):
        #     return ('Late: < 2 hours')
        # elif (num < 150):
        #     return ('Late: < 2.5 hours')
        # elif (num < 180):
        #     return ('Late: < 3 hours')
        # elif (num < 240):
        #     return ('Late: < 4 hours')
        # elif (num < 300):
        #     return ('Late: < 5 hours')
        # else:
        #     return ('Late: > 5 hours')
        if (num < 30):
            return ('Late: < 30 mins')
        elif (num < 250):
            return ('Late: < 5 hours')
        else:
            return ('Late: > 5 hours')

# Transforming arrival delay to fall into one of 5 categories
flights_dropped_df['DEP_DELAY'] = flights_dropped_df['DEP_DELAY'].apply(lambda x: classify(x))

In [13]:
flights_dropped_df.isna().value_counts()

YEAR   MONTH  DAY_OF_WEEK  OP_UNIQUE_CARRIER  OP_CARRIER_FL_NUM  ORIGIN  DEST   CRS_DEP_TIME  DEP_DELAY  CRS_ARR_TIME  CRS_ELAPSED_TIME  DISTANCE
False  False  False        False              False              False   False  False         False      False         False             False       291604
dtype: int64

In [14]:
flights_dropped_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 291604 entries, 0 to 307489
Data columns (total 12 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   YEAR               291604 non-null  int64  
 1   MONTH              291604 non-null  int64  
 2   DAY_OF_WEEK        291604 non-null  int64  
 3   OP_UNIQUE_CARRIER  291604 non-null  object 
 4   OP_CARRIER_FL_NUM  291604 non-null  int64  
 5   ORIGIN             291604 non-null  object 
 6   DEST               291604 non-null  object 
 7   CRS_DEP_TIME       291604 non-null  int64  
 8   DEP_DELAY          291604 non-null  object 
 9   CRS_ARR_TIME       291604 non-null  int64  
 10  CRS_ELAPSED_TIME   291604 non-null  float64
 11  DISTANCE           291604 non-null  float64
dtypes: float64(2), int64(6), object(4)
memory usage: 28.9+ MB


In [15]:
flights_dropped_df['DEP_DELAY'].value_counts()

Early: < 30 mins    167828
Late: < 30 mins      75375
Late: < 5 hours      45841
Late: > 5 hours       2551
Early: > 30 mins         9
Name: DEP_DELAY, dtype: int64

In [16]:
# Encoding categorical variables
flights_df_categorical = flights_dropped_df[['OP_UNIQUE_CARRIER','ORIGIN','DEST']]
categorical_df = pd.get_dummies(flights_df_categorical,drop_first=True)

In [17]:
# label enoding the target variable
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
flights_dropped_df['DEP_DELAY'] = le.fit_transform(flights_dropped_df['DEP_DELAY'].values)

In [18]:
# Dropping all categorical columns that are one-hot encoded, tail number and date columns
flights_dropped_df = flights_dropped_df.drop(['OP_UNIQUE_CARRIER','ORIGIN','DEST'],axis=1)

In [19]:
# concatenating one-hot encoded columns with other numerical columns
flights_dropped_df = pd.concat([flights_dropped_df,categorical_df],axis=1)

In [20]:
# Splitting data into features and target variable
X = flights_dropped_df.drop(['DEP_DELAY'], axis=1)
y = flights_dropped_df['DEP_DELAY']

In [21]:
# Splitting data into test and train sets
X_train,X_test,y_train, y_test = train_test_split(X,y,random_state=100,test_size=0.20,shuffle=True)

In [22]:
# Performing linear regression to predict the arrival delay
model = LinearRegression()
lr = model.fit(X_train,y_train)
print(lr.score(X_test,y_test))

0.09966619390926768
