# Predicting arrival delay using Linear Regression

In [9]:
# importing necessary packages
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [13]:
# Reading dataset into pandas' dataframe
flights_df = pd.read_csv("Allmonths_Flight_Data.csv", header=0)

In [14]:
flights_df.head()

Unnamed: 0,YEAR,MONTH,DAY_OF_WEEK,FL_DATE,OP_UNIQUE_CARRIER,TAIL_NUM,OP_CARRIER_FL_NUM,ORIGIN,DEST,CRS_DEP_TIME,...,CRS_ARR_TIME,ARR_TIME,ARR_DELAY,ARR_DEL15,ARR_TIME_BLK,CANCELLED,CRS_ELAPSED_TIME,ACTUAL_ELAPSED_TIME,AIR_TIME,DISTANCE
0,2022,1,3,1/12/2022 12:00:00 AM,YX,N420YX,4904,LGA,BNA,620,...,807,748.0,-19.0,0.0,0800-0859,0.0,167.0,152.0,126.0,764.0
1,2022,1,4,1/13/2022 12:00:00 AM,YX,N124HQ,4904,LGA,BNA,620,...,807,745.0,-22.0,0.0,0800-0859,0.0,167.0,151.0,124.0,764.0
2,2022,1,5,1/14/2022 12:00:00 AM,YX,N425YX,4904,LGA,BNA,620,...,807,800.0,-7.0,0.0,0800-0859,0.0,167.0,131.0,112.0,764.0
3,2022,1,7,1/16/2022 12:00:00 AM,YX,N106HQ,4904,LGA,BNA,620,...,807,746.0,-21.0,0.0,0800-0859,0.0,167.0,157.0,133.0,764.0
4,2022,1,1,1/17/2022 12:00:00 AM,YX,N450YX,4904,LGA,BNA,620,...,807,755.0,-12.0,0.0,0800-0859,0.0,167.0,143.0,120.0,764.0


In [16]:
flights_dropped_df = flights_df.drop([
    'ACTUAL_ELAPSED_TIME',
 'AIR_TIME',
 'ARR_DEL15',
 'ARR_DELAY',
 'ARR_TIME',
 'ARR_TIME_BLK',
## 'CANCELLATION_CODE',
 'CANCELLED',
## 'CARRIER_DELAY',
#  'CRS_ARR_TIME',
#  'CRS_DEP_TIME',
#  'CRS_ELAPSED_TIME',
#  'DAY_OF_WEEK',
 'DEP_DEL15',
#  'DEP_DELAY',
 'DEP_TIME',
 'DEP_TIME_BLK',
#  'DEST',
#  'DISTANCE',
## 'DIVERTED',
 'FL_DATE',
## 'LATE_AIRCRAFT_DELAY',
#  'MONTH',
## 'NAS_DELAY',
#  'OP_CARRIER_FL_NUM',
#  'OP_UNIQUE_CARRIER',
#  'ORIGIN',
## 'SECURITY_DELAY',
 'TAIL_NUM',
 'TAXI_IN',
 'TAXI_OUT',
## 'WEATHER_DELAY',
 'WHEELS_OFF',
 'WHEELS_ON',
#  'YEAR'
],axis=1)

In [17]:
flights_dropped_df.isna().value_counts()

YEAR   MONTH  DAY_OF_WEEK  OP_UNIQUE_CARRIER  OP_CARRIER_FL_NUM  ORIGIN  DEST   CRS_DEP_TIME  DEP_DELAY  CRS_ARR_TIME  CRS_ELAPSED_TIME  DISTANCE
False  False  False        False              False              False   False  False         False      False         False             False       290193
dtype: int64

In [18]:
flights_df["CANCELLED"][flights_df["DEP_DELAY"].isna() == False]

0         0.0
1         0.0
2         0.0
3         0.0
4         0.0
         ... 
290188    0.0
290189    0.0
290190    0.0
290191    0.0
290192    0.0
Name: CANCELLED, Length: 290193, dtype: float64

In [19]:
flights_dropped_df.drop(flights_dropped_df[flights_dropped_df.DEP_DELAY.isna() == True].index, inplace=True)

In [20]:
flights_dropped_df.isna().value_counts()

YEAR   MONTH  DAY_OF_WEEK  OP_UNIQUE_CARRIER  OP_CARRIER_FL_NUM  ORIGIN  DEST   CRS_DEP_TIME  DEP_DELAY  CRS_ARR_TIME  CRS_ELAPSED_TIME  DISTANCE
False  False  False        False              False              False   False  False         False      False         False             False       290193
dtype: int64

In [21]:

# boxplot = flights_dropped_df.boxplot(column=['DEP_DELAY'])

In [22]:
# flights_dropped_df["DEP_DELAY"][flights_dropped_df['DEP_DELAY'] == 299].describe()

In [23]:
#Function that classifies delay into  sections
def classify(num):
    if (num < 0):
        return ('No Delay') 
    else:
        if (num < 15):
            return ('Late: < 15 mins')
        elif (num < 30):
            return ('Late: < 30 mins')
        elif (num < 60):
            return ('Late: < 1 hours')
        elif (num < 90):
            return ('Late: < 1.5 hours')
        elif (num < 120):
            return ('Late: < 2 hours')
        elif (num < 150):
            return ('Late: < 2.5 hours')
        elif (num < 180):
            return ('Late: < 3 hours')
        elif (num < 240):
            return ('Late: < 4 hours')
        elif (num < 300):
            return ('Late: < 5 hours')
        else:
            return ('Late: > 5 hours')


# Transforming arrival delay to fall into one of 5 categories
flights_dropped_df['DEP_DELAY'] = flights_dropped_df['DEP_DELAY'].apply(lambda x: classify(x))

In [24]:
flights_dropped_df.isna().value_counts()

YEAR   MONTH  DAY_OF_WEEK  OP_UNIQUE_CARRIER  OP_CARRIER_FL_NUM  ORIGIN  DEST   CRS_DEP_TIME  DEP_DELAY  CRS_ARR_TIME  CRS_ELAPSED_TIME  DISTANCE
False  False  False        False              False              False   False  False         False      False         False             False       290193
dtype: int64

In [25]:
flights_dropped_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 290193 entries, 0 to 290192
Data columns (total 12 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   YEAR               290193 non-null  int64  
 1   MONTH              290193 non-null  int64  
 2   DAY_OF_WEEK        290193 non-null  int64  
 3   OP_UNIQUE_CARRIER  290193 non-null  object 
 4   OP_CARRIER_FL_NUM  290193 non-null  int64  
 5   ORIGIN             290193 non-null  object 
 6   DEST               290193 non-null  object 
 7   CRS_DEP_TIME       290193 non-null  int64  
 8   DEP_DELAY          290193 non-null  object 
 9   CRS_ARR_TIME       290193 non-null  int64  
 10  CRS_ELAPSED_TIME   290193 non-null  float64
 11  DISTANCE           290193 non-null  float64
dtypes: float64(2), int64(6), object(4)
memory usage: 26.6+ MB


In [26]:
flights_dropped_df['DEP_DELAY'].value_counts()

No Delay             167409
Late: < 15 mins       54449
Late: < 30 mins       20556
Late: < 1 hours       20177
Late: < 1.5 hours     10410
Late: < 2 hours        5978
Late: < 2.5 hours      3606
Late: < 4 hours        2516
Late: < 3 hours        2333
Late: > 5 hours        1555
Late: < 5 hours        1204
Name: DEP_DELAY, dtype: int64

In [27]:
# Encoding categorical variables
flights_df_categorical = flights_dropped_df[['OP_UNIQUE_CARRIER','ORIGIN','DEST']]
categorical_df = pd.get_dummies(flights_df_categorical,drop_first=True)

In [28]:
# label enoding the target variable
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
flights_dropped_df['DEP_DELAY'] = le.fit_transform(flights_dropped_df['DEP_DELAY'].values)

In [29]:
# Dropping all categorical columns that are one-hot encoded, tail number and date columns
flights_dropped_df = flights_dropped_df.drop(['OP_UNIQUE_CARRIER','ORIGIN','DEST'],axis=1)

In [30]:
# concatenating one-hot encoded columns with other numerical columns
flights_dropped_df = pd.concat([flights_dropped_df,categorical_df],axis=1)

In [34]:
# only for training purposes
flights_dropped_df = flights_dropped_df[:10000]

In [35]:
# Splitting data into features and target variable
X = flights_dropped_df.drop(['DEP_DELAY'], axis=1)
y = flights_dropped_df['DEP_DELAY']

In [36]:
# Splitting data into test and train sets
X_train,X_test,y_train, y_test = train_test_split(X,y,random_state=100,test_size=0.20,shuffle=True)

In [37]:
# Performing linear regression to predict the arrival delay
model = LogisticRegression()
lr = model.fit(X_train,y_train)
print(lr.score(X_test,y_test))

0.6835


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [63]:
print(lr.n_features_in_)
print(lr.feature_names_in_)

253
['YEAR' 'MONTH' 'DAY_OF_WEEK' 'OP_CARRIER_FL_NUM' 'CRS_DEP_TIME'
 'CRS_ARR_TIME' 'CRS_ELAPSED_TIME' 'DISTANCE' 'OP_UNIQUE_CARRIER_AA'
 'OP_UNIQUE_CARRIER_AS' 'OP_UNIQUE_CARRIER_B6' 'OP_UNIQUE_CARRIER_DL'
 'OP_UNIQUE_CARRIER_F9' 'OP_UNIQUE_CARRIER_G4' 'OP_UNIQUE_CARRIER_HA'
 'OP_UNIQUE_CARRIER_MQ' 'OP_UNIQUE_CARRIER_NK' 'OP_UNIQUE_CARRIER_OH'
 'OP_UNIQUE_CARRIER_OO' 'OP_UNIQUE_CARRIER_UA' 'OP_UNIQUE_CARRIER_WN'
 'OP_UNIQUE_CARRIER_YV' 'OP_UNIQUE_CARRIER_YX' 'ORIGIN_ACK' 'ORIGIN_AGS'
 'ORIGIN_ALB' 'ORIGIN_ATL' 'ORIGIN_AUS' 'ORIGIN_AVL' 'ORIGIN_BGM'
 'ORIGIN_BGR' 'ORIGIN_BHM' 'ORIGIN_BNA' 'ORIGIN_BOS' 'ORIGIN_BQN'
 'ORIGIN_BTV' 'ORIGIN_BUF' 'ORIGIN_BUR' 'ORIGIN_BWI' 'ORIGIN_BZN'
 'ORIGIN_CAE' 'ORIGIN_CHO' 'ORIGIN_CHS' 'ORIGIN_CLE' 'ORIGIN_CLT'
 'ORIGIN_CMH' 'ORIGIN_CVG' 'ORIGIN_DAL' 'ORIGIN_DAY' 'ORIGIN_DCA'
 'ORIGIN_DEN' 'ORIGIN_DFW' 'ORIGIN_DSM' 'ORIGIN_DTW' 'ORIGIN_EGE'
 'ORIGIN_ELM' 'ORIGIN_EWR' 'ORIGIN_EYW' 'ORIGIN_FLL' 'ORIGIN_GRR'
 'ORIGIN_GSO' 'ORIGIN_GSP' 'ORIGIN_HNL' 'ORIGIN

In [57]:
test_data = X_test.iloc[[1018]].values.tolist()
print(test_data)

[[2022.0, 1.0, 5.0, 4913.0, 1200.0, 1326.0, 86.0, 290.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0

In [59]:
import numpy as np
lr.predict(np.array(test_data))



array([10])

In [66]:
le.inverse_transform([10])[0]

'No Delay'

In [65]:
# checking the actual delay recorded, predicted and actual delay is the same
y_test.iloc[[1018]]

115    10
Name: DEP_DELAY, dtype: int64

Index(['YEAR', 'MONTH', 'DAY_OF_WEEK', 'OP_CARRIER_FL_NUM', 'CRS_DEP_TIME',
       'DEP_DELAY', 'CRS_ARR_TIME', 'CRS_ELAPSED_TIME', 'DISTANCE',
       'OP_UNIQUE_CARRIER_AA',
       ...
       'DEST_SNA', 'DEST_SRQ', 'DEST_STL', 'DEST_STT', 'DEST_SWF', 'DEST_SYR',
       'DEST_TPA', 'DEST_TYS', 'DEST_VPS', 'DEST_XNA'],
      dtype='object', length=254)