# Predicting arrival delay using Linear Regression

In [2]:
# importing necessary packages
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

In [3]:
# Reading dataset into pandas' dataframe
flights_df = pd.read_csv("Allmonths_Flight_Data.csv", header=0)

In [4]:
flights_df.head()

Unnamed: 0,YEAR,MONTH,DAY_OF_WEEK,FL_DATE,OP_UNIQUE_CARRIER,TAIL_NUM,OP_CARRIER_FL_NUM,ORIGIN,DEST,CRS_DEP_TIME,...,CRS_ARR_TIME,ARR_TIME,ARR_DELAY,ARR_DEL15,ARR_TIME_BLK,CANCELLED,CRS_ELAPSED_TIME,ACTUAL_ELAPSED_TIME,AIR_TIME,DISTANCE
0,2022,1,3,1/12/2022 12:00:00 AM,YX,N420YX,4904,LGA,BNA,620,...,807,748.0,-19.0,0.0,0800-0859,0.0,167.0,152.0,126.0,764.0
1,2022,1,4,1/13/2022 12:00:00 AM,YX,N124HQ,4904,LGA,BNA,620,...,807,745.0,-22.0,0.0,0800-0859,0.0,167.0,151.0,124.0,764.0
2,2022,1,5,1/14/2022 12:00:00 AM,YX,N425YX,4904,LGA,BNA,620,...,807,800.0,-7.0,0.0,0800-0859,0.0,167.0,131.0,112.0,764.0
3,2022,1,7,1/16/2022 12:00:00 AM,YX,N106HQ,4904,LGA,BNA,620,...,807,746.0,-21.0,0.0,0800-0859,0.0,167.0,157.0,133.0,764.0
4,2022,1,1,1/17/2022 12:00:00 AM,YX,N450YX,4904,LGA,BNA,620,...,807,755.0,-12.0,0.0,0800-0859,0.0,167.0,143.0,120.0,764.0


In [5]:
#Function that classifies delay into 5 sections
def classify(num):
    if (num < 0):
        if (num < -30):
            return ('Early: > 30 mins') 
        else:
            return ('Early: < 30 mins')
    else:
        if (num < 30):
            return ('Late: < 30 mins')
        elif (num < 250):
            return ('Late: < 5 hours')
        else:
            return ('Late: > 5 hours')


# Transforming arrival delay to fall into one of 5 categories
flights_df['ARR_DELAY'] = flights_df['ARR_DELAY'].apply(lambda x: classify(x))

In [6]:
flights_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 290193 entries, 0 to 290192
Data columns (total 28 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   YEAR                 290193 non-null  int64  
 1   MONTH                290193 non-null  int64  
 2   DAY_OF_WEEK          290193 non-null  int64  
 3   FL_DATE              290193 non-null  object 
 4   OP_UNIQUE_CARRIER    290193 non-null  object 
 5   TAIL_NUM             290193 non-null  object 
 6   OP_CARRIER_FL_NUM    290193 non-null  int64  
 7   ORIGIN               290193 non-null  object 
 8   DEST                 290193 non-null  object 
 9   CRS_DEP_TIME         290193 non-null  int64  
 10  DEP_TIME             290193 non-null  float64
 11  DEP_DELAY            290193 non-null  float64
 12  DEP_DEL15            290193 non-null  float64
 13  DEP_TIME_BLK         290193 non-null  object 
 14  TAXI_OUT             290193 non-null  float64
 15  WHEELS_OFF       

In [7]:
# Encoding categorical variables
flights_df_categorical = flights_df[['OP_UNIQUE_CARRIER','ORIGIN','DEST','ARR_TIME_BLK','DEP_TIME_BLK']]
categorical_df = pd.get_dummies(flights_df_categorical,drop_first=True)

In [8]:
# label enoding the target variable
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
flights_df['ARR_DELAY'] = le.fit_transform(flights_df['ARR_DELAY'].values)

In [9]:
# Dropping all categorical columns that are one-hot encoded, tail number and date columns
flights_df = flights_df.drop(['OP_UNIQUE_CARRIER','TAIL_NUM','ORIGIN','DEST','ARR_TIME_BLK','DEP_TIME_BLK','FL_DATE'],axis=1)

In [10]:
# concatenating one-hot encoded columns with other numerical columns
flight_df = pd.concat([flights_df[:10000],categorical_df[:10000]],axis=1)

In [11]:
# Splitting data into features and target variable
X = flight_df.drop(['ARR_DELAY'], axis=1)
y = flight_df['ARR_DELAY']

In [12]:
# Splitting data into test and train sets
X_train,X_test,y_train, y_test = train_test_split(X,y,random_state=100,test_size=0.20,shuffle=True)

In [13]:
# Performing linear regression to predict the arrival delay
model = LinearRegression()
lr = model.fit(X_train,y_train)
print(lr.score(X_test,y_test))

0.7093620315995197
