# Assignment 1 :  Flight Delay Forecasting

- **Machine Learning, Innopolis University (Fall semester 2021)**
- **By Mohamed Gamal Elbayoumi**

### Setup

In [1]:
import os

import pandas as pd
import numpy as np

import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
# Apply seaborn's default theme; the 'fivethirtyeight' matplotlib style is applied after it below.
sns.set()

from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn import metrics

# Plot styling: force edges on patches (e.g. bars / histogram bins) and draw them in dim gray.
plt.rcParams["patch.force_edgecolor"] = True
plt.style.use('fivethirtyeight')
mpl.rc('patch', edgecolor = 'dimgray', linewidth=1)

import warnings
# NOTE(review): this silences every warning for the rest of the notebook,
# including deprecation warnings raised by pandas / seaborn / scikit-learn.
warnings.filterwarnings('ignore')

# Render matplotlib figures inline in the notebook output.
%matplotlib inline

**Import Data**

In [None]:
# Load the flight records; the path is relative to the notebook's working directory.
df = pd.read_csv('flight_delay.csv')
# Preview the first rows to check that the file was parsed as expected.
df.head()

**Exploratory data analysis (EDA)**

In [None]:
# (number of rows, number of columns) of the dataset as loaded
df.shape

In [None]:
# Summarise the dataset: column names, dtypes, non-null counts and memory usage
df.info()

In [None]:
# Count the distinct airports on each side of a flight
n_departure_airports = len(df['Depature Airport'].unique())
n_destination_airports = len(df['Destination Airport'].unique())

print("Number of depature airports: {}".format(n_departure_airports))
print("Number of destination airports: {}".format(n_destination_airports))

In [None]:
# Count the missing (NaN) values in each column
df.isna().sum()

**Observation:** The dataset has no missing values.

In [None]:
# Count fully duplicated rows (the first occurrence of each row is not counted)
df.duplicated().sum()

In [None]:
# Remove the duplicated rows, keeping the first occurrence of each.
# The original row labels are kept (the index is not reset).
df = df.drop_duplicates()

In [None]:
# Sanity check: re-count duplicated rows after dropping them; the expected result is 0
df.duplicated().sum()

## Data preprocessing and visualization

In [None]:
# Convert the categorical airport columns into integer labels.
# NOTE(review): label encoding gives the airports an arbitrary numeric order
# that a linear model will treat as meaningful. One-hot encoding
# (OneHotEncoder / pd.get_dummies) is the usual alternative - confirm which
# one is intended for this assignment.
from sklearn.preprocessing import LabelEncoder

# Keep one fitted encoder per column so that each mapping can still be
# inspected (`classes_`) or reversed (`inverse_transform`) later on.
airport_encoders = {}
for airport_column in ['Depature Airport', 'Destination Airport']:
    airport_encoders[airport_column] = LabelEncoder()
    df[airport_column] = airport_encoders[airport_column].fit_transform(df[airport_column])

# `encoder` used to be a single instance refitted on each column in turn, so it
# ended up holding the destination mapping; keep the name bound the same way.
encoder = airport_encoders['Destination Airport']
df.head()

In [None]:
# Disabled alternative: one-hot encode the airport columns with pandas.
#df = pd.get_dummies(df, columns=['Depature Airport', 'Destination Airport'])
#df.head()

In [None]:
# Parse the two schedule columns from text into pandas datetimes
TIMESTAMP_FORMAT = '%Y-%m-%d %H:%M:%S'
for schedule_column in ('Scheduled depature time', 'Scheduled arrival time'):
    df[schedule_column] = pd.to_datetime(df[schedule_column], format=TIMESTAMP_FORMAT)

In [None]:
# New feature: scheduled flight duration in hours (arrival minus departure)
SECONDS_PER_HOUR = 3600
scheduled_span = df['Scheduled arrival time'] - df['Scheduled depature time']
flight_duration = scheduled_span.dt.total_seconds() / SECONDS_PER_HOUR
df['flight_duration'] = flight_duration

# Rescale the target by 1/60 so it is on a comparable scale to the duration
# (presumably minutes -> hours; confirm the unit of the raw 'Delay' column).
# NOTE: this overwrites 'Delay', so re-running the cell divides it again.
df['Delay'] = df['Delay'].div(60)


In [None]:
# Break both schedule timestamps into calendar / clock components.
# Resulting columns are named '<part>_dep' and '<part>_arr', departure first.
DATETIME_PARTS = ('year', 'month', 'day', 'dayofweek', 'hour', 'minute')
SCHEDULE_COLUMNS = (
    ('Scheduled depature time', 'dep'),
    ('Scheduled arrival time', 'arr'),
)

for schedule_column, suffix in SCHEDULE_COLUMNS:
    for part in DATETIME_PARTS:
        df[f'{part}_{suffix}'] = getattr(df[schedule_column].dt, part)

# The raw timestamps are no longer needed once their parts are extracted
df = df.drop(columns=['Scheduled depature time', 'Scheduled arrival time'])

df.head()

## Outlier Detection & Removal

## Building Model

In [None]:
# Chronological hold-out: train on departures up to and including 2017,
# evaluate on departures in 2018
LAST_TRAIN_YEAR = 2017
TEST_YEAR = 2018

departure_year = df['year_dep']
df_train = df.loc[departure_year <= LAST_TRAIN_YEAR]
df_test = df.loc[departure_year == TEST_YEAR]
df_train.head()

In [None]:
TARGET = 'Delay'

# Targets
y_train = df_train[TARGET]
y_test = df_test[TARGET]

# All-feature matrices: every column except the target
X_train = df_train.drop(columns=TARGET)
X_test = df_test.drop(columns=TARGET)

# Single-feature matrices: scikit-learn expects 2-D input, so the one
# 'flight_duration' column is reshaped to (n_samples, 1)
X_train_fd = df_train['flight_duration'].to_numpy().reshape(-1, 1)
X_test_fd = df_test['flight_duration'].to_numpy().reshape(-1, 1)

print(X_train.shape, y_train.shape)

**Scale features**

In [None]:
# RobustScaler was kept after the author compared it with other scalers.
from sklearn.preprocessing import RobustScaler

# Single-feature matrices (flight_duration only): fit on the training data and
# reuse the same statistics for the test data. A dedicated scaler is used so
# that it is not overwritten by the all-features fit below.
fd_scaler = RobustScaler()
X_train_fd = fd_scaler.fit_transform(X_train_fd)
X_test_fd = fd_scaler.transform(X_test_fd)

# Full feature matrices (all features): `scaler` ends up fitted on X_train,
# exactly as before. Note that X_train / X_test become NumPy arrays here.
scaler = RobustScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

### Linear Regression

**simple linear regression**

In [None]:

# Ordinary least squares on the single scaled feature (flight_duration)
regressor = LinearRegression().fit(X_train_fd, y_train)

print(f"Model intercept : {regressor.intercept_}")
print(f"Model coefficient : {regressor.coef_}")

# Predict the held-out delays and put them next to the actual values
y_pred = regressor.predict(X_test_fd)
eval_df = pd.DataFrame({'Actual': y_test}).assign(Predicted=y_pred)
eval_df.head()

In [None]:
# Model performance metrics on the test set (MSE is computed once and reused for RMSE)
mae = metrics.mean_absolute_error(y_test, y_pred)
mse = metrics.mean_squared_error(y_test, y_pred)

print('Mean Absolute Error:', mae)
print('Mean Squared Error:', mse)
print('Root Mean Squared Error:', np.sqrt(mse))

In [None]:
# In practice, this model tends to underestimate the large delays, which can be seen in the following figure:
# Predictions and ground truth are paired by position, so the original row
# labels of y_test are deliberately discarded.
tips = pd.DataFrame({
    "prediction": np.asarray(y_pred, dtype=float),
    "original_data": np.asarray(y_test, dtype=float),
})

# seaborn renamed `size` to `height` in v0.9; the old keyword only produced a
# deprecation warning (hidden by the global warning filter) and is not
# accepted as a figure-size argument by current releases.
sns.jointplot(data=tips, x="original_data", y="prediction", height=6, ratio=7)

**multiple linear regression**

In [None]:
# Ordinary least squares on the full scaled feature matrix
regressor = LinearRegression().fit(X_train, y_train)

print(f"Model intercept : {regressor.intercept_}")
print(f"Model coefficients : {regressor.coef_}")

# Predictions for the held-out year, evaluated in the following cells
y_pred = regressor.predict(X_test)

In [None]:
# Test-set error of the multiple linear regression (MSE computed once, reused for RMSE)
mae = metrics.mean_absolute_error(y_test, y_pred)
mse = metrics.mean_squared_error(y_test, y_pred)

print('Mean Absolute Error:', mae)
print('Mean Squared Error:', mse)
print('Root Mean Squared Error:', np.sqrt(mse))

In [None]:
# Compare predicted and actual delays for the multiple linear regression.
# Values are paired by position, so the original row labels of y_test are
# deliberately discarded.
tips = pd.DataFrame({
    "prediction": np.asarray(y_pred, dtype=float),
    "original_data": np.asarray(y_test, dtype=float),
})

# seaborn renamed `size` to `height` in v0.9; the old keyword only produced a
# deprecation warning (hidden by the global warning filter) and is not
# accepted as a figure-size argument by current releases.
sns.jointplot(data=tips, x="original_data", y="prediction", height=6, ratio=7)

### Polynomial regression
Now I'll extend the previous fit by using a polynomial rather than a linear function:



**simple**

In [None]:
# Cubic expansion of the single flight_duration feature
p = PolynomialFeatures(degree=3)
X_train_p = p.fit_transform(X_train_fd)

# Linear regression on the expanded features. `fit` returns the estimator
# itself, so `y_train_` is just another name for the fitted `reg`.
reg = LinearRegression().fit(X_train_p, y_train)
y_train_ = reg

# The coefficients
print('Coefficients: ', reg.coef_)
print('Intercept: ', reg.intercept_)

In [None]:
# Expand the test inputs with the transformer that was fitted on the training
# data; fit_transform here would refit the transformer on the test set.
X_test_p = p.transform(X_test_fd)
prediction = reg.predict(X_test_p)

# Test-set error of the cubic model (MSE computed once, reused for RMSE)
mse = metrics.mean_squared_error(y_test, prediction)
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, prediction))
print('Mean Squared Error:', mse)
print('Root Mean Squared Error:', np.sqrt(mse))

**Multiple features**

In [None]:
# Quadratic expansion of the full scaled feature matrix
poly = PolynomialFeatures(degree=2)
X_train_p = poly.fit_transform(X_train)

# Linear regression on the expanded features. `fit` returns the estimator
# itself, so `y_train_` is just another name for the fitted `reg`.
reg = LinearRegression().fit(X_train_p, y_train)
y_train_ = reg

# The coefficients
print('Coefficients: ', reg.coef_)
print('Intercept: ', reg.intercept_)