<center><span style="color:#b30000;font-size:35px;"><strong>Modeling Phase</strong></span></center>

<span style="color:#2929a3;font-size:20px;">Import Libraries</span>

In [619]:
import pandas as pd
import numpy as np
import plotly.express as px
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, LabelEncoder, OneHotEncoder
from category_encoders import BinaryEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import KNNImputer
from sklearn.model_selection import cross_validate
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, VotingRegressor , AdaBoostRegressor
from xgboost import XGBRegressor

<span style="color:#2929a3;font-size:20px;">Read Dataset</span>

In [620]:
## Load the cleaned dataset from its pickle file
## (pickle files should only be loaded from a trusted source)
cleaned_data_path = 'Data/df_cleaned.pkl'
df = pd.read_pickle(cleaned_data_path)

<span style="color:#2929a3;font-size:20px;">Show Sample of The Dataset </span>

In [621]:
## Display five randomly chosen rows (a different draw on every run)
df.sample(5)

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price,Journey_Day,Journey_DayName,Journey_MonthName,Arrival_Date
3072,Air India,2019-04-01,Kolkata,Banglore,CCU → BOM → HBX → BLR,16:50:00,18:20:00,1530,2,No info,7893,1,Monday,April,2019-04-02
1907,Air Asia,2019-06-01,Delhi,Cochin,DEL → BLR → COK,20:45:00,07:10:00,625,1,No info,11410,1,Saturday,June,2019-06-02
189,Jet Airways,2019-03-12,Banglore,New Delhi,BLR → BOM → DEL,20:35:00,15:15:00,1120,1,In-flight meal not included,11087,12,Tuesday,March,2019-03-13
2016,IndiGo,2019-03-01,Banglore,New Delhi,BLR → DEL,18:55:00,21:45:00,170,0,No info,14306,1,Friday,March,2019-03-01
9285,Jet Airways,2019-05-21,Kolkata,Banglore,CCU → BOM → BLR,14:05:00,08:15:00,1090,1,In-flight meal not included,9663,21,Tuesday,May,2019-05-22


<span style="color:#2929a3;font-size:20px;">Show Shape of Dataset </span>

In [622]:
## (number of rows, number of columns)
df.shape

(10169, 15)

<span style="color:#2929a3;font-size:20px;">Check Null Values </span>

In [623]:
## Count the missing values in every column
df.isna().sum()

Airline              0
Date_of_Journey      0
Source               0
Destination          0
Route                0
Dep_Time             0
Arrival_Time         0
Duration             0
Total_Stops          0
Additional_Info      0
Price                0
Journey_Day          0
Journey_DayName      0
Journey_MonthName    0
Arrival_Date         0
dtype: int64

<span style="color:#2929a3;font-size:20px;">Keep Only Important Columns </span>

In [624]:
## Keep the modelling features plus the target. `.copy()` makes the result an
## independent frame, so the column assignments made further down do not act on
## a slice of the originally loaded DataFrame (avoids SettingWithCopyWarning).
df = df[['Airline','Source','Destination','Route','Duration','Journey_Day','Journey_MonthName','Price']].copy()

<span style="color:#2929a3;font-size:20px;">Check for Duplicates </span>

In [625]:
## Number of rows that repeat an earlier row across all kept columns
df.duplicated().sum()

782

In [626]:
## Remove repeated rows, keeping the first occurrence of each
df = df.drop_duplicates()

In [627]:
## Rebuild a clean 0..n-1 index and discard the old one
df = df.reset_index(drop=True)

In [628]:
## Check Shape: (rows, columns) after selecting columns and dropping duplicates
df.shape

(9387, 8)

<span style="color:#2929a3;font-size:20px;">Convert "Journey_Day" and "Journey_MonthName" to Numeric </span>

In [629]:
## Store the day of the month as an integer column
df = df.astype({'Journey_Day': int})

In [630]:
## Replace month names with their calendar numbers. The explicit cast guarantees an
## integer column however the installed pandas version handles the dtype returned by
## `replace`, and it fails loudly if an unmapped month name is left behind.
df['Journey_MonthName'] = df['Journey_MonthName'].replace({'March':3, 'April':4, 'May':5, 'June':6}).astype(int)

### Note
#### The day and month features are converted to numeric values so that the model can capture how prices changed over the four months (March–June) covered by the data.

<span style="color:#2929a3;font-size:20px;">Calculate the Log of the "Price" </span>

In [631]:
## Distribution of the raw ticket price
price_hist = px.histogram(df, x='Price', title='Price Distribution')
price_hist

In [632]:
## Log-transform the target with one vectorised call instead of a per-row lambda.
## The column is overwritten, so re-running this cell applies the log a second time.
df['Price'] = np.log10(df['Price'])

In [633]:
## Distribution of the price after the log10 transform
log_price_hist = px.histogram(df, x='Price', title='Log of Price Distribution')
log_price_hist

<span style="color:#2929a3;font-size:20px;">Split Data into X and Y </span>

In [634]:
## Target: the (log-transformed) price; features: every other column
Y = df['Price']
X = df.drop(columns='Price')

<span style="color:#2929a3;font-size:20px;">Create Preprocessor Column Transformer</span>

In [635]:
## Binary-encode the listed columns; all remaining columns are passed through unchanged.
## NOTE(review): Journey_MonthName has already been converted to numbers before this
## step — confirm the encoder handles that column the way you intend.
encoded_columns = ['Airline','Source','Destination','Route','Journey_MonthName']
preprocessor = ColumnTransformer(
    transformers=[('Encoder', BinaryEncoder(), encoded_columns)],
    remainder='passthrough',
)

<span style="color:#2929a3;font-size:20px;">Model Selection Process</span>

In [636]:
## Candidate regressors to compare, as (display name, estimator) pairs.
## Estimators whose training involves randomness get a fixed random_state so that
## the comparison gives the same scores on every run.
Models = [
    ('Linear Regression', LinearRegression()),
    ('Lasso', Lasso()),
    ('Ridge', Ridge()),
    ('KNN', KNeighborsRegressor()),
    ('SVM', SVR()),
    ('Decision Tree', DecisionTreeRegressor(random_state=42)),
    ('Random Forest', RandomForestRegressor(random_state=42)),
    ('XGBoost', XGBRegressor(random_state=42)),
    ('Adaptive Boosting', AdaBoostRegressor(random_state=42)),
]

In [637]:
## Cross-validate every candidate inside the same preprocessing + scaling pipeline
## and print its mean R2 over the training folds and over the held-out folds
for name, estimator in Models:
    pipeline = Pipeline(steps=[
        ('Preprocessor', preprocessor),
        ('Scaler', RobustScaler()),
        (name, estimator),
    ])
    res = cross_validate(pipeline, X, Y, cv=5, scoring='r2', return_train_score=True)
    train_r2 = res['train_score'].mean()
    test_r2 = res['test_score'].mean()
    print(f"{name} Train R2-Score is {train_r2}")
    print(f"{name} Test R2-Score is {test_r2}")
    print('*' * 50)

Linear Regression Train R2-Score is 0.5724219705033403
Linear Regression Test R2-Score is 0.5622442459775847
**************************************************
Lasso Train R2-Score is 0.0
Lasso Test R2-Score is -0.001143877764228174
**************************************************
Ridge Train R2-Score is 0.5724552235005559
Ridge Test R2-Score is 0.5625157059477282
**************************************************
KNN Train R2-Score is 0.887969440237773
KNN Test R2-Score is 0.829645899978243
**************************************************
SVM Train R2-Score is 0.8408925082074417
SVM Test R2-Score is 0.8257739584068062
**************************************************
Decision Tree Train R2-Score is 0.9567787158257894
Decision Tree Test R2-Score is 0.763112292922427
**************************************************
Random Forest Train R2-Score is 0.947681256060165
Random Forest Test R2-Score is 0.8329573369685519
**************************************************
XGBoost Train R2

<span style="color:#2929a3;font-size:20px;">Create XGBoost Pipeline</span>

In [638]:
## Preprocessing -> scaling -> XGBoost; the final step is named 'Model' so its
## hyperparameters can be addressed as 'Model__<parameter>'
steps = [
    ('Preprocessor', preprocessor),
    ('Scaler', RobustScaler()),
    ('Model', XGBRegressor()),
]
pipeline = Pipeline(steps=steps)

<span style="color:#2929a3;font-size:20px;">Hyperparameter Tuning</span>

In [639]:
from sklearn.model_selection import GridSearchCV

In [640]:
## Search space for the pipeline step named 'Model'
## NOTE(review): verify that max_leaves actually takes effect with the XGBoost
## version and growth policy in use
param = [dict(
    Model__n_estimators=[135, 140, 145, 150],
    Model__max_depth=[4, 5, 10],
    Model__max_leaves=[1, 2, 3],
)]

In [641]:
## Exhaustive search over `param` with 5-fold cross-validation, scored by R2;
## training-fold scores are kept so they can be compared with the held-out scores
grid = GridSearchCV(
    estimator=pipeline,
    param_grid=param,
    scoring='r2',
    cv=5,
    return_train_score=True,
)

In [642]:
## Run the search: every parameter combination is cross-validated on (X, Y)
grid.fit(X,Y)

In [643]:
## Mean training-fold R2 of the best parameter combination
grid.cv_results_['mean_train_score'][grid.best_index_]

0.9166752795091696

In [644]:
## Mean held-out-fold R2 of the best parameter combination
grid.best_score_

0.8743749841903551

In [645]:
## Parameter combination selected by the search
grid.best_params_

{'Model__max_depth': 5, 'Model__max_leaves': 1, 'Model__n_estimators': 135}

In [646]:
## Pipeline configured with the best parameters found by the grid search
Final_Model = grid.best_estimator_

In [647]:
## `grid` was built with GridSearchCV's default refit=True, so best_estimator_ has
## already been refitted on all of (X, Y) with the best parameters. Fitting it again
## would only repeat that work, so the fitted pipeline is simply displayed here.
Final_Model

<span style="color:#2929a3;font-size:20px;">Dump the Final Model</span>

In [648]:
import joblib

In [649]:
## Persist the fitted pipeline (preprocessing + model) to disk
joblib.dump(Final_Model, 'Data/Final_Model.pkl')

['Data/Final_Model.pkl']

<span style="color:#2929a3;font-size:20px;">Dump the Inputs</span>

In [650]:
## Top-level dictionary that collects everything saved as the inputs description
Inputs_dict = dict()

In [651]:
## Append Name of columns
## NOTE(review): df still contains the target column 'Price' at this point, so it is
## part of this list — confirm the consumer of Inputs_dict expects that
Inputs_dict['inputs'] = df.columns.tolist()

In [652]:
## Store the distinct values of each categorical feature under its dictionary key
for dict_key, column_name in (('Airlines', 'Airline'), ('Source', 'Source'), ('Destination', 'Destination')):
    Inputs_dict[dict_key] = df[column_name].unique().tolist()

## Two nested dictionaries, filled in by the following cells
Inputs_dict['Source_Destination'] = {}
Inputs_dict['Destination_Route'] = {}

In [653]:
## Destinations offered for each source city (hand-written mapping)
Inputs_dict['Source_Destination'].update({
    'Chennai': ['Kolkata'],
    'Kolkata': ['Banglore'],
    'Banglore': ['Delhi','New Delhi'],
    'Delhi': ['Cochin'],
    'Mumbai': ['Hyderabad'],
})

In [654]:
## Distinct routes found in the data for each destination city
for destination in ['Kolkata', 'Banglore', 'Delhi', 'New Delhi', 'Cochin', 'Hyderabad']:
    routes_to_destination = df.loc[df['Destination'] == destination, 'Route']
    Inputs_dict['Destination_Route'][destination] = routes_to_destination.unique().tolist()

In [655]:
## Dump Inputs Dict: persist the column names and allowed category values to disk
joblib.dump(Inputs_dict, "Data/Inputs_dict.pkl")

['Data/Inputs_dict.pkl']