## Flight Price Prediction 

### Feature Selection and Model Building

In [1]:
#importing the libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.linear_model import Lasso
from sklearn.feature_selection import SelectFromModel

pd.set_option('display.max_columns',None)
pd.set_option('display.max_rows',None)

In [2]:
#importing dataset
df = pd.read_csv('https://raw.githubusercontent.com/Annie-Dhawan/machine-learning-projects/main/Flight_Prediction/Dataset/cleaned_data.csv')

In [3]:
df.drop('Unnamed: 0',axis=1,inplace=True)

In [4]:
df.head()

Unnamed: 0,Total_Stops,Price,Journey_Date,Journey_Month,Journey_Year,Dep_Hour,Dep_Min,Arrival_Hour,Arrival_Min,Duration_hours,Duration_mins,Air India,GoAir,IndiGo,Jet Airways,Jet Airways Business,Multiple carriers,Multiple carriers Premium economy,SpiceJet,Trujet,Vistara,Vistara Premium economy,Source_Chennai,Source_Delhi,Source_Kolkata,Source_Mumbai,Destination_Cochin,Destination_Delhi,Destination_Hyderabad,Destination_Kolkata,Destination_New Delhi
0,0,3897,24,3,2019,22,20,1,10,2,50,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
1,2,7662,1,5,2019,5,50,13,15,7,25,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
2,2,13882,9,6,2019,9,25,4,25,19,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0
3,1,6218,12,5,2019,18,5,23,30,5,25,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
4,1,13302,1,3,2019,16,50,21,35,4,45,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1


In [5]:
df.shape

(10682, 31)

In [6]:
X = df.drop('Price',axis=1)
y = df['Price'] #label

In [7]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=0)

## Feature Selection

In [8]:
model = SelectFromModel(Lasso(alpha = 0.005,random_state=0))

In [9]:
model.fit(X_train,y_train)

SelectFromModel(estimator=Lasso(alpha=0.005, random_state=0))

In [10]:
model.get_support()

array([ True,  True,  True, False,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True, False,  True,  True,  True,
        True, False,  True])

In [11]:
selected_features = X_train.columns[(model.get_support())]

In [12]:
selected_features

Index(['Total_Stops', 'Journey_Date', 'Journey_Month', 'Dep_Hour', 'Dep_Min',
       'Arrival_Hour', 'Arrival_Min', 'Duration_hours', 'Duration_mins',
       'Air India', 'GoAir', 'IndiGo', 'Jet Airways', 'Jet Airways Business',
       'Multiple carriers', 'Multiple carriers Premium economy', 'SpiceJet',
       'Trujet', 'Vistara', 'Vistara Premium economy', 'Source_Chennai',
       'Source_Delhi', 'Source_Mumbai', 'Destination_Cochin',
       'Destination_Delhi', 'Destination_Hyderabad', 'Destination_New Delhi'],
      dtype='object')

In [13]:
X_train.head(2)

Unnamed: 0,Total_Stops,Journey_Date,Journey_Month,Journey_Year,Dep_Hour,Dep_Min,Arrival_Hour,Arrival_Min,Duration_hours,Duration_mins,Air India,GoAir,IndiGo,Jet Airways,Jet Airways Business,Multiple carriers,Multiple carriers Premium economy,SpiceJet,Trujet,Vistara,Vistara Premium economy,Source_Chennai,Source_Delhi,Source_Kolkata,Source_Mumbai,Destination_Cochin,Destination_Delhi,Destination_Hyderabad,Destination_Kolkata,Destination_New Delhi
508,1,21,5,2019,20,20,18,50,22,30,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0
3071,1,24,3,2019,6,35,14,25,7,50,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0


In [24]:
X_train.drop(['Journey_Year'],axis=1,inplace=True)
X_test.drop(['Journey_Year'],axis=1,inplace=True)

## 1. Linear Regression

In [16]:
from sklearn.linear_model import LinearRegression

In [17]:
l_reg = LinearRegression()
l_reg.fit(X_train,y_train)

LinearRegression()

In [18]:
print('The coefficient of determination R^2 for train set is: {}'.format(l_reg.score(X_train,y_train)))

The coefficient of determination R^2 for train set is: 0.6322112072126629


In [25]:
print('The coefficient of determination R^2 for test set is {}'.format(l_reg.score(X_test,y_test)))

The coefficient of determination R^2 for test set is 0.5888639023134974


In [29]:
from sklearn.model_selection import cross_val_score
score = cross_val_score(l_reg,X,y,cv=5)
print('The mean cross validation score is:',score.mean())

The mean cross validation score is: 0.6199027318916419


In [30]:
#finding out the predicted values for y
y_pred = l_reg.predict(X_test)

In [34]:
from sklearn.metrics import mean_absolute_error as mae
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import r2_score

In [85]:
print('MAE: ', mae(y_test,y_pred))
print('MSE: ', mse(y_test,y_pred))
print('RMSE: ', np.sqrt(mae(y_test,y_pred)))
print('R2 Score:',r2_score(y_test,y_pred))

MAE:  2048.2726266051586
MSE:  9646558.195863314
RMSE:  45.257846022597654
R2 Score: 0.5888639023134974


MAE denotes that our model's price is 2048.2726 re off.

## 2. Ridge Regression

In [37]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV

In [38]:
ri_reg = Ridge()

In [44]:
parameters = {'alpha':[0.4,0.6,0.004,0.003,0.0004,0.07,3,0.005,0.08,0.8,0.9,0.09]}
ri_model = GridSearchCV(estimator=ri_reg,param_grid=parameters,cv=5,scoring='neg_mean_squared_error')

In [45]:
ri_model.fit(X_train,y_train)

GridSearchCV(cv=5, estimator=Ridge(),
             param_grid={'alpha': [0.4, 0.6, 0.004, 0.003, 0.0004, 0.07, 3,
                                   0.005, 0.08, 0.8, 0.9, 0.09]},
             scoring='neg_mean_squared_error')

In [46]:
print(ri_model.best_params_)
print(ri_model.best_score_)

{'alpha': 0.07}
-7697061.793377249


In [47]:
print('MSE for train set is: {}'.format(ri_model.score(X_train,y_train)))
print('MSE for train set is: {}'.format(ri_model.score(X_test,y_test)))

MSE for train set is: -7617089.372509167
MSE for train set is: -9622355.585410964


In [48]:
rid_pred = ri_model.predict(X_test)

In [84]:
print('MAE: ', mae(y_test,rid_pred))
print('MSE: ', mse(y_test,rid_pred))
print('RMSE: ', np.sqrt(mae(y_test,rid_pred)))
print('R2 Score:',r2_score(y_test,rid_pred))

MAE:  2047.7842292100258
MSE:  9622355.585410964
RMSE:  45.252449980194726
R2 Score: 0.5898954170375232


## 3. Lasso Regression

In [51]:
from sklearn.linear_model import Lasso

In [52]:
la_reg = Lasso()

In [54]:
para =  {'alpha' : [1e-15,1e-10,1e-8,1e-3,1e-2,1,5,10,20,30,35,40]}
la_model = GridSearchCV(estimator=la_reg,param_grid=para,cv=5,scoring='neg_mean_squared_error')

In [55]:
la_model.fit(X_train,y_train)

  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rn

GridSearchCV(cv=5, estimator=Lasso(),
             param_grid={'alpha': [1e-15, 1e-10, 1e-08, 0.001, 0.01, 1, 5, 10,
                                   20, 30, 35, 40]},
             scoring='neg_mean_squared_error')

In [57]:
la_model.best_params_ , la_model.best_score_

({'alpha': 0.01}, -7697430.20685411)

In [58]:
la_pred = la_model.predict(X_test)

In [83]:
print('MAE: ', mae(y_test,la_pred))
print('MSE: ',mse(y_test,la_pred))
print('RMSE: ', np.sqrt(mae(y_test,la_pred)))
print('R2 Score:',r2_score(y_test,la_pred))

MAE:  2048.27332404149
MSE:  9645996.250656985
RMSE:  45.257853727739786
R2 Score: 0.5888878524058052


## 3. Decision Tree Regressor

In [61]:
from sklearn.tree import DecisionTreeRegressor

In [65]:
dt_reg = DecisionTreeRegressor()

In [66]:
dt_reg.fit(X_train,y_train)

DecisionTreeRegressor()

In [67]:
dt_reg.score(X_train,y_train),dt_reg.score(X_test,y_test)

(0.9711683243930616, 0.6951509138655556)

In [68]:
score = cross_val_score(dt_reg,X_train,y_train,cv=5)

In [69]:
score.mean()

0.7057967560530145

In [70]:
dt_pred = dt_reg.predict(X_test)

In [82]:
print('MAE ', mae(y_test,dt_pred))
print('MSE ', mse(y_test,dt_pred))
print('RMSE ', np.sqrt(mae(y_test,dt_pred)))
print('R2 Score:',r2_score(y_test,dt_pred))

MAE  1435.194423646857
MSE  7152727.446943925
RMSE  37.88395997842434
R2 Score: 0.6951509138655556


## 4. Random Forest Classifier

In [73]:
from sklearn.ensemble import RandomForestClassifier

In [74]:
ref_reg = RandomForestClassifier()

In [75]:
ref_reg.fit(X_train,y_train)

RandomForestClassifier()

In [77]:
ref_reg.score(X_train,y_train),ref_reg.score(X_test,y_test)

(0.8835576360444705, 0.3420683200748713)

In [78]:
ref_pred = ref_reg.predict(X_test)

In [81]:
print('MAE ', mae(y_test,ref_pred))
print('MSE ', mse(y_test,ref_pred))
print('RMSE ', np.sqrt(mae(y_test,ref_pred)))
print('R2 Score:',r2_score(y_test,ref_pred))

MAE  1582.7983153954142
MSE  9979095.568554047
RMSE  39.78439788906468
R2 Score: 0.5746911668189228


In [86]:
prediction = pd.DataFrame(dt_pred, columns=['predictions']).to_excel('Test_set.xlsx')