In [1]:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline


In [3]:
# Load the flight price dataset (path is relative to the notebook's working
# directory) and preview the first rows.
# NOTE(review): the 'Unnamed: 0' column in the result is presumably a row index
# that was written into the CSV when it was saved — confirm against the source file.
clean_data = pd.read_csv('Clean_Dataset.csv')
clean_data.head()

Unnamed: 0.1,Unnamed: 0,airline,flight,source_city,departure_time,stops,arrival_time,destination_city,class,duration,days_left,price
0,0,SpiceJet,SG-8709,Delhi,Evening,zero,Night,Mumbai,Economy,2.17,1,5953
1,1,SpiceJet,SG-8157,Delhi,Early_Morning,zero,Morning,Mumbai,Economy,2.33,1,5953
2,2,AirAsia,I5-764,Delhi,Early_Morning,zero,Early_Morning,Mumbai,Economy,2.17,1,5956
3,3,Vistara,UK-995,Delhi,Morning,zero,Afternoon,Mumbai,Economy,2.25,1,5955
4,4,Vistara,UK-963,Delhi,Morning,zero,Morning,Mumbai,Economy,2.33,1,5955


In [4]:
# Inspect column dtypes and non-null counts before any cleaning.
clean_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300153 entries, 0 to 300152
Data columns (total 12 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   Unnamed: 0        300153 non-null  int64  
 1   airline           300153 non-null  object 
 2   flight            300153 non-null  object 
 3   source_city       300153 non-null  object 
 4   departure_time    300153 non-null  object 
 5   stops             300153 non-null  object 
 6   arrival_time      300153 non-null  object 
 7   destination_city  300153 non-null  object 
 8   class             300153 non-null  object 
 9   duration          300153 non-null  float64
 10  days_left         300153 non-null  int64  
 11  price             300153 non-null  int64  
dtypes: float64(1), int64(3), object(8)
memory usage: 27.5+ MB


In [5]:
# Remove the leftover 'Unnamed: 0' column.
# Reassigning (rather than inplace=True) with errors='ignore' keeps the cell
# idempotent: running it a second time is a no-op instead of a KeyError.
clean_data = clean_data.drop(columns='Unnamed: 0', errors='ignore')

In [6]:
# Summary statistics for the numeric columns (duration, days_left, price).
clean_data.describe()

Unnamed: 0,duration,days_left,price
count,300153.0,300153.0,300153.0
mean,12.221021,26.004751,20889.660523
std,7.191997,13.561004,22697.767366
min,0.83,1.0,1105.0
25%,6.83,15.0,4783.0
50%,11.25,26.0,7425.0
75%,16.17,38.0,42521.0
max,49.83,49.0,123071.0


### Do prices vary across airlines?

In [7]:
# List the distinct airlines present in the dataset.
clean_data['airline'].unique()

array(['SpiceJet', 'AirAsia', 'Vistara', 'GO_FIRST', 'Indigo',
       'Air_India'], dtype=object)

### How is the price affected when tickets are bought just 1 or 2 days before departure?

In [10]:
# Price summary restricted to bookings with fewer than 3 days left (i.e. 1 or 2 days).
clean_data.loc[clean_data['days_left'] < 3, 'price'].describe()

count      5953.000000
mean      27421.169326
std       23236.028160
min        1977.000000
25%       11843.000000
50%       16739.000000
75%       43193.000000
max      116562.000000
Name: price, dtype: float64

### Does ticket price change based on the departure time and arrival time?

In [11]:
# Distinct departure-time buckets used in the dataset.
clean_data['departure_time'].unique()

array(['Evening', 'Early_Morning', 'Morning', 'Afternoon', 'Night',
       'Late_Night'], dtype=object)

In [12]:
# Distinct arrival-time buckets used in the dataset.
clean_data['arrival_time'].unique()

array(['Night', 'Morning', 'Early_Morning', 'Afternoon', 'Evening',
       'Late_Night'], dtype=object)

### How does the price change with the source and destination cities?

In [13]:
# Distinct source cities. Only source_city is inspected here; destination_city
# is not listed in this cell.
clean_data['source_city'].unique()

array(['Delhi', 'Mumbai', 'Bangalore', 'Kolkata', 'Hyderabad', 'Chennai'],
      dtype=object)

### How does the ticket price vary between Economy and Business class?

In [14]:
# Distinct ticket classes present in the dataset.
clean_data['class'].unique()

array(['Economy', 'Business'], dtype=object)

### Preparing data for model training

In [15]:
# Re-check the frame's columns before selecting features for the model.
clean_data.head()

Unnamed: 0,airline,flight,source_city,departure_time,stops,arrival_time,destination_city,class,duration,days_left,price
0,SpiceJet,SG-8709,Delhi,Evening,zero,Night,Mumbai,Economy,2.17,1,5953
1,SpiceJet,SG-8157,Delhi,Early_Morning,zero,Morning,Mumbai,Economy,2.33,1,5953
2,AirAsia,I5-764,Delhi,Early_Morning,zero,Early_Morning,Mumbai,Economy,2.17,1,5956
3,Vistara,UK-995,Delhi,Morning,zero,Afternoon,Mumbai,Economy,2.25,1,5955
4,Vistara,UK-963,Delhi,Morning,zero,Morning,Mumbai,Economy,2.33,1,5955


In [16]:
# Drop the 'flight' column so it is not used as a model feature.
# Reassigning with errors='ignore' (instead of inplace=True) makes the cell
# safe to re-run: a second execution no longer raises KeyError.
clean_data = clean_data.drop(columns='flight', errors='ignore')

In [17]:
# Confirm that the 'flight' column is gone.
clean_data.head()

Unnamed: 0,airline,source_city,departure_time,stops,arrival_time,destination_city,class,duration,days_left,price
0,SpiceJet,Delhi,Evening,zero,Night,Mumbai,Economy,2.17,1,5953
1,SpiceJet,Delhi,Early_Morning,zero,Morning,Mumbai,Economy,2.33,1,5953
2,AirAsia,Delhi,Early_Morning,zero,Early_Morning,Mumbai,Economy,2.17,1,5956
3,Vistara,Delhi,Morning,zero,Afternoon,Mumbai,Economy,2.25,1,5955
4,Vistara,Delhi,Morning,zero,Morning,Mumbai,Economy,2.33,1,5955


In [18]:
# One-hot encode the categorical features. The column order in this list
# determines the order of the generated indicator columns.
dummies_data = pd.get_dummies(
    clean_data[
        [
            'airline',
            'source_city',
            'departure_time',
            'stops',
            'arrival_time',
            'class',
            'destination_city',
        ]
    ]
)
dummies_data.head()

Unnamed: 0,airline_AirAsia,airline_Air_India,airline_GO_FIRST,airline_Indigo,airline_SpiceJet,airline_Vistara,source_city_Bangalore,source_city_Chennai,source_city_Delhi,source_city_Hyderabad,...,arrival_time_Morning,arrival_time_Night,class_Business,class_Economy,destination_city_Bangalore,destination_city_Chennai,destination_city_Delhi,destination_city_Hyderabad,destination_city_Kolkata,destination_city_Mumbai
0,0,0,0,0,1,0,0,0,1,0,...,0,1,0,1,0,0,0,0,0,1
1,0,0,0,0,1,0,0,0,1,0,...,1,0,0,1,0,0,0,0,0,1
2,1,0,0,0,0,0,0,0,1,0,...,0,0,0,1,0,0,0,0,0,1
3,0,0,0,0,0,1,0,0,1,0,...,0,0,0,1,0,0,0,0,0,1
4,0,0,0,0,0,1,0,0,1,0,...,1,0,0,1,0,0,0,0,0,1


In [19]:
# Replace the raw categorical columns with their one-hot encoded counterparts:
# keep the remaining columns and append the indicator columns side by side.
new_data = pd.concat(
    [
        clean_data.drop(
            columns=[
                'airline',
                'source_city',
                'departure_time',
                'stops',
                'arrival_time',
                'class',
                'destination_city',
            ]
        ),
        dummies_data,
    ],
    axis='columns',
)
new_data.head(10)

Unnamed: 0,duration,days_left,price,airline_AirAsia,airline_Air_India,airline_GO_FIRST,airline_Indigo,airline_SpiceJet,airline_Vistara,source_city_Bangalore,...,arrival_time_Morning,arrival_time_Night,class_Business,class_Economy,destination_city_Bangalore,destination_city_Chennai,destination_city_Delhi,destination_city_Hyderabad,destination_city_Kolkata,destination_city_Mumbai
0,2.17,1,5953,0,0,0,0,1,0,0,...,0,1,0,1,0,0,0,0,0,1
1,2.33,1,5953,0,0,0,0,1,0,0,...,1,0,0,1,0,0,0,0,0,1
2,2.17,1,5956,1,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,1
3,2.25,1,5955,0,0,0,0,0,1,0,...,0,0,0,1,0,0,0,0,0,1
4,2.33,1,5955,0,0,0,0,0,1,0,...,1,0,0,1,0,0,0,0,0,1
5,2.33,1,5955,0,0,0,0,0,1,0,...,0,0,0,1,0,0,0,0,0,1
6,2.08,1,6060,0,0,0,0,0,1,0,...,1,0,0,1,0,0,0,0,0,1
7,2.17,1,6060,0,0,0,0,0,1,0,...,0,0,0,1,0,0,0,0,0,1
8,2.17,1,5954,0,0,1,0,0,0,0,...,1,0,0,1,0,0,0,0,0,1
9,2.25,1,5954,0,0,1,0,0,0,0,...,0,0,0,1,0,0,0,0,0,1


In [20]:
# Separate the regression target (price) from the feature matrix.
labels = new_data['price']
data = new_data.drop(columns='price')

In [21]:
# Convert features and target to float32 NumPy arrays.
# labels is deliberately left 1-D (no reshape to a column vector).
data = np.array(data, dtype=np.float32)
labels = np.array(labels, dtype=np.float32)

In [22]:
from sklearn.model_selection import train_test_split

In [23]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    data,
    labels,
    test_size=0.2,
    shuffle=True,
    random_state=42,
)

In [24]:
# Sanity check: the training target should be a 1-D array.
y_train.shape

(240122,)

In [25]:
def display_scores(scores):
    """Print a set of scores followed by their mean and standard deviation.

    Parameters
    ----------
    scores : object exposing ``.mean()`` and ``.std()``
        Typically a NumPy array of per-fold cross-validation scores.

    Returns
    -------
    None
        The three lines are written to standard output.
    """
    print("Scores:", scores)
    average = scores.mean()
    print("Mean:", average)
    spread = scores.std()
    print("Standard Deviation:", spread)

In [26]:
def rmse(y_pred, y_actual):
    """Return the root-mean-squared error between predictions and targets.

    Parameters
    ----------
    y_pred : array-like or scalar
        Predicted values. A scalar is broadcast against ``y_actual``.
    y_actual : array-like
        Ground-truth values; its first dimension is the sample count.

    Returns
    -------
    numpy.float64
        ``sqrt(sum((y_pred - y_actual) ** 2) / n_samples)``.

    Raises
    ------
    ValueError
        If the two inputs hold a different number of elements.
    """
    y_actual = np.asarray(y_actual, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    # Without this guard a (n, 1) prediction against a (n,) target would be
    # broadcast into an n x n difference matrix and silently give a wrong score.
    if y_pred.ndim and y_pred.shape != y_actual.shape:
        if y_pred.size != y_actual.size:
            raise ValueError(
                "y_pred and y_actual must contain the same number of elements, "
                f"got shapes {y_pred.shape} and {y_actual.shape}"
            )
        y_pred = y_pred.reshape(y_actual.shape)

    squared_error = (y_pred - y_actual) ** 2
    return np.sqrt(np.sum(squared_error) / y_actual.shape[0])

## Linear Regression

In [27]:
from sklearn.linear_model import LinearRegression

# Fit a linear model on the training split; fit() returns the estimator itself,
# so construction and fitting can be chained.
lin_reg = LinearRegression().fit(X_train, y_train)
(lin_reg.intercept_, lin_reg.coef_)

(30553.332,
 array([ 4.29255562e+01, -1.30444824e+02, -1.71725391e+03, -1.60310156e+03,
        -2.51845703e+01,  4.07033691e+02,  5.86738770e+02,  2.35163232e+03,
         2.95926758e+02,  2.20585083e+02, -1.11696301e+03, -1.37950854e+03,
         1.89217651e+03,  8.77875977e+01, -7.69612061e+02,  4.05514221e+01,
        -3.23192139e+01,  7.71066895e+02,  9.69167480e+01, -1.06596985e+02,
         1.83598767e+03,  3.94123071e+03, -5.77734668e+03, -4.53267517e+02,
        -1.22139893e+03,  4.70435669e+02,  5.13846313e+02,  8.14111328e+00,
         6.82228027e+02,  2.24645508e+04, -2.24645508e+04,  3.65001221e+02,
         1.25488770e+02, -1.19823389e+03, -1.34707251e+03,  1.72312866e+03,
         3.31656982e+02], dtype=float32))

In [28]:
# Score the linear model on the held-out test split using the rmse() helper
# defined earlier in this notebook.
y_pred = lin_reg.predict(X_test)
print(rmse(y_pred, y_test))

6761.713696552156


### Linear Regression with 10-fold cross-validation

In [29]:
from sklearn.model_selection import cross_val_score

# The scorer yields negated MSE values, so the sign is flipped before taking
# the square root to obtain one RMSE per fold.
scores = cross_val_score(
    lin_reg,
    X_train,
    y_train,
    cv=10,
    scoring="neg_mean_squared_error",
)
lin_rmse_scores = np.sqrt(-scores)

In [30]:
# Per-fold RMSE of the linear model, with mean and standard deviation.
display_scores(lin_rmse_scores)

Scores: [6766.16198446 6665.00082521 6744.85433497 6707.35983827 6803.145155
 6807.79935074 6814.55765256 6738.97291878 6707.19076812 6773.08910321]
Mean: 6752.813193131779
Standard Deviation: 46.980620104147235


## Decision Tree Regressor

In [31]:
from sklearn.tree import DecisionTreeRegressor

# Seed the tree so repeated runs of the notebook give the same model and scores;
# 42 matches the seed already used for the train/test split.
tree_reg = DecisionTreeRegressor(random_state=42)
tree_reg.fit(X_train, y_train)

DecisionTreeRegressor()

In [32]:
# Decision-tree predictions for the held-out test split.
tree_pred = tree_reg.predict(X_test)

In [33]:
from sklearn.metrics import mean_squared_error
# NOTE: despite its name, tree_mse holds the ROOT mean squared error,
# because the square root is applied to the MSE here.
tree_mse = np.sqrt(mean_squared_error(y_test, tree_pred))
tree_mse

3526.260777197037

In [34]:
# 10-fold cross-validation of the decision tree; the negated-MSE scores are
# converted to per-fold RMSE before being summarised.
tree_scores = cross_val_score(
    tree_reg,
    X_train,
    y_train,
    cv=10,
    scoring="neg_mean_squared_error",
)
tree_rmse_scores = np.sqrt(-tree_scores)
display_scores(tree_rmse_scores)

Scores: [3564.1229768  3540.33511761 3403.40499306 3504.26593354 3387.17578409
 3581.48936835 3528.76208831 3379.83587881 3331.35165315 3523.36810313]
Mean: 3474.411189686574
Standard Deviation: 84.94964804110465


## Random Forest Regressor

In [35]:
from sklearn.ensemble import RandomForestRegressor

# Seed the forest so repeated runs of the notebook give the same model and
# scores; 42 matches the seed already used for the train/test split.
forest_reg = RandomForestRegressor(random_state=42)
forest_reg.fit(X_train, y_train)

RandomForestRegressor()

In [36]:
# Random-forest RMSE on the held-out test split.
forest_pred = forest_reg.predict(X_test)
np.sqrt(mean_squared_error(y_test, forest_pred))

2787.661874012399

In [None]:
# Cross-validate the random forest. This cell uses 5 folds, whereas the
# linear-regression and decision-tree cells use 10.
forest_scores = cross_val_score(
    forest_reg,
    X_train,
    y_train,
    cv=5,
    scoring="neg_mean_squared_error",
)
forest_rmse_scores = np.sqrt(-forest_scores)
display_scores(forest_rmse_scores)

In [None]:
# Default estimator score on the test split (for scikit-learn regressors this
# is the coefficient of determination, R^2).
forest_reg.score(X_test, y_test)

In [None]:
from sklearn.ensemble import VotingClassifier

In [None]:

# VotingClassifier only combines classifiers and expects class labels; the
# target here is a continuous price, so the regressors must be averaged with
# VotingRegressor instead. Imported in this cell because the notebook imports
# each scikit-learn class next to its first use.
from sklearn.ensemble import VotingRegressor

# The fitted linear model is named `lin_reg` (there is no `linear_reg`).
model = VotingRegressor(
    [
        ('Linear Regression', lin_reg),
        ('Decision Tree Regressor', tree_reg),
        ('Random Forest Regressor', forest_reg),
    ]
)
# fit() trains fresh clones of the three estimators on the training split.
model.fit(X_train, y_train)
# For a regressor, score() is the R^2 on the held-out test split.
EMscore = model.score(X_test, y_test)

In [None]:
# Predictions of the ensemble `model` for the held-out test split.
EMpredict=model.predict(X_test)