In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder
import seaborn as sns
%matplotlib inline
from sklearn.metrics import r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeRegressor

In [2]:
df = pd.read_excel('Airline_Train.xlsx')
df

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price
0,IndiGo,24/03/2019,Banglore,New Delhi,BLR → DEL,22:20,01:10 22 Mar,2h 50m,non-stop,No info,3897
1,Air India,1/05/2019,Kolkata,Banglore,CCU → IXR → BBI → BLR,05:50,13:15,7h 25m,2 stops,No info,7662
2,Jet Airways,9/06/2019,Delhi,Cochin,DEL → LKO → BOM → COK,09:25,04:25 10 Jun,19h,2 stops,No info,13882
3,IndiGo,12/05/2019,Kolkata,Banglore,CCU → NAG → BLR,18:05,23:30,5h 25m,1 stop,No info,6218
4,IndiGo,01/03/2019,Banglore,New Delhi,BLR → NAG → DEL,16:50,21:35,4h 45m,1 stop,No info,13302
...,...,...,...,...,...,...,...,...,...,...,...
10678,Air Asia,9/04/2019,Kolkata,Banglore,CCU → BLR,19:55,22:25,2h 30m,non-stop,No info,4107
10679,Air India,27/04/2019,Kolkata,Banglore,CCU → BLR,20:45,23:20,2h 35m,non-stop,No info,4145
10680,Jet Airways,27/04/2019,Banglore,Delhi,BLR → DEL,08:20,11:20,3h,non-stop,No info,7229
10681,Vistara,01/03/2019,Banglore,New Delhi,BLR → DEL,11:30,14:10,2h 40m,non-stop,No info,12648


In [3]:
df = df.drop(['Date_of_Journey', 'Route','Dep_Time' ,'Arrival_Time','Additional_Info'],axis=1)
df.columns

Index(['Airline', 'Source', 'Destination', 'Duration', 'Total_Stops', 'Price'], dtype='object')

In [4]:
df['Duration1'] = (pd.to_timedelta(df['Duration']).dt.seconds // 60).astype(float)
df = df.drop(['Duration'], axis=1)
df

Unnamed: 0,Airline,Source,Destination,Total_Stops,Price,Duration1
0,IndiGo,Banglore,New Delhi,non-stop,3897,170.0
1,Air India,Kolkata,Banglore,2 stops,7662,445.0
2,Jet Airways,Delhi,Cochin,2 stops,13882,1140.0
3,IndiGo,Kolkata,Banglore,1 stop,6218,325.0
4,IndiGo,Banglore,New Delhi,1 stop,13302,285.0
...,...,...,...,...,...,...
10678,Air Asia,Kolkata,Banglore,non-stop,4107,150.0
10679,Air India,Kolkata,Banglore,non-stop,4145,155.0
10680,Jet Airways,Banglore,Delhi,non-stop,7229,180.0
10681,Vistara,Banglore,New Delhi,non-stop,12648,160.0


In [5]:
df['Total_Stops1'] = df['Total_Stops'].str.replace('non-stop','0')
df['Total_Stops2'] = df['Total_Stops1'].str.replace('stops','')
df['Total_Stops3'] = df['Total_Stops2'].str.replace('stop','').astype(float)
df = df.drop(['Total_Stops2'], axis=1)
df = df.drop(['Total_Stops1'], axis=1)
df = df.drop(['Total_Stops'], axis=1)
df

Unnamed: 0,Airline,Source,Destination,Price,Duration1,Total_Stops3
0,IndiGo,Banglore,New Delhi,3897,170.0,0.0
1,Air India,Kolkata,Banglore,7662,445.0,2.0
2,Jet Airways,Delhi,Cochin,13882,1140.0,2.0
3,IndiGo,Kolkata,Banglore,6218,325.0,1.0
4,IndiGo,Banglore,New Delhi,13302,285.0,1.0
...,...,...,...,...,...,...
10678,Air Asia,Kolkata,Banglore,4107,150.0,0.0
10679,Air India,Kolkata,Banglore,4145,155.0,0.0
10680,Jet Airways,Banglore,Delhi,7229,180.0,0.0
10681,Vistara,Banglore,New Delhi,12648,160.0,0.0


In [6]:
df.dtypes
df['Price1'] = df['Price']
df = df.drop(['Price'], axis=1)
df

Unnamed: 0,Airline,Source,Destination,Duration1,Total_Stops3,Price1
0,IndiGo,Banglore,New Delhi,170.0,0.0,3897
1,Air India,Kolkata,Banglore,445.0,2.0,7662
2,Jet Airways,Delhi,Cochin,1140.0,2.0,13882
3,IndiGo,Kolkata,Banglore,325.0,1.0,6218
4,IndiGo,Banglore,New Delhi,285.0,1.0,13302
...,...,...,...,...,...,...
10678,Air Asia,Kolkata,Banglore,150.0,0.0,4107
10679,Air India,Kolkata,Banglore,155.0,0.0,4145
10680,Jet Airways,Banglore,Delhi,180.0,0.0,7229
10681,Vistara,Banglore,New Delhi,160.0,0.0,12648


In [7]:
df.shape

(10683, 6)

In [8]:
df['Airline'] = pd.get_dummies(df['Airline'])
df['Source'] = pd.get_dummies(df['Source'])
df['Destination'] = pd.get_dummies(df['Destination'])
df

Unnamed: 0,Airline,Source,Destination,Duration1,Total_Stops3,Price1
0,0,1,0,170.0,0.0,3897
1,0,0,1,445.0,2.0,7662
2,0,0,0,1140.0,2.0,13882
3,0,0,1,325.0,1.0,6218
4,0,1,0,285.0,1.0,13302
...,...,...,...,...,...,...
10678,1,0,1,150.0,0.0,4107
10679,0,0,1,155.0,0.0,4145
10680,0,1,0,180.0,0.0,7229
10681,0,1,0,160.0,0.0,12648


In [9]:
#def clean_dataset(df):
    #assert isinstance(df, pd.DataFrame), "df needs to be a pd.DataFrame"
    #df.dropna(inplace=True)
    #indices_to_keep = df.isin([np.nan, np.inf, -np.inf]).any(1)
    #return df[indices_to_keep].astype(np.float64)

In [10]:
#clean_dataset(df)

Unnamed: 0,Airline,Source,Destination,Duration1,Total_Stops3,Price1


In [11]:
Xs = df.iloc[:, 0:-1]
X_names = Xs.columns
r_vals = {}
for x_name in X_names:
    r_val = df[[x_name, 'Price1']].corr().iloc[0,1]
    print("correlation between", x_name, 'and Price1 is:', r_val)
    r_vals[x_name] = r_val

correlation between Airline and Price1 is: -0.13305036752305496
correlation between Source and Price1 is: -0.11804430525100748
correlation between Destination and Price1 is: 0.00935757519566488
correlation between Duration1 and Price1 is: 0.4496765905893939
correlation between Total_Stops3 and Price1 is: 0.6038969088067389


In [12]:
train_df, test_df = train_test_split(df, test_size=0.2)
print(len(train_df), len(test_df))

8545 2137


In [13]:
Y_train = train_df['Price1']
Xs_train = train_df.iloc[:, 0:-1]

Y_test = test_df['Price1']
Xs_test = test_df.iloc[:, 0:-1]

print(Y_train.shape, Y_test.shape)
print(Xs_train.shape, Xs_test.shape)

(8545,) (2137,)
(8545, 5) (2137, 5)


In [14]:
regression_model = LinearRegression()
regression_model.fit(Xs_train, Y_train)

LinearRegression()

In [15]:
print('a0 =', regression_model.intercept_)

for i in range(len(Xs_train.columns)):
    print('a', i+1, ' = ', regression_model.coef_[i], '\t-->\t coef. of: ', Xs_train.columns[i], sep='')

a0 = 4824.612729122279
a1 = -2147.619013660415	-->	 coef. of: Airline
a2 = 1431.8944642021295	-->	 coef. of: Source
a3 = 269.7057373163865	-->	 coef. of: Destination
a4 = 1.7794578588001346	-->	 coef. of: Duration1
a5 = 3737.9318058689005	-->	 coef. of: Total_Stops3


In [72]:
Y_pred = regression_model.predict(Xs_test)
r2_val = r2_score(Y_test, Y_pred)
print('Accuracy of the model = %.2f' % (r2_val * 100), '%', sep='')

Accuracy of the model = 41.42%


In [73]:
Y_pred

array([10071.75265867, 10052.3374653 , 10313.19476861, ...,
        9453.20334064,  5434.34775465,  5253.97115466])

In [17]:
from mlxtend.evaluate import bias_variance_decomp

In [18]:
Xs_train = Xs_train.to_numpy()
Y_train = Y_train.to_numpy()
Xs_test = Xs_test.to_numpy()
Y_test = Y_test.to_numpy()

In [19]:
total_mse, bias, var = bias_variance_decomp(regression_model, Xs_train, Y_train, Xs_test, Y_test, loss='mse', num_rounds=200, random_seed=1)
print('MSE: %.3f' % total_mse)
print('Bias: %.3f' % bias)
print('Variance: %.3f' % var)
print('Bias + Var.: %.3f' % (bias + var))

MSE: 12570981.901
Bias: 12562950.675
Variance: 8031.227
Bias + Var.: 12570981.901


In [68]:
model = DecisionTreeRegressor( max_depth=17, splitter='best')
model.fit(Xs_train, Y_train)

DecisionTreeRegressor(max_depth=17)

In [69]:
DT_accuracy = model.score(Xs_test,Y_test)
print("Accuracy = %.2f" % (DT_accuracy * 100), '%', sep='')

Accuracy = 50.51%


In [70]:
Y_test
Xs_test

array([[  0.,   0.,   1., 665.,   1.],
       [  0.,   0.,   0., 800.,   1.],
       [  0.,   0.,   1., 800.,   1.],
       ...,
       [  0.,   0.,   0., 465.,   1.],
       [  0.,   0.,   1., 150.,   0.],
       [  0.,   0.,   0., 195.,   0.]])