In [29]:
import pandas as pd
import numpy as np
import seaborn as sns
import re
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression,Ridge,Lasso,ElasticNet,SGDRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import PolynomialFeatures,OneHotEncoder,StandardScaler

In [5]:
train_data = pd.read_excel('Data_Train.xlsx')
test_data = pd.read_excel('Test_set.xlsx')
train_data.head()

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price
0,IndiGo,24/03/2019,Banglore,New Delhi,BLR → DEL,22:20,01:10 22 Mar,2h 50m,non-stop,No info,3897
1,Air India,1/05/2019,Kolkata,Banglore,CCU → IXR → BBI → BLR,05:50,13:15,7h 25m,2 stops,No info,7662
2,Jet Airways,9/06/2019,Delhi,Cochin,DEL → LKO → BOM → COK,09:25,04:25 10 Jun,19h,2 stops,No info,13882
3,IndiGo,12/05/2019,Kolkata,Banglore,CCU → NAG → BLR,18:05,23:30,5h 25m,1 stop,No info,6218
4,IndiGo,01/03/2019,Banglore,New Delhi,BLR → NAG → DEL,16:50,21:35,4h 45m,1 stop,No info,13302


In [6]:
test_data.head()

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info
0,Jet Airways,6/06/2019,Delhi,Cochin,DEL → BOM → COK,17:30,04:25 07 Jun,10h 55m,1 stop,No info
1,IndiGo,12/05/2019,Kolkata,Banglore,CCU → MAA → BLR,06:20,10:20,4h,1 stop,No info
2,Jet Airways,21/05/2019,Delhi,Cochin,DEL → BOM → COK,19:15,19:00 22 May,23h 45m,1 stop,In-flight meal not included
3,Multiple carriers,21/05/2019,Delhi,Cochin,DEL → BOM → COK,08:00,21:00,13h,1 stop,No info
4,Air Asia,24/06/2019,Banglore,Delhi,BLR → DEL,23:55,02:45 25 Jun,2h 50m,non-stop,No info


In [7]:
test_data.isnull().sum()

Airline            0
Date_of_Journey    0
Source             0
Destination        0
Route              0
Dep_Time           0
Arrival_Time       0
Duration           0
Total_Stops        0
Additional_Info    0
dtype: int64

In [8]:
# imputing most frequent value
train_data['Total_Stops']=train_data['Total_Stops'].fillna('1 stop')

In [9]:
# making non stop to 0 stop so further we can use it as integer after splitting it
train_data['Total_Stops']=train_data['Total_Stops'].replace('non-stop','0 stop')
test_data['Total_Stops']=test_data['Total_Stops'].replace('non-stop','0 stop')

In [10]:
train_data['Total_Stop'] = train_data['Total_Stops'].str.split(' ').str[0]
train_data['Total_Stop'] = train_data['Total_Stop'].astype(int)
test_data['Total_Stop'] = test_data['Total_Stops'].str.split(' ').str[0]
test_data['Total_Stop'] = test_data['Total_Stop'].astype(int)

In [12]:
# imputing most frequent value
train_data['Route'] = train_data['Route'].fillna('DEL → BOM → COK')

In [13]:
train_data['Date_of_Journey'] = pd.to_datetime(train_data['Date_of_Journey'])
train_data['Month_of_Journey'] = train_data['Date_of_Journey'].dt.month
train_data['Day_of_Journey'] = train_data['Date_of_Journey'].dt.day
test_data['Date_of_Journey'] = pd.to_datetime(test_data['Date_of_Journey'])
test_data['Month_of_Journey'] = test_data['Date_of_Journey'].dt.month
test_data['Day_of_Journey'] = test_data['Date_of_Journey'].dt.day

In [14]:
def duration_minutes(a):
    a = a.split()
    a[0] = re.sub("\D", "", a[0])
    a[0] = int(a[0])*60
    if len(a)==1:
        a.append('0')
    a[1] = re.sub("\D",'', a[1])
    return a[0]+int(a[1])

In [15]:
train_data['Duration'] = train_data['Duration'].apply(duration_minutes)       
test_data['Duration'] = test_data['Duration'].apply(duration_minutes)       

In [16]:
# dept and arrival hours and minutes
train_data['Dep_Time'] = pd.to_datetime(train_data['Dep_Time'])
train_data['Arrival_Time'] = pd.to_datetime(train_data['Arrival_Time'])
train_data['Dep_Time_hours'] = train_data['Dep_Time'].dt.hour
train_data['Dep_Time_minutes'] = train_data['Dep_Time'].dt.minute
train_data['Arr_Time_hours'] = train_data['Arrival_Time'].dt.hour
train_data['Arr_Time_minutes'] = train_data['Arrival_Time'].dt.minute

test_data['Dep_Time'] = pd.to_datetime(test_data['Dep_Time'])
test_data['Arrival_Time'] = pd.to_datetime(test_data['Arrival_Time'])
test_data['Dep_Time_hours'] = test_data['Dep_Time'].dt.hour
test_data['Dep_Time_minutes'] = test_data['Dep_Time'].dt.minute
test_data['Arr_Time_hours'] = test_data['Arrival_Time'].dt.hour
test_data['Arr_Time_minutes'] = test_data['Arrival_Time'].dt.minute

In [17]:
train_data.head()

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price,Total_Stop,Month_of_Journey,Day_of_Journey,Dep_Time_hours,Dep_Time_minutes,Arr_Time_hours,Arr_Time_minutes
0,IndiGo,2019-03-24,Banglore,New Delhi,BLR → DEL,2020-08-20 22:20:00,2020-03-22 01:10:00,170,0 stop,No info,3897,0,3,24,22,20,1,10
1,Air India,2019-01-05,Kolkata,Banglore,CCU → IXR → BBI → BLR,2020-08-20 05:50:00,2020-08-20 13:15:00,445,2 stops,No info,7662,2,1,5,5,50,13,15
2,Jet Airways,2019-09-06,Delhi,Cochin,DEL → LKO → BOM → COK,2020-08-20 09:25:00,2020-06-10 04:25:00,1140,2 stops,No info,13882,2,9,6,9,25,4,25
3,IndiGo,2019-12-05,Kolkata,Banglore,CCU → NAG → BLR,2020-08-20 18:05:00,2020-08-20 23:30:00,325,1 stop,No info,6218,1,12,5,18,5,23,30
4,IndiGo,2019-01-03,Banglore,New Delhi,BLR → NAG → DEL,2020-08-20 16:50:00,2020-08-20 21:35:00,285,1 stop,No info,13302,1,1,3,16,50,21,35


In [18]:
test_data.head()

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Total_Stop,Month_of_Journey,Day_of_Journey,Dep_Time_hours,Dep_Time_minutes,Arr_Time_hours,Arr_Time_minutes
0,Jet Airways,2019-06-06,Delhi,Cochin,DEL → BOM → COK,2020-08-20 17:30:00,2020-06-07 04:25:00,655,1 stop,No info,1,6,6,17,30,4,25
1,IndiGo,2019-12-05,Kolkata,Banglore,CCU → MAA → BLR,2020-08-20 06:20:00,2020-08-20 10:20:00,240,1 stop,No info,1,12,5,6,20,10,20
2,Jet Airways,2019-05-21,Delhi,Cochin,DEL → BOM → COK,2020-08-20 19:15:00,2020-05-22 19:00:00,1425,1 stop,In-flight meal not included,1,5,21,19,15,19,0
3,Multiple carriers,2019-05-21,Delhi,Cochin,DEL → BOM → COK,2020-08-20 08:00:00,2020-08-20 21:00:00,780,1 stop,No info,1,5,21,8,0,21,0
4,Air Asia,2019-06-24,Banglore,Delhi,BLR → DEL,2020-08-20 23:55:00,2020-06-25 02:45:00,170,0 stop,No info,0,6,24,23,55,2,45


In [19]:
train_data.drop(columns=['Date_of_Journey','Dep_Time','Arrival_Time','Total_Stops'],inplace=True)
test_data.drop(columns=['Date_of_Journey','Dep_Time','Arrival_Time','Total_Stops'],inplace=True)

In [20]:
train_data.head()

Unnamed: 0,Airline,Source,Destination,Route,Duration,Additional_Info,Price,Total_Stop,Month_of_Journey,Day_of_Journey,Dep_Time_hours,Dep_Time_minutes,Arr_Time_hours,Arr_Time_minutes
0,IndiGo,Banglore,New Delhi,BLR → DEL,170,No info,3897,0,3,24,22,20,1,10
1,Air India,Kolkata,Banglore,CCU → IXR → BBI → BLR,445,No info,7662,2,1,5,5,50,13,15
2,Jet Airways,Delhi,Cochin,DEL → LKO → BOM → COK,1140,No info,13882,2,9,6,9,25,4,25
3,IndiGo,Kolkata,Banglore,CCU → NAG → BLR,325,No info,6218,1,12,5,18,5,23,30
4,IndiGo,Banglore,New Delhi,BLR → NAG → DEL,285,No info,13302,1,1,3,16,50,21,35


In [21]:
test_data.head()

Unnamed: 0,Airline,Source,Destination,Route,Duration,Additional_Info,Total_Stop,Month_of_Journey,Day_of_Journey,Dep_Time_hours,Dep_Time_minutes,Arr_Time_hours,Arr_Time_minutes
0,Jet Airways,Delhi,Cochin,DEL → BOM → COK,655,No info,1,6,6,17,30,4,25
1,IndiGo,Kolkata,Banglore,CCU → MAA → BLR,240,No info,1,12,5,6,20,10,20
2,Jet Airways,Delhi,Cochin,DEL → BOM → COK,1425,In-flight meal not included,1,5,21,19,15,19,0
3,Multiple carriers,Delhi,Cochin,DEL → BOM → COK,780,No info,1,5,21,8,0,21,0
4,Air Asia,Banglore,Delhi,BLR → DEL,170,No info,0,6,24,23,55,2,45


In [22]:
y = train_data['Price']
y

0         3897
1         7662
2        13882
3         6218
4        13302
5         3873
6        11087
7        22270
8        11087
9         8625
10        8907
11        4174
12        4667
13        9663
14        4804
15       14011
16        5830
17       10262
18       13381
19       12898
20       19495
21        6955
22        3943
23        4823
24        7757
25       13292
26        8238
27        7682
28        4668
29        3687
         ...  
10653    14388
10654     4319
10655     5678
10656     5613
10657     9663
10658     5769
10659     4668
10660     4878
10661     8372
10662    12352
10663    11733
10664     4823
10665     4804
10666    10262
10667    11087
10668    21219
10669     9929
10670    11411
10671     3100
10672    11150
10673    16704
10674    11087
10675     3100
10676     9794
10677     3257
10678     4107
10679     4145
10680     7229
10681    12648
10682    11753
Name: Price, Length: 10683, dtype: int64

In [23]:
train_data.drop('Price',axis=1,inplace=True)

In [24]:
ohe = OneHotEncoder(handle_unknown='ignore')


In [25]:
X_cat_train = train_data[['Airline','Source','Destination','Route','Additional_Info']]
X_cat_test = test_data[['Airline','Source','Destination','Route','Additional_Info']]

In [26]:
X_numeric_train = train_data[['Duration','Total_Stop','Month_of_Journey','Day_of_Journey','Dep_Time_hours','Dep_Time_minutes','Arr_Time_hours','Arr_Time_minutes']]
X_numeric_test = test_data[['Duration','Total_Stop','Month_of_Journey','Day_of_Journey','Dep_Time_hours','Dep_Time_minutes','Arr_Time_hours','Arr_Time_minutes']]

In [27]:
# one hot encoded features of categorical data
train_cat_ohe = ohe.fit_transform(X_cat_train).toarray()
test_cat_ohe = ohe.transform(X_cat_test).toarray()

In [30]:
# standardize numeric dataframe
std = StandardScaler()
train_numeric_std = std.fit_transform(X_numeric_train)
test_numeric_std = std.transform(X_numeric_test)

In [31]:
# merge numerical features and one hot encoded feature
X_train_final = np.hstack((train_cat_ohe,train_numeric_std))
X_test_final = np.hstack((test_cat_ohe,test_numeric_std))

In [32]:
print('Shape of final Trian ',X_train_final.shape)
print('Shape of final Test ',X_test_final.shape)

Shape of final Trian  (10683, 169)
Shape of final Test  (2671, 169)


# Ridge Regression

In [34]:
param_grid = {'alpha':[0.001,0.01,0.1,1,10,100]}
ridgemodel = Ridge()
ridge_regressor = GridSearchCV(ridgemodel,param_grid)
ridge_regressor.fit(X_train_final,y)
predict_ridgemodel_y = ridge_regressor.predict(X_test_final)
print('Train_score',ridge_regressor.score(X_train_final,y))
print('best_parameters',ridge_regressor.best_params_)

Train_score 0.7479791410275867
best_parameters {'alpha': 0.1}


# Lasso Regression

In [36]:
param_grid = {'alpha':[0.001,0.01,0.1,1,10,100]}
lassomodel = Lasso()
lasso_regressor = GridSearchCV(lassomodel,param_grid)
lasso_regressor.fit(X_train_final,y)
print('Train_score',lasso_regressor.score(X_train_final,y))
print('best_parameters',lasso_regressor.best_params_)

  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)


Train_score 0.7478114424061106
best_parameters {'alpha': 0.1}


  positive)


# KNN 

In [37]:
param_grid = {'n_neighbors':[5,10,15,25,20]}
knnmodel = KNeighborsRegressor()
knn_regressor = GridSearchCV(knnmodel,param_grid)
knn_regressor.fit(X_train_final,y)
predict_knnmodel_y = knn_regressor.predict(X_test_final)
print('Train_score',knn_regressor.score(X_train_final,y))
print('best_parameters',knn_regressor.best_params_)

Train_score 0.8741713043251492
best_parameters {'n_neighbors': 5}


In [38]:
type(predict_knnmodel_y)

numpy.ndarray

In [46]:
np.savetxt('predict_text1.csv',predict_knnmodel_y, delimiter=' ', newline='\n', header='Price', footer='', comments='# ', encoding=None)

In [44]:
predict_knnmodel_y

array([15848. ,  5781.4, 11738.6, ..., 15092. , 11836.8,  8443.6])

In [47]:
param_grid = {'alpha':[0.0001,0.001,0.01,0.1,1],
             'epsilon':[0.01,0.1,1]}
sgdmodel = SGDRegressor()
sgd_regressor = GridSearchCV(sgdmodel,param_grid)
sgd_regressor.fit(X_train_final,y)
predict_knnmodel_y = knn_regressor.predict(X_test_final)
print('Train_score',sgd_regressor.score(X_train_final,y))
print('best_parameters',sgd_regressor.best_params_)

Train_score 0.7307961237714963
best_parameters {'alpha': 0.0001, 'epsilon': 0.01}
