In [105]:
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
import warnings
import os
import time
from sklearn.metrics import mean_squared_error
import gc
from pandas.core.common import SettingWithCopyWarning
warnings.filterwarnings("ignore", category=DeprecationWarning)
pd.options.mode.chained_assignment = None 
gc.enable()
%matplotlib inline

In [80]:
#Import data
train = pd.read_excel('E://02.Analytics Vidhya//Flight_Ticket_Participant_Datasets//Data_train.xlsx')
train.head(5)
test = pd.read_excel('E://02.Analytics Vidhya//Flight_Ticket_Participant_Datasets//Test_set.xlsx')
test.head(5)

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info
0,Jet Airways,6/06/2019,Delhi,Cochin,DEL → BOM → COK,17:30,04:25 07 Jun,10h 55m,1 stop,No info
1,IndiGo,12/05/2019,Kolkata,Banglore,CCU → MAA → BLR,06:20,10:20,4h,1 stop,No info
2,Jet Airways,21/05/2019,Delhi,Cochin,DEL → BOM → COK,19:15,19:00 22 May,23h 45m,1 stop,In-flight meal not included
3,Multiple carriers,21/05/2019,Delhi,Cochin,DEL → BOM → COK,08:00,21:00,13h,1 stop,No info
4,Air Asia,24/06/2019,Banglore,Delhi,BLR → DEL,23:55,02:45 25 Jun,2h 50m,non-stop,No info


In [81]:
#Removing data with null values
train = train.dropna()
train.isnull().sum()

Airline            0
Date_of_Journey    0
Source             0
Destination        0
Route              0
Dep_Time           0
Arrival_Time       0
Duration           0
Total_Stops        0
Additional_Info    0
Price              0
dtype: int64

In [82]:
train['Source'].value_counts()

Delhi       4536
Kolkata     2871
Banglore    2197
Mumbai       697
Chennai      381
Name: Source, dtype: int64

In [83]:
#Breaking down date into day, month and dropping date feature.
for df in [train, test]:
    df['Date'] = pd.to_datetime(df['Date_of_Journey'])
    df['Day of week'] = df['Date'].dt.dayofweek
    df['Day of month'] = df['Date'].dt.day
    df['Date month'] = df['Date'].dt.month
    df.drop(labels = 'Date_of_Journey', axis = 1, inplace= True)

In [84]:
train.head()

Unnamed: 0,Airline,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price,Date,Day of week,Day of month,Date month
0,IndiGo,Banglore,New Delhi,BLR → DEL,22:20,01:10 22 Mar,2h 50m,non-stop,No info,3897,2019-03-24,6,24,3
1,Air India,Kolkata,Banglore,CCU → IXR → BBI → BLR,05:50,13:15,7h 25m,2 stops,No info,7662,2019-01-05,5,5,1
2,Jet Airways,Delhi,Cochin,DEL → LKO → BOM → COK,09:25,04:25 10 Jun,19h,2 stops,No info,13882,2019-09-06,4,6,9
3,IndiGo,Kolkata,Banglore,CCU → NAG → BLR,18:05,23:30,5h 25m,1 stop,No info,6218,2019-12-05,3,5,12
4,IndiGo,Banglore,New Delhi,BLR → NAG → DEL,16:50,21:35,4h 45m,1 stop,No info,13302,2019-01-03,3,3,1


In [85]:
#To identify which part of the day, flight has departured.

def departure(x):
    x = x.strip()
    #print(x)
    time = (int)(x.split(':')[0])
    if ((time >= 4) & (time < 8)):
        r = 'Early morning'
    elif ((time >= 8) & (time < 12)):
        r = 'Morning'
    elif ((time >= 12) & (time < 16)):
        r = 'Noon'
    elif ((time >= 16) & (time < 20)):
        r = 'Evening'
    elif (time >= 20):
        r = 'Night'
    elif ((time >= 0) & (time < 4)):
        r = 'Mid Night'
    
    return r

for df in [train, test]:
    df['Dep of day'] = df['Dep_Time'].apply(departure)
    df.drop(labels = 'Arrival_Time', axis = 1, inplace = True)
    
    

In [86]:
#Convert total travel duration into minutes
def duration(x):
    
    a = x.strip()
    t = x.split(' ')
    h = t[0]
    hrs = (int)(h[:-1])*60
    #print(hrs)
    if(len(t)==2):
        m = t[1]
        mins = (int)(m[:-1])
        hrs = hrs + mins
    val = str(hrs)
    
    return val

for df in [train, test]:
    df['Total Travel time'] = df['Duration'].apply(duration)
    df.drop(labels = 'Duration', axis = 1, inplace = True)

In [87]:
train['Total_Stops'].value_counts()

1 stop      5625
non-stop    3491
2 stops     1520
3 stops       45
4 stops        1
Name: Total_Stops, dtype: int64

In [88]:
for df in [train, test]:
    for i in range(df.shape[0]):
        if(df.iloc[i]['Additional_Info'] == 'No Info'):
            df.iloc[i]['Additional_Info'] = 'No info'

In [89]:
train['Additional_Info'].value_counts()

No info                         8344
In-flight meal not included     1982
No check-in baggage included     320
1 Long layover                    19
Change airports                    7
Business class                     4
No Info                            3
2 Long layover                     1
Red-eye flight                     1
1 Short layover                    1
Name: Additional_Info, dtype: int64

In [91]:
for df in [train, test]:
    df.drop(labels = 'Route', axis = 1, inplace = True)
    df.drop(labels = 'Date', axis = 1, inplace = True)
    df.drop(labels = 'Dep_Time', axis = 1, inplace =True)
    df['Total Travel time'] = df['Total Travel time'].astype(int)

In [92]:
train.head()

Unnamed: 0,Airline,Source,Destination,Total_Stops,Additional_Info,Price,Day of week,Day of month,Date month,Dep of day,Total Travel time
0,IndiGo,Banglore,New Delhi,non-stop,No info,3897,6,24,3,Night,170
1,Air India,Kolkata,Banglore,2 stops,No info,7662,5,5,1,Early morning,445
2,Jet Airways,Delhi,Cochin,2 stops,No info,13882,4,6,9,Morning,1140
3,IndiGo,Kolkata,Banglore,1 stop,No info,6218,3,5,12,Evening,325
4,IndiGo,Banglore,New Delhi,1 stop,No info,13302,3,3,1,Evening,285


In [93]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10682 entries, 0 to 10682
Data columns (total 11 columns):
Airline              10682 non-null object
Source               10682 non-null object
Destination          10682 non-null object
Total_Stops          10682 non-null object
Additional_Info      10682 non-null object
Price                10682 non-null int64
Day of week          10682 non-null int64
Day of month         10682 non-null int64
Date month           10682 non-null int64
Dep of day           10682 non-null object
Total Travel time    10682 non-null int32
dtypes: int32(1), int64(4), object(6)
memory usage: 959.7+ KB


In [94]:
#excluded features

excluded = ['Price ' ]

categorical_features = [f for f in train.columns
                       if (f not in excluded) & (train[f].dtype == 'object')]

In [95]:
#Factorize categorical features

for f in categorical_features:
    train[f], indexer = pd.factorize(train[f])
    test[f] = indexer.get_indexer(test[f])

In [96]:
train.head()

Unnamed: 0,Airline,Source,Destination,Total_Stops,Additional_Info,Price,Day of week,Day of month,Date month,Dep of day,Total Travel time
0,0,0,0,0,0,3897,6,24,3,0,170
1,1,1,1,1,0,7662,5,5,1,1,445
2,2,2,2,1,0,13882,4,6,9,2,1140
3,0,1,1,2,0,6218,3,5,12,3,325
4,0,0,0,2,0,13302,3,3,1,3,285


In [98]:
Y = train['Price']
train.drop(labels = 'Price', axis = 1, inplace = True)

KeyError: 'Price'

In [99]:
X = train

In [101]:
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X,Y,test_size = 0.20,random_state = 1000)

In [110]:
from sklearn.ensemble import RandomForestRegressor
rr=RandomForestRegressor(n_estimators=400,random_state=0)
rr.fit(X_train,Y_train)

y_pred=rr.predict(X_test)

def rmsle(Y,YH):
  sum=0
  for y,yh in zip(Y,YH):
    sum+=(np.log(y)-np.log(yh))**2
  return (sum/(Y.shape[0]))**0.5

x = 1-rmsle(Y_test,y_pred)

In [104]:
yy_pred=rr.predict(test)
yy_pred=yy_pred.astype(int)

submission=pd.DataFrame({'Price':yy_pred})

submission.to_excel("Flight_Price_Pred.xlsx",index=False)

In [125]:
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.metrics import mean_squared_error

n_folds = 5

def rmsle_cv(model):
    kf = KFold(n_folds, shuffle=True, random_state=42).get_n_splits(train.values)
    rmse= np.sqrt(-cross_val_score(model, X.values, Y, scoring="neg_mean_squared_error", cv = kf))
    return(rmse)

In [117]:
#Linear model training and prediction
from sklearn.linear_model import ElasticNet, Lasso,  BayesianRidge, LassoLarsIC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler

#Lasso

lasso = make_pipeline(RobustScaler(), Lasso(alpha  = 0.0005, random_state = 1))

Enet = make_pipeline(RobustScaler(), ElasticNet(alpha = 0.0005, l1_ratio = 0.9, random_state = 3))

In [127]:
l_score = rmsle_cv(lasso)
print(l_score.mean())

3634.0521535864864


In [129]:
enet_score = rmsle_cv(Enet)
print(enet_score.mean())

3634.0508587136524
