In [2432]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split, GridSearchCV , cross_val_score,RandomizedSearchCV
from lazypredict.Supervised import LazyRegressor
from sklearn.linear_model import LinearRegression, ElasticNet, Lasso, Ridge, HuberRegressor, LogisticRegression, BayesianRidge
from sklearn.tree import DecisionTreeRegressor, ExtraTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor, VotingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error

In [2433]:
df = pd.read_csv('Final.csv')
df.shape

(9650, 21)

In [2434]:
df.head(2)

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,...,Day,Month,Year,Dep_Hr,Dep_Min,Arr_Hr,Arr_Min,Duration_Hr,Duration_Min,Duration_bool
0,IndiGo,24/03/2019,Banglore,New Delhi,BLR → DEL,22:20,01:10,2:50,0,No Info,...,24,3,2019,22,20,1,10,2,50.0,170.0
1,Air India,1/05/2019,Kolkata,Banglore,CCU → IXR → BBI → BLR,05:50,13:15,7:25,2,No Info,...,1,5,2019,5,50,13,15,7,25.0,445.0


In [2435]:
df.columns

Index(['Airline', 'Date_of_Journey', 'Source', 'Destination', 'Route',
       'Dep_Time', 'Arrival_Time', 'Duration', 'Total_Stops',
       'Additional_Info', 'Price', 'Day', 'Month', 'Year', 'Dep_Hr', 'Dep_Min',
       'Arr_Hr', 'Arr_Min', 'Duration_Hr', 'Duration_Min', 'Duration_bool'],
      dtype='object')

In [2436]:
df1 = df[['Airline', 'Source', 'Destination', 'Total_Stops',
         'Additional_Info', 'Price', 'Day', 'Month', 'Duration_bool']]



In [2437]:

df1.head()

Unnamed: 0,Airline,Source,Destination,Total_Stops,Additional_Info,Price,Day,Month,Duration_bool
0,IndiGo,Banglore,New Delhi,0,No Info,3897,24,3,170.0
1,Air India,Kolkata,Banglore,2,No Info,7662,1,5,445.0
2,IndiGo,Kolkata,Banglore,1,No Info,6218,12,5,325.0
3,IndiGo,Banglore,New Delhi,1,No Info,13302,1,3,285.0
4,SpiceJet,Kolkata,Banglore,0,No Info,3873,24,6,145.0


In [2438]:
df1

Unnamed: 0,Airline,Source,Destination,Total_Stops,Additional_Info,Price,Day,Month,Duration_bool
0,IndiGo,Banglore,New Delhi,0,No Info,3897,24,3,170.00
1,Air India,Kolkata,Banglore,2,No Info,7662,1,5,445.00
2,IndiGo,Kolkata,Banglore,1,No Info,6218,12,5,325.00
3,IndiGo,Banglore,New Delhi,1,No Info,13302,1,3,285.00
4,SpiceJet,Kolkata,Banglore,0,No Info,3873,24,6,145.00
...,...,...,...,...,...,...,...,...,...
9645,SpiceJet,Banglore,Delhi,0,No check-in baggage included,3257,21,5,160.00
9646,Air Asia,Kolkata,Banglore,0,No Info,4107,9,4,150.00
9647,Air India,Kolkata,Banglore,0,No Info,4145,27,4,155.00
9648,Vistara,Banglore,New Delhi,0,No Info,12648,1,3,160.00


In [2439]:
df1 = df1.rename(columns={'Duration_bool': 'Duration'})

In [2440]:
df1.columns

Index(['Airline', 'Source', 'Destination', 'Total_Stops', 'Additional_Info',
       'Price', 'Day', 'Month', 'Duration'],
      dtype='object')

In [2441]:
df1.isnull().any().any()

False

In [2442]:
df1['Additional_Info'].unique()

array(['No Info', 'In-flight meal not included',
       'No check-in baggage included', '1 Short layover',
       '1 Long layover', 'Change airports', 'Business class',
       'Red-eye flight', '2 Long layover'], dtype=object)

In [2443]:
df1['Additional_Info'] = df1['Additional_Info'].map({'1 Long layover':0,
                                                     '2 Long layover':1,
                                                     'Business class':2,
                                                     'Change airports':3,
                                                     'In-flight meal not included':4,
                                                     'No check-in baggage included':5,
                                                     'No Info':6,
                                                     'Red-eye flight':7,
                                                     '1 Short layover':8,
                                                     }
)                                                    
                                                     
                                                     
                                                     
                                                     
                                                     
                

In [2444]:
df1['Additional_Info'].unique()

array([6, 4, 5, 8, 0, 3, 2, 7, 1], dtype=int64)

In [2445]:
dummies = pd.get_dummies(df1[['Airline', 'Source', 'Destination']])

In [2446]:
df2 = pd.concat([df1,dummies], axis=1)
df2.shape


(9650, 32)

In [2447]:
df2 = df2.drop(['Airline', 'Source', 'Destination'], axis=1)
df2.shape

(9650, 29)

In [2448]:
df2.columns

Index(['Total_Stops', 'Additional_Info', 'Price', 'Day', 'Month', 'Duration',
       'Airline_Air Asia', 'Airline_Air India', 'Airline_GoAir',
       'Airline_IndiGo', 'Airline_Jet Airways', 'Airline_Jet Airways Business',
       'Airline_Multiple carriers',
       'Airline_Multiple carriers Premium economy', 'Airline_SpiceJet',
       'Airline_Trujet', 'Airline_Vistara', 'Airline_Vistara Premium economy',
       'Source_Banglore', 'Source_Chennai', 'Source_Delhi', 'Source_Kolkata',
       'Source_Mumbai', 'Destination_Banglore', 'Destination_Cochin',
       'Destination_Delhi', 'Destination_Hyderabad', 'Destination_Kolkata',
       'Destination_New Delhi'],
      dtype='object')

In [2449]:
df2.isnull().any().any()

False

In [2450]:
df2.shape

(9650, 29)

In [2451]:
df2.dropna()

Unnamed: 0,Total_Stops,Additional_Info,Price,Day,Month,Duration,Airline_Air Asia,Airline_Air India,Airline_GoAir,Airline_IndiGo,...,Source_Chennai,Source_Delhi,Source_Kolkata,Source_Mumbai,Destination_Banglore,Destination_Cochin,Destination_Delhi,Destination_Hyderabad,Destination_Kolkata,Destination_New Delhi
0,0,6,3897,24,3,170.00,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1
1,2,6,7662,1,5,445.00,0,1,0,0,...,0,0,1,0,1,0,0,0,0,0
2,1,6,6218,12,5,325.00,0,0,0,1,...,0,0,1,0,1,0,0,0,0,0
3,1,6,13302,1,3,285.00,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1
4,0,6,3873,24,6,145.00,0,0,0,0,...,0,0,1,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9645,0,5,3257,21,5,160.00,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
9646,0,6,4107,9,4,150.00,1,0,0,0,...,0,0,1,0,1,0,0,0,0,0
9647,0,6,4145,27,4,155.00,0,1,0,0,...,0,0,1,0,1,0,0,0,0,0
9648,0,6,12648,1,3,160.00,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [2452]:
X=df2.drop('Price',axis=1)
X.dropna()
X.shape

(9650, 28)

In [2453]:

y=df2['Price']
y.dropna()

0        3897
1        7662
2        6218
3       13302
4        3873
        ...  
9645     3257
9646     4107
9647     4145
9648    12648
9649    11753
Name: Price, Length: 9650, dtype: int64

In [2454]:
X_train, X_test, y_train, y_test = train_test_split(X, y ,test_size=0.3, random_state=42)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((6755, 28), (2895, 28), (6755,), (2895,))

In [2456]:
models = [['LinearRegression : ', LinearRegression()],
          ['ElasticNet :', ElasticNet()],
          ['Lasso : ', Lasso()],
          ['Ridge : ', Ridge()],
          ['KNeighborsRegressor : ', KNeighborsRegressor()],
          ['DecisionTreeRegressor : ', DecisionTreeRegressor()],
          ['RandomForestRegressor : ', RandomForestRegressor()],
          ['SVR : ', SVR()],
          ['AdaBoostRegressor : ', AdaBoostRegressor()],
          ['GradientBoostingRegressor : ', GradientBoostingRegressor()],
          ['ExtraTreeRegressor : ', ExtraTreeRegressor()],
          ['HuberRegressor : ', HuberRegressor()],
          ['XGBRegressor : ', XGBRegressor()],
          ['BayesianRidge : ', BayesianRidge()]]

In [2458]:
for name, model in models:
    model=model
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    print(name, (np.sqrt(mean_squared_error(y_test, predictions))))

LinearRegression :  2688.242828471959
ElasticNet : 3364.226747874212
Lasso :  2671.204538060698
Ridge :  2632.3990250854354
KNeighborsRegressor :  3239.943511657085
DecisionTreeRegressor :  2026.10326229598
RandomForestRegressor :  1661.6022442641688
SVR :  4246.399013841906
AdaBoostRegressor :  3298.0610593706474
GradientBoostingRegressor :  1901.5837438720062
ExtraTreeRegressor :  2360.218484246037
HuberRegressor :  2889.194733062066
XGBRegressor :  1551.8431382586082
BayesianRidge :  2683.315621851587


In [2459]:
algorithms = {
    'XGBRegressor' : {
        'model' : XGBRegressor(),
        'param' : {
            'learning_rate' : [0.5, 0.8, 0.1, 0.20, 0.25, 0.30],
            'max_depth' : [3, 5, 7, 9, 11, 13, 15],
            'gamma' : [0.1,0.2, 0.3, 0.4, 0.5],
            'min_child_weight' : [1, 3, 5, 7, 9],
            'colsample_bytree' : [0.5, 0.8, 0.1, 0.20, 0.25, 0.30]
        }
    },
    'RandomForestRegressor' : {
        'model' : RandomForestRegressor(),
        'param' : {
            'n_estimators' : [300, 500, 700, 1000, 2100],
            'max_depth' : [3, 5, 7, 9, 11, 13, 15],
            'max_features' : ["auto", "sqrt", "log2"],
            'min_samples_split' : [2, 4, 6, 8]
        }
    },
    'GradientBoostingRegressor' : {
        'model' : GradientBoostingRegressor(),
        'param' : {
            'learning_rate' : [0.5, 0.8, 0.1, 0.20, 0.25, 0.30],
            'n_estimators' : [300, 500, 700, 1000, 2100],
            'criterion' : ['friedman_mse', 'mse']
        }
    }
}

In [2460]:
score = []

for name, mp in algorithms.items() :
    rs = RandomizedSearchCV(estimator = mp['model'], param_distributions = mp['param'], cv = 10, n_jobs=-1, verbose=3)
    rs.fit(X_train, y_train)
    score.append({
        'model': name,
        'score' : rs.best_score_,
        'params' : rs.best_params_
    })

Fitting 10 folds for each of 10 candidates, totalling 100 fits
Fitting 10 folds for each of 10 candidates, totalling 100 fits
Fitting 10 folds for each of 10 candidates, totalling 100 fits


In [2461]:
final = pd.DataFrame(score, columns=['model', 'score', 'params'])
final

Unnamed: 0,model,score,params
0,XGBRegressor,0.85,"{'min_child_weight': 1, 'max_depth': 13, 'lear..."
1,RandomForestRegressor,0.86,"{'n_estimators': 300, 'min_samples_split': 4, ..."
2,GradientBoostingRegressor,0.87,"{'n_estimators': 1000, 'learning_rate': 0.2, '..."


In [2462]:
final['params'][2]

{'n_estimators': 1000, 'learning_rate': 0.2, 'criterion': 'friedman_mse'}

In [2463]:
regressor = GradientBoostingRegressor(n_estimators = 500, learning_rate = 0.3, criterion = 'friedman_mse')
regressor.fit(X_train, y_train)
prediction = regressor.predict(X_test)
print('RMSE : {}'.format(np.sqrt(mean_squared_error(y_test, prediction))))

RMSE : 1616.4870207513604


In [2464]:
regressor.score(X_train, y_train), regressor.score(X_test, y_test)

(0.917559308371242, 0.8741844247901326)

In [2465]:
prediction[0]

4708.77230863138

In [2466]:
df2['Price'][0]

3897

In [2467]:
print('MAE:', mean_absolute_error(y_test, prediction))
print('MSE:', mean_squared_error(y_test, prediction))
print('RMSE:', np.sqrt(mean_squared_error(y_test, prediction)))

MAE: 956.7978598218374
MSE: 2613030.288257609
RMSE: 1616.4870207513604


In [2468]:
import pickle
file = open('final_model.pkl', 'wb')
pickle.dump(regressor, file)

In [None]:
import json
columns = {
    'data_columns' : [col.lower() for col in X.columns]
}
with open("columns.json","w") as f:
    f.write(json.dumps(columns))