## Model Selection
cv, ensemble , hyperparameter tuning
pick best model

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df = pd.read_csv('featured_encoded_df.csv')

In [3]:
df.isnull().sum()

IndiGo               0
Jet Airways          0
Multiple carriers    0
SpiceJet             0
others               0
Chennai              0
Delhi                0
Delhi.1              0
Kolkata              0
Kolkata.1            0
Mumbai               0
Cochin               0
Delhi.2              0
Delhi.3              0
Hyderabad            0
Kolkata.2            0
Kolkata.3            0
New Delhi            0
Route 1              0
Route 2              0
Route 3              0
Route 4              0
Route 5              0
Total_Stops          0
journey_day          0
journey_month        0
dep_time_hour        0
dep_time_min         0
arrival_time_hour    0
arrival_time_min     0
Duration_hours       0
Duration_mins        0
Price                0
dtype: int64

In [4]:
df.head()

Unnamed: 0,IndiGo,Jet Airways,Multiple carriers,SpiceJet,others,Chennai,Delhi,Delhi.1,Kolkata,Kolkata.1,...,Total_Stops,journey_day,journey_month,dep_time_hour,dep_time_min,arrival_time_hour,arrival_time_min,Duration_hours,Duration_mins,Price
0,1,0,0,0,0,0,0,0,0,0,...,0,24,3,22,20,1,10,2,50,3897
1,0,0,0,0,0,0,0,0,1,0,...,2,5,1,5,50,13,15,7,25,7662
2,0,1,0,0,0,0,1,0,0,0,...,2,6,9,9,25,4,25,19,0,13882
3,1,0,0,0,0,0,0,0,1,0,...,1,5,12,18,5,23,30,5,25,6218
4,1,0,0,0,0,0,0,0,0,0,...,1,3,1,16,50,21,35,4,45,13302


In [5]:
x = df.drop("Price" , axis = 1)
y = df['Price']

## train test split

In [6]:
from sklearn.model_selection import train_test_split

In [7]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size = 0.2 , random_state = 123)

## Models

In [8]:
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

In [9]:
from sklearn.metrics import r2_score

In [10]:
def predict(model):
    model_var = model()
    model_var.fit(x_train,y_train)
    y_pred = model_var.predict(x_test)
    print(f"Training score: {model_var.score(x_train , y_train)}")
    print(f"Testing score: {r2_score(y_pred,y_test)} ")

In [11]:
predict(LinearRegression)

Training score: 0.6426840138583916
Testing score: 0.448815427170313 


In [12]:
predict(KNeighborsRegressor)

Training score: 0.7944794174120825
Testing score: 0.5726622094674378 


In [13]:
predict(DecisionTreeRegressor)

Training score: 0.9611333614476625
Testing score: 0.7219589957708209 


In [14]:
predict(RandomForestRegressor)

Training score: 0.9498487280548251
Testing score: 0.8040470859719131 


### Bagging

In [15]:
from sklearn.ensemble import BaggingRegressor

In [16]:
bag = BaggingRegressor( base_estimator = LinearRegression() )
bag.fit(x_train , y_train)
y_pred = bag.predict(x_test)




In [17]:
bag.score(x_train , y_train)

0.6425953218523313

In [18]:
r2_score(y_test , y_pred)

0.6491572364947568

In [19]:
# bagging with randomforest 

In [20]:
bag = BaggingRegressor( base_estimator = RandomForestRegressor() )
bag.fit(x_train , y_train)
y_pred = bag.predict(x_test)

In [21]:
bag.score(x_train , y_train)

0.9289434624774052

In [22]:
r2_score(y_test , y_pred)

0.8450149492788659

### Boosting

#### GB

In [23]:
from sklearn.ensemble import GradientBoostingRegressor

In [24]:
gr = GradientBoostingRegressor()

In [25]:
gr.fit(x_train , y_train)

In [26]:
y_pred = gr.predict(x_test)

In [27]:
gr.score(x_train ,y_train)

0.8024081795595147

In [28]:
r2_score(y_pred, y_test)

0.7202238634336253

#### AB

In [29]:
from sklearn.ensemble import AdaBoostRegressor

In [31]:
ab = AdaBoostRegressor(base_estimator= DecisionTreeRegressor())

In [32]:
ab.fit(x_train , y_train)

In [33]:
y_pred = ab.predict(x_test)

In [34]:
ab.score(x_train, y_train)

0.9434292943877968

In [35]:
r2_score(y_pred , y_test)

0.7820143445029277

## Voting

In [36]:
from sklearn.ensemble import VotingRegressor

In [37]:
vr = VotingRegressor(estimators = [ ('lr' , LinearRegression() ) , ('rf' , RandomForestRegressor()) , ('knnr' , KNeighborsRegressor()) ])

In [38]:
vr.fit(x_train , y_train)

In [39]:
y_pred = vr.predict(x_test)

In [40]:
vr.score(x_train , y_train)

0.8595073636702384

In [41]:
r2_score(y_pred , y_test)

0.6926353689952449

## Stacking

In [42]:
from sklearn.ensemble import StackingRegressor

In [43]:
sr = StackingRegressor(estimators = [ ('lr' , LinearRegression() ) , ('dr' , DecisionTreeRegressor) , ('knnr' , KNeighborsRegressor()) ],
                      final_estimator= RandomForestRegressor() )

In [44]:
#sr.fit(x_train , y_train)

 **from above it is clear that the best model is Bagging with random forest**

#### Hyperparameter Tuning
    1.Choose following method for hyperparameter tuning
        a.RandomizedSearchCV --> Fast way to Hypertune model
        b.GridSearchCV--> Slow way to hypertune my model
    
    2.Assign hyperparameters in form of dictionary
    3.Fit the model
    4.Check best paramters and best score

In [45]:
from sklearn.model_selection import RandomizedSearchCV

In [46]:
# Number of trees in random forest
n_estimators=[int(x) for x in np.linspace(start=100,stop=1200,num=6)]

# Number of features to consider at every split
max_features=['auto','sqrt']

# Maximum number of levels in tree
max_depth=[int(x) for x in np.linspace(5,30,num=4)]

# Minimum number of samples required to split a node
min_samples_split=[5,10,15,100]

In [47]:
# Create the random grid

random_grid={
    'n_estimators':n_estimators,
    'max_features':max_features,
'max_depth':max_depth,
    'min_samples_split':min_samples_split
}

In [48]:
rs = RandomizedSearchCV(RandomForestRegressor() ,random_grid , cv =3,verbose=2,n_jobs=-1)

In [49]:
rs.fit(x_train , y_train)

Fitting 3 folds for each of 10 candidates, totalling 30 fits


  warn(


In [54]:
rs.best_params_

{'n_estimators': 320,
 'min_samples_split': 5,
 'max_features': 'auto',
 'max_depth': 13}

In [55]:
y_pred = rs.predict(x_test)

In [56]:
rs.score(x_train , y_train)

0.9211538756925014

In [57]:
r2_score(y_pred , y_test)

0.8385507293995125

 **Conclusion: from above it is clear that the best model is Bagging with random forest**
 
 bag = BaggingRegressor( base_estimator = RandomForestRegressor() )

In [58]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline