# Ensembling Modelling

You take predictions from multiple ML models and combine them in some manner to create a final prediction.

### <b>4 Techniques: </b>
1. Max Voting
2. Average
3. Weighted Averaging
4. Rank Averaging


## <b>Max Voting:</b> 
Whatever my models tell me the most, is what I’ll go ahead with. So, here, I take the outcome from individual models and just take a vote. Only for Classification problem


In [3]:
# Import Titanic Dataset which is classification problem
import pandas as pd
import numpy as np

In [2]:
data_mv=pd.read_csv('data_cleaned.csv')

In [6]:
data_mv.shape

(891, 25)

In [7]:
data_mv.head()

Unnamed: 0,Survived,Age,Fare,Pclass_1,Pclass_2,Pclass_3,Sex_female,Sex_male,SibSp_0,SibSp_1,...,Parch_0,Parch_1,Parch_2,Parch_3,Parch_4,Parch_5,Parch_6,Embarked_C,Embarked_Q,Embarked_S
0,0,22.0,7.25,0,0,1,0,1,0,1,...,1,0,0,0,0,0,0,0,0,1
1,1,38.0,71.2833,1,0,0,1,0,0,1,...,1,0,0,0,0,0,0,1,0,0
2,1,26.0,7.925,0,0,1,1,0,1,0,...,1,0,0,0,0,0,0,0,0,1
3,1,35.0,53.1,1,0,0,1,0,0,1,...,1,0,0,0,0,0,0,0,0,1
4,0,35.0,8.05,0,0,1,0,1,1,0,...,1,0,0,0,0,0,0,0,0,1


In [14]:
X_mv = data_mv.drop(['Survived'], axis=1)
y_mv = data_mv['Survived']

In [18]:
# Do the train test split

from sklearn.model_selection import train_test_split

X_mv_train, X_mv_test, y_mv_train, y_mv_test = train_test_split(X_mv, y_mv, random_state=101, stratify=y_mv)

In [20]:
# Checking if data looks correct
X_mv_test.shape, X_mv_train.shape, y_mv_test.shape, y_mv_train.shape

((223, 24), (668, 24), (223,), (668,))

In [21]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

In [29]:
# Logistic Regression Model
model1 = LogisticRegression(solver='liblinear')
model1.fit(X_mv_train, y_mv_train)

predict1 = model1.predict(X_mv_test)

predict1[:10], model1.score(X_mv_test, y_mv_test)

(array([0, 0, 0, 0, 1, 1, 0, 1, 0, 0], dtype=int64), 0.7757847533632287)

In [36]:
# K Nearest Neighbour Model
model2 = KNeighborsClassifier(n_neighbors=5)
model2.fit(X_mv_train, y_mv_train)

predict2 = model2.predict(X_mv_test)

predict2[:10], model2.score(X_mv_test, y_mv_test)

(array([1, 0, 0, 0, 0, 1, 0, 0, 1, 0], dtype=int64), 0.7399103139013453)

In [40]:
# Decision Tree Model

model3 = DecisionTreeClassifier(max_depth=8)
model3.fit(X_mv_train, y_mv_train)
predict3 = model3.predict(X_mv_test)

predict3[:10], model3.score(X_mv_test, y_mv_test)

(array([1, 0, 0, 1, 1, 1, 0, 0, 0, 0], dtype=int64), 0.7847533632286996)

In [50]:
# Create an array

from statistics import mode
final_pred = []

for i in range(0, len(y_mv_test)):
    final_pred = np.append(final_pred, mode([predict1[i], predict2[i], predict3[i]]))


In [51]:
# Manually checking the results
predict1[:10], predict2[:10], predict3[:10], final_pred[:10]

(array([0, 0, 0, 0, 1, 1, 0, 1, 0, 0], dtype=int64),
 array([1, 0, 0, 0, 0, 1, 0, 0, 1, 0], dtype=int64),
 array([1, 0, 0, 1, 1, 1, 0, 0, 0, 0], dtype=int64),
 array([1., 0., 0., 0., 1., 1., 0., 0., 0., 0.]))

In [53]:
from sklearn.metrics import accuracy_score 

accuracy_score(y_mv_test, final_pred)

0.8026905829596412

Thus, the final model created has better accuracy than the 3 individual models

In [54]:
X_mv.head()

Unnamed: 0,Age,Fare,Pclass_1,Pclass_2,Pclass_3,Sex_female,Sex_male,SibSp_0,SibSp_1,SibSp_2,...,Parch_0,Parch_1,Parch_2,Parch_3,Parch_4,Parch_5,Parch_6,Embarked_C,Embarked_Q,Embarked_S
0,22.0,7.25,0,0,1,0,1,0,1,0,...,1,0,0,0,0,0,0,0,0,1
1,38.0,71.2833,1,0,0,1,0,0,1,0,...,1,0,0,0,0,0,0,1,0,0
2,26.0,7.925,0,0,1,1,0,1,0,0,...,1,0,0,0,0,0,0,0,0,1
3,35.0,53.1,1,0,0,1,0,0,1,0,...,1,0,0,0,0,0,0,0,0,1
4,35.0,8.05,0,0,1,0,1,1,0,0,...,1,0,0,0,0,0,0,0,0,1


## <b>Averaging:</b> 
Can be applied to the Regression problem. Take average of whatever my individual models are telling.
Here, I’m not taking into consideration the accuracy of individual models. I’m giving each model equal weight and its not a good strategy.


In [62]:
data_avg = pd.read_csv('train_cleaned.csv')

In [63]:
data_avg.head()

Unnamed: 0,Item_Weight,Item_Visibility,Item_MRP,Outlet_Establishment_Year,Item_Outlet_Sales,Item_Fat_Content_LF,Item_Fat_Content_Low Fat,Item_Fat_Content_Regular,Item_Fat_Content_low fat,Item_Fat_Content_reg,...,Outlet_Size_High,Outlet_Size_Medium,Outlet_Size_Small,Outlet_Location_Type_Tier 1,Outlet_Location_Type_Tier 2,Outlet_Location_Type_Tier 3,Outlet_Type_Grocery Store,Outlet_Type_Supermarket Type1,Outlet_Type_Supermarket Type2,Outlet_Type_Supermarket Type3
0,9.3,0.016047,249.8092,1999,3735.138,0,1,0,0,0,...,0,1,0,1,0,0,0,1,0,0
1,5.92,0.019278,48.2692,2009,443.4228,0,0,1,0,0,...,0,1,0,0,0,1,0,0,1,0
2,17.5,0.01676,141.618,1999,2097.27,0,1,0,0,0,...,0,1,0,1,0,0,0,1,0,0
3,19.2,0.0,182.095,1998,732.38,0,0,1,0,0,...,0,0,0,0,0,1,1,0,0,0
4,8.93,0.0,53.8614,1987,994.7052,0,1,0,0,0,...,1,0,0,0,0,1,0,1,0,0


In [67]:
X_avg = data_avg.drop('Item_Outlet_Sales', axis=1)
y_avg = data_avg.Item_Outlet_Sales

In [70]:
from sklearn.model_selection import train_test_split

X_avg_train, X_avg_test, y_avg_train, y_avg_test = train_test_split(X_avg, y_avg, random_state=111, shuffle=False)

In [72]:
#Checking if all the values looks ok?
X_avg_train.shape, X_avg_test.shape, y_avg_train.shape, y_avg_test.shape

((6392, 45), (2131, 45), (6392,), (2131,))

In [74]:
# Here, we will import LinearRegressor instead of LogisticRegression that we imported for Classification problem

from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor

In [78]:
# Linear Model
avg_model1 = LinearRegression()
avg_model1.fit(X_avg_train, y_avg_train)

avg_pred1 = avg_model1.predict(X_avg_test)
avg_pred1[:10], avg_model1.score(X_avg_test, y_avg_test)

(array([ 2443.32513354,  2788.36657095,  2169.73477778,  2385.33394477,
         2716.58813899,  3771.41850023,  5119.35227101,  4133.40997069,
        -1010.95412123,  1365.9929378 ]),
 0.555158243374883)

In [89]:
# KNN model

avg_model2 = KNeighborsRegressor(n_neighbors=9)
avg_model2.fit(X_avg_train, y_avg_train)

avg_pred2 = avg_model2.predict(X_avg_test)
avg_pred2[:10], avg_model2.score(X_avg_test, y_avg_test)

(array([1859.06155556, 2519.09128889, 2620.88471111, 2277.33191111,
        3801.12617778, 4099.10866667, 5151.88642222, 5378.62831111,
         476.7128    , 1501.15706667]),
 0.5007936087911619)

In [94]:
# Decision Tree Model

avg_model3 = DecisionTreeRegressor(max_depth=5)
avg_model3.fit(X_avg_train, y_avg_train)

avg_pred3 = avg_model3.predict(X_avg_test)
avg_pred3[:10], avg_model3.score(X_avg_test, y_avg_test)

(array([2490.41525738, 3002.0365178 ,  552.86537347, 2490.41525738,
        3002.0365178 , 3906.58061931, 6414.42182571, 4882.56988706,
         114.38444   , 1539.95516351]),
 0.5932206202769862)

In [170]:
avg_model1.score(X_avg_test, y_avg_test), avg_model2.score(X_avg_test, y_avg_test), avg_model3.score(X_avg_test, y_avg_test)

(0.555158243374883, 0.5007936087911619, 0.5932206202769862)

In [171]:
from statistics import mean
avg_final_pred = []

for i in range (0, len(X_avg_test)):
    avg_final_pred = np.append(avg_final_pred, mean([avg_pred1[i],  avg_pred2[i], avg_pred3[i]]))

In [172]:
from sklearn.metrics import r2_score

r2_score(y_avg_test, avg_final_pred)

0.5812946500631868

In [173]:
r2_score(y_avg_test, avg_pred1), r2_score(y_avg_test, avg_pred2), r2_score(y_avg_test, avg_pred3)

# We can see that Average is predicting better but not the best

(0.555158243374883, 0.5007936087911619, 0.5932206202769862)

## <b>Weighted Average: </b>

Give each model a weight (as per R-sq on validation) and multiply the weight to individual prediction and take their mean. This should give better results as I have given more weight to models which were performing well.

In [174]:
avg_model1.score(X_avg_test, y_avg_test), avg_model2.score(X_avg_test, y_avg_test), avg_model3.score(X_avg_test, y_avg_test)

# Model 1 and 3 are performing better than model 2. So, we will give weight of 2 to model1 n model3 and weight 1 to model2.

(0.555158243374883, 0.5007936087911619, 0.5932206202769862)

In [175]:
from statistics import mean
avg_final_pred = []

for i in range (0, len(X_avg_test)):
    avg_final_pred = np.append(avg_final_pred, mean([avg_pred1[i], avg_pred1[i], avg_pred2[i], avg_pred3[i], avg_pred3[i], avg_pred3[i]]))

In [176]:
from sklearn.metrics import r2_score

r2_score(y_avg_test, avg_final_pred)

0.5909167406967766

In [177]:
r2_score(y_avg_test, avg_pred1), r2_score(y_avg_test, avg_pred2), r2_score(y_avg_test, avg_pred3)

# We can see that Average is predicting better but not the best

(0.555158243374883, 0.5007936087911619, 0.5932206202769862)

## <b>Rank Averaging:</b> 
In weighted Average, the weights were decided by me. Another way to do it instead of me deciding how much weight to give each model is by giving each model a rank. Worst Model -> 1, Best Model -> n. Then sum up these ranks and divide each of them with total value. This gives weights.
Multiply weights with each model and then take sum of all these values (no need to take mean as the weights are already normalized)


In [178]:
index = [1,2,3]
rank = [r2_score(y_avg_test, avg_pred1), r2_score(y_avg_test, avg_pred2), r2_score(y_avg_test, avg_pred3)]
ranking_data = pd.DataFrame({
    "accuracy": rank
}, index=index
)

ranking_data

Unnamed: 0,accuracy
1,0.555158
2,0.500794
3,0.593221


In [179]:
sorted_rank = ranking_data.sort_values('accuracy')
sorted_rank

Unnamed: 0,accuracy
2,0.500794
1,0.555158
3,0.593221


In [180]:
sorted_rank['rank'] = [i for i in range(1, len(sorted_rank)+1)]
sorted_rank


Unnamed: 0,accuracy,rank
2,0.500794,1
1,0.555158,2
3,0.593221,3


In [181]:
sorted_rank['weight'] = sorted_rank['rank']/sorted_rank['rank'].sum()
sorted_rank

Unnamed: 0,accuracy,rank,weight
2,0.500794,1,0.166667
1,0.555158,2,0.333333
3,0.593221,3,0.5


In [182]:
# Resetting the index
sorted_rank.reset_index(inplace=True, drop=True)
sorted_rank

Unnamed: 0,accuracy,rank,weight
0,0.500794,1,0.166667
1,0.555158,2,0.333333
2,0.593221,3,0.5


In [183]:
sorted_rank.weight.loc[0]

0.16666666666666666

In [187]:
avg_rank_final_pred = []

for i in range (0, len(X_avg_test)):
    avg_rank_final_pred = np.append(avg_rank_final_pred, ([avg_pred1[i]*sorted_rank.weight.loc[0] + avg_pred2[i]*sorted_rank.weight.loc[1] + avg_pred3[i]*sorted_rank.weight.loc[2]]))

In [188]:
avg_rank_final_pred.shape

(2131,)

In [190]:
r2_score(y_avg_test, avg_rank_final_pred)

0.5857745304925561

In [193]:
# Method used in reference Doc

wt_pred1 = avg_pred1*float(sorted_rank.loc[[0],['weight']].values)
wt_pred2 = avg_pred2*float(sorted_rank.loc[[1],['weight']].values)
wt_pred3 = avg_pred3*float(sorted_rank.loc[[2],['weight']].values)
ranked_prediction = wt_pred1 + wt_pred2 + wt_pred3
ranked_prediction

array([2272.11566946, 2805.44311702, 1511.68338673, ..., 1381.75926046,
       1615.67915588, 1123.23513038])

In [194]:
r2_score(y_avg_test, ranked_prediction)

0.5857745304925561