# Voting, Stacking and Blending

In [1]:
%pylab inline
import numpy as np
import pandas as pd
import pylab as plt

import warnings
warnings.filterwarnings("ignore")

Populating the interactive namespace from numpy and matplotlib


# Задача регрессии

In [2]:
from sklearn.datasets import load_boston

data = load_boston()

X_full = data.data
y_full = data.target

In [3]:
from sklearn.model_selection import train_test_split
X, X_test, y, y_test = train_test_split(X_full, y_full, test_size=100, 
                                        random_state=241)
Xtrain, Xval, ytrain, yval = train_test_split(X, y, test_size=0.35, 
                                        random_state=241)

In [4]:
from sklearn.model_selection import KFold, cross_val_score
cv = KFold(Xtrain.shape[0], shuffle=True, random_state=241)

In [5]:
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor()
print(cross_val_score(rf, Xtrain, ytrain, cv=cv, scoring ='neg_mean_squared_error').mean())

-15.82817946768061


In [6]:
from sklearn.metrics import mean_squared_error

rf.fit(Xtrain, ytrain)
pred_rf = rf.predict(Xval)
mean_squared_error(pred_rf, yval)

9.626457342657343

In [7]:
from sklearn.ensemble import AdaBoostRegressor
ada = AdaBoostRegressor()
print(cross_val_score(ada, Xtrain, ytrain, cv=cv, scoring ='neg_mean_squared_error').mean())

-15.623140045676427


In [8]:
ada.fit(Xtrain, ytrain)
pred_ada = ada.predict(Xval)
mean_squared_error(pred_ada, yval)

12.92134697989273

# Blending

1) смешаем алгоритмы с весами 1:1, то есть в качестве ответа предскажем среднее значение смеси

In [9]:
pred_mix_1 = 0.5 *(pred_rf + pred_ada)
print('Validation prediction:',mean_squared_error(pred_mix_1, yval))

rf_test = rf.predict(X_test)
ada_test = ada.predict(X_test)
pred_test_1 = 0.5 *(rf_test + ada_test)
print('Test prediction:',mean_squared_error(pred_test_1, y_test))

Validation prediction: 10.101848464878785
Test prediction: 9.152245198242715


2) Смешаем алгоритмы с весами $\alpha$ : $1-\alpha$. Подберём $\alpha$ в цикле

In [10]:
max_score = -100
opt_alpha = 0

for alpha in np.arange(0.01,1.01,0.01):
    pred_mix_2 = alpha * pred_rf + (1-alpha) * pred_ada
    curr_score = -mean_squared_error(pred_mix_2, yval)
    
    if curr_score > max_score:
        max_score = curr_score
        opt_alpha = alpha
        
print(opt_alpha, -max_score)

0.85 9.52294340308056


In [11]:
rf_test = rf.predict(X_test)
ada_test = ada.predict(X_test)
pred_test_2 = opt_alpha * rf_test + (1-opt_alpha) * ada_test
print('Test prediction:',mean_squared_error(pred_test_2, y_test))

Test prediction: 10.415749003637197


# Stacking

In [12]:
SecondTrain = pd.DataFrame(np.nan, index=np.arange(len(Xval)), columns=['RF','ADA'])

SecondTrain['RF'] = rf.predict(Xval)
SecondTrain['ADA'] = ada.predict(Xval)

In [13]:
meta = RandomForestRegressor()
meta.fit(SecondTrain,yval)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [14]:
rf_test = rf.predict(X_test)
ada_test = ada.predict(X_test)

FirstLevelPred = pd.DataFrame(np.nan, index=np.arange(len(X_test)), columns=['RF','ADA'])

FirstLevelPred['RF'] = rf.predict(X_test)
FirstLevelPred['ADA'] = ada.predict(X_test)

In [15]:
final_pred = meta.predict(FirstLevelPred)

print('Final prediction:',mean_squared_error(final_pred,y_test))

Final prediction: 12.506756000000003


# Задача классификации

In [16]:
from sklearn.datasets import load_iris

data = load_iris()

X_full = data.data
y_full = data.target

In [18]:
X, X_test, y, y_test = train_test_split(X_full, y_full, test_size=100, 
                                        random_state=135)

In [19]:
from sklearn.model_selection import KFold, cross_val_score
cv = KFold(X.shape[0], shuffle=True, random_state=123)

In [20]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()
print(cross_val_score(rf, X, y, cv=cv, scoring ='accuracy').mean())

0.94


In [21]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
print(cross_val_score(lr, X, y, cv=cv, scoring ='accuracy').mean())

0.88


# Voting

In [22]:
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score

voting = VotingClassifier(estimators=[('RF',rf),('LR',lr)], voting='soft')
voting.fit(X, y)
pred_voting = voting.predict(X_test)

print('Accuracy on test:', accuracy_score(pred_voting,y_test))

Accuracy on test: 0.95


# Задание

1) Разбейте X, y на тренировочную и валидационную части

2) Сделайте предсказание с помощью блендинга (предсказывайте вероятности классов, а не сами классы - методом predict_proba)

3) Сделайте предсказание с помощью стекинга (предсказывайте вероятности классов)

**Blending**

In [None]:
#Your code here
...

pred_proba_rf = rf.predict_proba(X_test)
pred_proba_lr = lr.predict_proba(X_test)

In [None]:
mix_pred = 0.5 * (pred_proba_rf + pred_proba_lr)

pred_final = [np.argmax(elem) for elem in mix_pred]

print('Accuracy blending:', accuracy_score(pred_final, y_test))

**Stacking**

In [None]:
#Your code here

# Готовое решение для stacking

Вернемся к задаче регрессии с датасетом Boston.

In [23]:
from vecstack import stacking

models = [RandomForestRegressor(),AdaBoostRegressor()]

S_train, S_test = stacking(models, X, y, X_test, regression=True, verbose=1)

task:         [regression]
metric:       [mean_absolute_error]
mode:         [oof_pred_bag]
n_models:     [2]

model  0:     [RandomForestRegressor]
    ----
    MEAN:     [0.07019231] + [0.01442308]
    FULL:     [0.07000000]

model  1:     [AdaBoostRegressor]
    ----
    MEAN:     [0.04006410] + [0.04012815]
    FULL:     [0.04000000]



In [24]:
meta = RandomForestRegressor()
meta.fit(S_train, y)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [25]:
final_pred = meta.predict(S_test)

print('Final prediction:',mean_squared_error(final_pred,y_test))

Final prediction: 0.05015277777777778


# Задание

Используйте библиотеку vecstack для решения задачи классификации Iris с помощью stacking

In [None]:
#your code here

# Задание

Решите задачу с данными train_medium.csv, test_medium.csv, используя stacking и/или blending нескольких алгоритмов. Сравните полученное качество (на кросс-валидации) с качеством, полученным с помощью xgboost, catboost, lightgbm в отдельности.

In [None]:
#your code here