# Stacking 

In [1]:
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from vecstack import stacking
from sklearn import metrics
from sklearn.preprocessing import OneHotEncoder

from sklearn.metrics import roc_auc_score

In [2]:
link = 'https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data'
names = ['Class', 'Alcohol', 'Malic acid', 'Ash',
         'Alcalinity of ash' ,'Magnesium', 'Total phenols',
         'Flavanoids', 'Nonflavanoid phenols', 'Proanthocyanins',     'Color intensity', 'Hue', 'OD280/OD315 of diluted wines',
         'Proline']
df = pd.read_csv(link, header=None, names=names)
df.sample(5)

Unnamed: 0,Class,Alcohol,Malic acid,Ash,Alcalinity of ash,Magnesium,Total phenols,Flavanoids,Nonflavanoid phenols,Proanthocyanins,Color intensity,Hue,OD280/OD315 of diluted wines,Proline
81,2,12.72,1.81,2.2,18.8,86,2.2,2.53,0.26,1.77,3.9,1.16,3.14,714
66,2,13.11,1.01,1.7,15.0,78,2.98,3.18,0.26,2.28,5.3,1.12,3.18,502
112,2,11.76,2.68,2.92,20.0,103,1.75,2.03,0.6,1.05,3.8,1.23,2.5,607
83,2,13.05,3.86,2.32,22.5,85,1.65,1.59,0.61,1.62,4.8,0.84,2.01,515
152,3,13.11,1.9,2.75,25.5,116,2.2,1.28,0.26,1.56,7.1,0.61,1.33,425


In [3]:
df.Class.value_counts()

2    71
1    59
3    48
Name: Class, dtype: int64

In [4]:
y = df[['Class']]
X = df.iloc[:,1:]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [5]:
models = [
    KNeighborsClassifier(n_neighbors=5,
                        n_jobs=-1),
        
    RandomForestClassifier(random_state=0, n_jobs=-1, 
                           n_estimators=100, max_depth=3),
        
    XGBClassifier(random_state=0, n_jobs=-1, learning_rate=0.1, 
                  n_estimators=100, max_depth=3)
]

In [12]:
def auc(y_true, y_pred):
    """ROC AUC metric for both binary and multiclass classification.
    
    Parameters
    ----------
    y_true : 1d numpy array
        True class labels
    y_pred : 2d numpy array
        Predicted probabilities for each class
    """
    ohe = OneHotEncoder(sparse=False)
    y_true = ohe.fit_transform(y_true.reshape(-1, 1))
    auc_score = roc_auc_score(y_true, y_pred)
    return auc_score

 using first level models to make predictions



In [13]:
S_train, S_test = stacking(models,                   
                           X_train, y_train, X_test,   
                           regression=False, 
     
                           mode='oof_pred_bag', 
       
                           needs_proba=False,
         
                           save_dir=None, 
            
                           metric=accuracy_score, 
    
                           n_folds=4, 
                 
                           stratified=True,
            
                           shuffle=True,  
            
                           random_state=0,    
         
                           verbose=2)

  y = column_or_1d(y, warn=True)


task:         [classification]
n_classes:    [3]
metric:       [accuracy_score]
mode:         [oof_pred_bag]
n_models:     [3]

model  0:     [KNeighborsClassifier]
    fold  0:  [0.69444444]
    fold  1:  [0.63888889]
    fold  2:  [0.62857143]
    fold  3:  [0.65714286]
    ----
    MEAN:     [0.65476190] + [0.02509117]
    FULL:     [0.65492958]

model  1:     [RandomForestClassifier]
    fold  0:  [0.97222222]
    fold  1:  [0.97222222]
    fold  2:  [0.97142857]
    fold  3:  [1.00000000]
    ----
    MEAN:     [0.97896825] + [0.01214701]
    FULL:     [0.97887324]

model  2:     [XGBClassifier]
    fold  0:  [0.94444444]
    fold  1:  [0.94444444]
    fold  2:  [0.97142857]
    fold  3:  [1.00000000]
    ----
    MEAN:     [0.96507937] + [0.02297479]
    FULL:     [0.96478873]



The stacking function takes several inputs:
- models: the first level models we defined earlier
- X_train, y_train, X_test: our data
- regression: Boolean indicating whether we want to use the function for regression. In our case set to False since this is a classification
- mode: using the earlier describe out-of-fold during cross-validation
- needs_proba: Boolean indicating whether you need the probabilities of class labels
- save_dir: save the result to directory Boolean
- metric: what evaluation metric to use (we imported the accuracy_score in the beginning)
- n_folds: how many folds to use for cross-validation
- stratified: whether to use stratified cross-validation
- shuffle: whether to shuffle the data
- random_state: setting a random state for reproducibility
- verbose: 2 here refers to printing all info

ll that’s left to do now is fit the second level model(s) of our choice on our predictions to make our final predictions.

In our case, we are going to use an XGBoost Classifier. This step is not significantly different from a regular fit-and-predict in sklearn except for the fact that instead of using X_train to train our model, we are using our predictions S_train.

In [14]:
model = XGBClassifier(random_state=0, n_jobs=-1, learning_rate=0.1, 
                      n_estimators=100, max_depth=3)
    
model = model.fit(S_train, y_train)
y_pred = model.predict(S_test)
print('Final prediction score: [%.8f]' % accuracy_score(y_test, y_pred))

Final prediction score: [0.94444444]


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


## Conclusion

Using vecstacks’ stacking automation, we’ve managed to predict the correct wine cultivar with an accuracy of approximately 94.4%!