In [37]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, log_loss
from sklearn.model_selection import train_test_split
from sklearn.compose import make_column_transformer 
from sklearn.compose import make_column_selector
import warnings
from sklearn.ensemble import StackingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

In [5]:
cancer = pd.read_csv(r"C:\Training\Academy\Statistics (Python)\Cases\Wisconsin\BreastCancer.csv",index_col=0)
X = cancer.drop('Class', axis=1)
y = cancer['Class']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=24, test_size=0.3, stratify=y)

In [9]:
knn = KNeighborsClassifier()
nb = GaussianNB()
dtc = DecisionTreeClassifier(random_state=24)
svm = SVC(probability=True, random_state=24)
lr = LogisticRegression(random_state=24)
stack = StackingClassifier(estimators=[('KNN',knn),('NB',nb),('TREE',dtc),('SVM',svm)],
                           final_estimator=lr)

In [11]:
stack.fit(X_train, y_train)
y_pred = stack.predict(X_test)
print(accuracy_score(y_test, y_pred))

0.9714285714285714


In [13]:
y_pred_prob = stack.predict_proba(X_test)[:,1]
print(roc_auc_score(y_test, y_pred_prob))

0.990841384863124


Using `passthrough=True` option

In [17]:
stack = StackingClassifier(estimators=[('KNN',knn),('NB',nb),('TREE',dtc),('SVM',svm)],
                           final_estimator=lr, passthrough=True)
stack.fit(X_train, y_train)
y_pred = stack.predict(X_test)
print(accuracy_score(y_test, y_pred))

0.9714285714285714


In [19]:
y_pred_prob = stack.predict_proba(X_test)[:,1]
print(roc_auc_score(y_test, y_pred_prob))

0.9959742351046699


## Glass Identification

In [24]:
glass = pd.read_csv(r"C:\Training\Academy\Statistics (Python)\Cases\Glass Identification\Glass.csv")
X = glass.drop('Type', axis=1)
y = glass['Type']

In [26]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=24, test_size=0.3, stratify=y)
knn = KNeighborsClassifier()
nb = GaussianNB()
dtc = DecisionTreeClassifier(random_state=24)
svm = SVC(probability=True, random_state=24)
lr = LogisticRegression(random_state=24)

w/o `passthrough`

In [29]:
stack = StackingClassifier(estimators=[('KNN',knn),('NB',nb),('TREE',dtc),('SVM',svm)],
                           final_estimator=lr)
stack.fit(X_train, y_train)
y_pred = stack.predict(X_test)
print(accuracy_score(y_test, y_pred))
y_pred_prob = stack.predict_proba(X_test)
print(log_loss(y_test, y_pred_prob))

0.7230769230769231
0.7261085625838982


with `passthrough`

In [32]:
stack = StackingClassifier(estimators=[('KNN',knn),('NB',nb),('TREE',dtc),('SVM',svm)],
                           final_estimator=lr, passthrough=True)
stack.fit(X_train, y_train)
y_pred = stack.predict(X_test)
print(accuracy_score(y_test, y_pred))
y_pred_prob = stack.predict_proba(X_test)
print(log_loss(y_test, y_pred_prob))

0.7076923076923077
0.7747824074555355


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Grid Search

In [39]:
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=24)
rf = RandomForestClassifier(random_state=24)
stack = StackingClassifier(estimators=[('KNN',knn),('NB',nb),('TREE',dtc),('SVM',svm)],
                           final_estimator=rf)
stack.get_params()

{'cv': None,
 'estimators': [('KNN', KNeighborsClassifier()),
  ('NB', GaussianNB()),
  ('TREE', DecisionTreeClassifier(random_state=24)),
  ('SVM', SVC(probability=True, random_state=24))],
 'final_estimator__bootstrap': True,
 'final_estimator__ccp_alpha': 0.0,
 'final_estimator__class_weight': None,
 'final_estimator__criterion': 'gini',
 'final_estimator__max_depth': None,
 'final_estimator__max_features': 'sqrt',
 'final_estimator__max_leaf_nodes': None,
 'final_estimator__max_samples': None,
 'final_estimator__min_impurity_decrease': 0.0,
 'final_estimator__min_samples_leaf': 1,
 'final_estimator__min_samples_split': 2,
 'final_estimator__min_weight_fraction_leaf': 0.0,
 'final_estimator__monotonic_cst': None,
 'final_estimator__n_estimators': 10,
 'final_estimator__n_jobs': None,
 'final_estimator__oob_score': True,
 'final_estimator__random_state': 24,
 'final_estimator__verbose': 0,
 'final_estimator__warm_start': False,
 'final_estimator': RandomForestClassifier(n_estimators=

In [63]:
params = {'final_estimator__max_depth':[3,4,5] , 'SVM__C':np.linspace(0.001, 3, 5),
          'TREE__max_depth':[None, 2, 4], 'final_estimator__n_estimators':[10, 50],
          'passthrough':[True, False]}
gcv = GridSearchCV(stack, param_grid=params, cv=kfold, scoring='neg_log_loss',verbose=3)
gcv.fit(X, y)

Fitting 5 folds for each of 180 candidates, totalling 900 fits
[CV 1/5] END SVM__C=0.001, TREE__max_depth=None, final_estimator__max_depth=3, final_estimator__n_estimators=10, passthrough=True;, score=-0.696 total time=   0.0s
[CV 2/5] END SVM__C=0.001, TREE__max_depth=None, final_estimator__max_depth=3, final_estimator__n_estimators=10, passthrough=True;, score=-0.924 total time=   0.0s
[CV 3/5] END SVM__C=0.001, TREE__max_depth=None, final_estimator__max_depth=3, final_estimator__n_estimators=10, passthrough=True;, score=-0.974 total time=   0.0s
[CV 4/5] END SVM__C=0.001, TREE__max_depth=None, final_estimator__max_depth=3, final_estimator__n_estimators=10, passthrough=True;, score=-0.867 total time=   0.0s
[CV 5/5] END SVM__C=0.001, TREE__max_depth=None, final_estimator__max_depth=3, final_estimator__n_estimators=10, passthrough=True;, score=-0.919 total time=   0.0s
[CV 1/5] END SVM__C=0.001, TREE__max_depth=None, final_estimator__max_depth=3, final_estimator__n_estimators=10, pass

In [64]:
print(gcv.best_params_)
print(gcv.best_score_)

{'SVM__C': 3.0, 'TREE__max_depth': 2, 'final_estimator__max_depth': 5, 'final_estimator__n_estimators': 50, 'passthrough': True}
-0.7486144895037242


## Serializing the fitted object

In [68]:
best_stack = gcv.best_estimator_

In [78]:
import pickle 
pkfile = open("C:/Training/Academy/Statistics (Python)/Cases/Glass Identification/stack_gls.pkl", 'wb') 
pickle.dump(best_stack, pkfile) 

In [76]:
best_stack