In [1]:
from sklearn.ensemble import AdaBoostClassifier
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.compose import make_column_transformer 
from sklearn.compose import make_column_selector
import warnings

In [2]:
cancer = pd.read_csv(r"C:\Training\Academy\Statistics (Python)\Cases\Wisconsin\BreastCancer.csv",index_col=0)
X = cancer.drop('Class', axis=1)
y = cancer['Class']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=24, test_size=0.3, stratify=y)

In [3]:
ada = AdaBoostClassifier(n_estimators=50, random_state=24)
ada.fit(X_train, y_train)
y_pred = ada.predict(X_test)
print(accuracy_score(y_test, y_pred))

0.9761904761904762




In [4]:
dtc = DecisionTreeClassifier(max_depth=3, random_state=24)
ada = AdaBoostClassifier(estimator=dtc, n_estimators=50, random_state=24)
ada.fit(X_train, y_train)
y_pred = ada.predict(X_test)
print(accuracy_score(y_test, y_pred))



0.9619047619047619


Grid Search CV

In [6]:
dtc = DecisionTreeClassifier(random_state=24)
ada = AdaBoostClassifier(estimator=dtc, random_state=24)
ada.get_params()

{'algorithm': 'SAMME.R',
 'estimator__ccp_alpha': 0.0,
 'estimator__class_weight': None,
 'estimator__criterion': 'gini',
 'estimator__max_depth': None,
 'estimator__max_features': None,
 'estimator__max_leaf_nodes': None,
 'estimator__min_impurity_decrease': 0.0,
 'estimator__min_samples_leaf': 1,
 'estimator__min_samples_split': 2,
 'estimator__min_weight_fraction_leaf': 0.0,
 'estimator__monotonic_cst': None,
 'estimator__random_state': 24,
 'estimator__splitter': 'best',
 'estimator': DecisionTreeClassifier(random_state=24),
 'learning_rate': 1.0,
 'n_estimators': 50,
 'random_state': 24}

In [7]:
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=24)
params = {'estimator__max_depth':[1,2,3],'n_estimators':[10,50,100] }
gcv = GridSearchCV(ada, param_grid=params, cv=kfold, scoring='neg_log_loss',verbose=3)
gcv.fit(X, y)

Fitting 5 folds for each of 9 candidates, totalling 45 fits




[CV 1/5] END estimator__max_depth=1, n_estimators=10;, score=-0.412 total time=   0.0s
[CV 2/5] END estimator__max_depth=1, n_estimators=10;, score=-0.348 total time=   0.0s
[CV 3/5] END estimator__max_depth=1, n_estimators=10;, score=-0.420 total time=   0.0s
[CV 4/5] END estimator__max_depth=1, n_estimators=10;, score=-0.378 total time=   0.0s
[CV 5/5] END estimator__max_depth=1, n_estimators=10;, score=-0.386 total time=   0.0s




[CV 1/5] END estimator__max_depth=1, n_estimators=50;, score=-0.493 total time=   0.0s
[CV 2/5] END estimator__max_depth=1, n_estimators=50;, score=-0.437 total time=   0.0s
[CV 3/5] END estimator__max_depth=1, n_estimators=50;, score=-0.500 total time=   0.0s




[CV 4/5] END estimator__max_depth=1, n_estimators=50;, score=-0.476 total time=   0.0s
[CV 5/5] END estimator__max_depth=1, n_estimators=50;, score=-0.520 total time=   0.0s




[CV 1/5] END estimator__max_depth=1, n_estimators=100;, score=-0.552 total time=   0.1s
[CV 2/5] END estimator__max_depth=1, n_estimators=100;, score=-0.508 total time=   0.1s




[CV 3/5] END estimator__max_depth=1, n_estimators=100;, score=-0.570 total time=   0.1s




[CV 4/5] END estimator__max_depth=1, n_estimators=100;, score=-0.550 total time=   0.1s
[CV 5/5] END estimator__max_depth=1, n_estimators=100;, score=-0.582 total time=   0.1s




[CV 1/5] END estimator__max_depth=2, n_estimators=10;, score=-0.314 total time=   0.0s
[CV 2/5] END estimator__max_depth=2, n_estimators=10;, score=-0.155 total time=   0.0s
[CV 3/5] END estimator__max_depth=2, n_estimators=10;, score=-0.139 total time=   0.0s
[CV 4/5] END estimator__max_depth=2, n_estimators=10;, score=-0.207 total time=   0.0s
[CV 5/5] END estimator__max_depth=2, n_estimators=10;, score=-0.293 total time=   0.0s
[CV 1/5] END estimator__max_depth=2, n_estimators=50;, score=-0.111 total time=   0.0s




[CV 2/5] END estimator__max_depth=2, n_estimators=50;, score=-0.129 total time=   0.0s
[CV 3/5] END estimator__max_depth=2, n_estimators=50;, score=-0.116 total time=   0.0s
[CV 4/5] END estimator__max_depth=2, n_estimators=50;, score=-0.162 total time=   0.0s




[CV 5/5] END estimator__max_depth=2, n_estimators=50;, score=-0.159 total time=   0.0s




[CV 1/5] END estimator__max_depth=2, n_estimators=100;, score=-0.119 total time=   0.1s
[CV 2/5] END estimator__max_depth=2, n_estimators=100;, score=-0.152 total time=   0.1s




[CV 3/5] END estimator__max_depth=2, n_estimators=100;, score=-0.140 total time=   0.1s
[CV 4/5] END estimator__max_depth=2, n_estimators=100;, score=-0.174 total time=   0.1s




[CV 5/5] END estimator__max_depth=2, n_estimators=100;, score=-0.164 total time=   0.1s
[CV 1/5] END estimator__max_depth=3, n_estimators=10;, score=-0.143 total time=   0.0s
[CV 2/5] END estimator__max_depth=3, n_estimators=10;, score=-0.192 total time=   0.0s
[CV 3/5] END estimator__max_depth=3, n_estimators=10;, score=-0.340 total time=   0.0s
[CV 4/5] END estimator__max_depth=3, n_estimators=10;, score=-0.190 total time=   0.0s
[CV 5/5] END estimator__max_depth=3, n_estimators=10;, score=-0.132 total time=   0.0s




[CV 1/5] END estimator__max_depth=3, n_estimators=50;, score=-0.067 total time=   0.0s
[CV 2/5] END estimator__max_depth=3, n_estimators=50;, score=-0.123 total time=   0.0s




[CV 3/5] END estimator__max_depth=3, n_estimators=50;, score=-0.145 total time=   0.0s
[CV 4/5] END estimator__max_depth=3, n_estimators=50;, score=-0.175 total time=   0.0s
[CV 5/5] END estimator__max_depth=3, n_estimators=50;, score=-0.134 total time=   0.0s




[CV 1/5] END estimator__max_depth=3, n_estimators=100;, score=-0.069 total time=   0.1s
[CV 2/5] END estimator__max_depth=3, n_estimators=100;, score=-0.111 total time=   0.1s




[CV 3/5] END estimator__max_depth=3, n_estimators=100;, score=-0.122 total time=   0.1s




[CV 4/5] END estimator__max_depth=3, n_estimators=100;, score=-0.181 total time=   0.1s




[CV 5/5] END estimator__max_depth=3, n_estimators=100;, score=-0.112 total time=   0.1s




In [8]:
print(gcv.best_params_)
print(gcv.best_score_)

{'estimator__max_depth': 3, 'n_estimators': 100}
-0.11919655860648562


### `sklearn` Gradient Boosting 

In [10]:
from sklearn.ensemble import GradientBoostingClassifier

In [11]:
gbm = GradientBoostingClassifier(random_state=24)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=24, test_size=0.3, stratify=y)
gbm.fit(X_train, y_train)
y_pred = gbm.predict(X_test)
print(accuracy_score(y_test, y_pred))

0.9761904761904762


GridSearchCV

In [13]:
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=24)
params = {'n_estimators':[10,50], 'max_depth':[2,3,4], 'learning_rate':np.linspace(0.001,1,5) }
gbm = GradientBoostingClassifier(random_state=24)
gcv = GridSearchCV(gbm, param_grid=params, cv=kfold, scoring='roc_auc',verbose=3)
gcv.fit(X, y)

Fitting 5 folds for each of 30 candidates, totalling 150 fits
[CV 1/5] END learning_rate=0.001, max_depth=2, n_estimators=10;, score=0.952 total time=   0.0s
[CV 2/5] END learning_rate=0.001, max_depth=2, n_estimators=10;, score=0.940 total time=   0.0s
[CV 3/5] END learning_rate=0.001, max_depth=2, n_estimators=10;, score=0.961 total time=   0.0s
[CV 4/5] END learning_rate=0.001, max_depth=2, n_estimators=10;, score=0.952 total time=   0.0s
[CV 5/5] END learning_rate=0.001, max_depth=2, n_estimators=10;, score=0.936 total time=   0.0s
[CV 1/5] END learning_rate=0.001, max_depth=2, n_estimators=50;, score=0.973 total time=   0.0s
[CV 2/5] END learning_rate=0.001, max_depth=2, n_estimators=50;, score=0.940 total time=   0.0s
[CV 3/5] END learning_rate=0.001, max_depth=2, n_estimators=50;, score=0.962 total time=   0.0s
[CV 4/5] END learning_rate=0.001, max_depth=2, n_estimators=50;, score=0.952 total time=   0.0s
[CV 5/5] END learning_rate=0.001, max_depth=2, n_estimators=50;, score=0.9

In [14]:
print(gcv.best_params_)
print(gcv.best_score_)

{'learning_rate': 0.5005, 'max_depth': 2, 'n_estimators': 10}
0.990956792117229


### `xgboost` Boosting

In [16]:
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder

In [17]:
X = cancer.drop('Class', axis=1)
y = cancer['Class']
le = LabelEncoder()
y = le.fit_transform(y)
dict(zip(list(np.unique(y)),le.classes_))

{0: 'Benign', 1: 'Malignant'}

In [18]:
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=24)
xgbm = XGBClassifier(random_state=24)
params = {'n_estimators':[10,50], 'max_depth':[2,3,4], 'learning_rate':np.linspace(0.001,1,5) }
gcv = GridSearchCV(xgbm, param_grid=params, cv=kfold, scoring='roc_auc',verbose=3)
gcv.fit(X, y)

Fitting 5 folds for each of 30 candidates, totalling 150 fits
[CV 1/5] END learning_rate=0.001, max_depth=2, n_estimators=10;, score=0.961 total time=   0.9s
[CV 2/5] END learning_rate=0.001, max_depth=2, n_estimators=10;, score=0.941 total time=   0.0s
[CV 3/5] END learning_rate=0.001, max_depth=2, n_estimators=10;, score=0.954 total time=   0.0s
[CV 4/5] END learning_rate=0.001, max_depth=2, n_estimators=10;, score=0.952 total time=   0.0s
[CV 5/5] END learning_rate=0.001, max_depth=2, n_estimators=10;, score=0.937 total time=   0.0s
[CV 1/5] END learning_rate=0.001, max_depth=2, n_estimators=50;, score=0.961 total time=   0.0s
[CV 2/5] END learning_rate=0.001, max_depth=2, n_estimators=50;, score=0.943 total time=   0.0s
[CV 3/5] END learning_rate=0.001, max_depth=2, n_estimators=50;, score=0.962 total time=   0.0s
[CV 4/5] END learning_rate=0.001, max_depth=2, n_estimators=50;, score=0.952 total time=   0.0s
[CV 5/5] END learning_rate=0.001, max_depth=2, n_estimators=50;, score=0.9

In [19]:
print(gcv.best_params_)
print(gcv.best_score_)

{'learning_rate': 0.25075, 'max_depth': 4, 'n_estimators': 10}
0.9920563686535292


### `lightgbm` Boosting

In [21]:
from lightgbm import LGBMClassifier

In [22]:
lgbm = LGBMClassifier(random_state=24)
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=24)
params = {'n_estimators':[10,50], 'max_depth':[2,3,4], 'learning_rate':np.linspace(0.001,1,5) }
gcv = GridSearchCV(lgbm, param_grid=params, cv=kfold, scoring='roc_auc',verbose=3)
gcv.fit(X, y)

Fitting 5 folds for each of 30 candidates, totalling 150 fits
[LightGBM] [Info] Number of positive: 193, number of negative: 366
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000037 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 96
[LightGBM] [Info] Number of data points in the train set: 559, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.345259 -> initscore=-0.639943
[LightGBM] [Info] Start training from score -0.639943
[CV 1/5] END learning_rate=0.001, max_depth=2, n_estimators=10;, score=0.970 total time=   0.0s
[LightGBM] [Info] Number of positive: 193, number of negative: 366
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000054 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 96
[LightGBM] [Info] Number of data

In [23]:
print(gcv.best_params_)
print(gcv.best_score_)

{'learning_rate': 1.0, 'max_depth': 3, 'n_estimators': 50}
0.9916129770111581


### `catboost` Boosting

In [25]:
from catboost import CatBoostClassifier

In [26]:
cgbm = CatBoostClassifier(random_state=24)
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=24)
params = {'n_estimators':[10,50], 'max_depth':[2,3,4], 'learning_rate':np.linspace(0.001,1,5) }
gcv = GridSearchCV(cgbm, param_grid=params, cv=kfold, scoring='roc_auc',verbose=3)
gcv.fit(X, y)

Fitting 5 folds for each of 30 candidates, totalling 150 fits
0:	learn: 0.6924045	total: 138ms	remaining: 1.24s
1:	learn: 0.6916960	total: 139ms	remaining: 555ms
2:	learn: 0.6909517	total: 139ms	remaining: 325ms
3:	learn: 0.6902190	total: 140ms	remaining: 209ms
4:	learn: 0.6894729	total: 140ms	remaining: 140ms
5:	learn: 0.6887781	total: 140ms	remaining: 93.5ms
6:	learn: 0.6880340	total: 141ms	remaining: 60.2ms
7:	learn: 0.6872802	total: 141ms	remaining: 35.2ms
8:	learn: 0.6865487	total: 141ms	remaining: 15.7ms
9:	learn: 0.6858238	total: 142ms	remaining: 0us
[CV 1/5] END learning_rate=0.001, max_depth=2, n_estimators=10;, score=0.995 total time=   0.1s
0:	learn: 0.6924105	total: 529us	remaining: 4.77ms
1:	learn: 0.6916905	total: 850us	remaining: 3.4ms
2:	learn: 0.6909278	total: 1.12ms	remaining: 2.61ms
3:	learn: 0.6901781	total: 1.37ms	remaining: 2.06ms
4:	learn: 0.6894156	total: 1.69ms	remaining: 1.69ms
5:	learn: 0.6887138	total: 1.98ms	remaining: 1.32ms
6:	learn: 0.6879476	total: 2.25

In [27]:
print(gcv.best_params_)
print(gcv.best_score_)

{'learning_rate': 1.0, 'max_depth': 2, 'n_estimators': 10}
0.9926941034416632
