In [47]:
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import numpy as np
import pandas as pd
from collections import Counter
from sklearn import datasets
from sklearn.inspection import DecisionBoundaryDisplay
from sklearn.model_selection import train_test_split
from sklearn.ensemble import AdaBoostClassifier
import graphviz
import sys
sys.path.insert(0, 'C:/Users/grzesiek/Documents/Data Science/PycharmProjects/ML_from_scratch/adaboost_classifier')

# AdaBoost from scratch

In [17]:
from adaboost_cls import accuracy, AdaboostCls

In [18]:
data = datasets.load_breast_cancer()
X = data.data
y = data.target

y[y == 0] = -1
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=5)

In [19]:
cls = AdaboostCls(n_cls=30)
cls.fit(X_train, y_train)
y_pred = cls.predict(X_test)

In [20]:
acc = accuracy(y_test, y_pred)
print('Accuracy:', acc)

Accuracy: 0.956140350877193


# AdaBoost from sklearn

In [23]:
parameters={
    'estimator': None,
    'n_estimators': 50,
    'learning_rate': 1,
    'algorithm': 'SAMME.R', # {‘SAMME’, ‘SAMME.R’}
    'random_state': None,
    'base_estimator': None
}

clf = AdaBoostClassifier(
    n_estimators=parameters['n_estimators'], 
    learning_rate=parameters['learning_rate'],
    algorithm=parameters['algorithm'],
    random_state=parameters['random_state'],
    base_estimator=parameters['base_estimator'],
    )
clf = clf.fit(X_train, y_train)

## Attributes

In [27]:
# attributes:
atts = {att for att in dir(clf) if (att[-1]=='_') & (att[0]!='_')}
print('Attributes of AdaBoost:', atts)

Attributes of AdaBoost: {'n_classes_', 'estimator_weights_', 'base_estimator_', 'estimator_errors_', 'classes_', 'n_features_in_', 'feature_importances_', 'estimators_'}


In [28]:
# feature importance
df=pd.DataFrame(np.transpose([clf.feature_importances_]), index=data['feature_names'], columns=['feature_importance'])

filt = df['feature_importance']>0
(df[filt]).sort_values(by='feature_importance', ascending=False).style.bar("feature_importance")

Unnamed: 0,feature_importance
mean texture,0.1
area error,0.08
worst area,0.08
fractal dimension error,0.08
worst concave points,0.08
mean symmetry,0.06
mean concave points,0.06
worst smoothness,0.06
worst concavity,0.06
compactness error,0.06


In [29]:
# scalar attributes
for att, val in clf.__dict__.items():
    if att in atts-{'tree_', 'feature_importances_'}:
        print(att,'=', val)

n_features_in_ = 30
base_estimator_ = DecisionTreeClassifier(max_depth=1)
estimators_ = [DecisionTreeClassifier(max_depth=1, random_state=829105117), DecisionTreeClassifier(max_depth=1, random_state=402503606), DecisionTreeClassifier(max_depth=1, random_state=1889183905), DecisionTreeClassifier(max_depth=1, random_state=228316965), DecisionTreeClassifier(max_depth=1, random_state=635024101), DecisionTreeClassifier(max_depth=1, random_state=1251262236), DecisionTreeClassifier(max_depth=1, random_state=1603722564), DecisionTreeClassifier(max_depth=1, random_state=158813545), DecisionTreeClassifier(max_depth=1, random_state=403080719), DecisionTreeClassifier(max_depth=1, random_state=158288437), DecisionTreeClassifier(max_depth=1, random_state=135415228), DecisionTreeClassifier(max_depth=1, random_state=1611461270), DecisionTreeClassifier(max_depth=1, random_state=1683075054), DecisionTreeClassifier(max_depth=1, random_state=683384947), DecisionTreeClassifier(max_depth=1, random_state=108

## Methods

In [31]:
meths = ['decision_function', 
         'fit',
         'get_params',
         'predict',
         'predict_log_proba',
         'predict_proba',
         'score',
         'set_params',
         'staged_decision_function',
         'staged_predict',
         'staged_predict_proba',
         'staged_score' ]
meths

['decision_function',
 'fit',
 'get_params',
 'predict',
 'predict_log_proba',
 'predict_proba',
 'score',
 'set_params',
 'staged_decision_function',
 'staged_predict',
 'staged_predict_proba',
 'staged_score']

In [32]:
clf.decision_function(X_test)

array([-1.55036455,  0.41101858,  0.18727399,  0.64056842,  0.45948474,
       -1.5921498 ,  0.44506692,  1.02308231,  0.1399459 ,  0.67740464,
        0.44367462,  0.35549207, -1.55272949,  0.44000427,  0.5677652 ,
        0.88604292, -0.46261554,  0.88875922,  0.3715552 ,  0.61216621,
       -0.97971997,  0.59666867,  0.66345596,  0.51999269,  0.34694698,
        0.54472906,  0.24773741, -1.39246299,  0.10098022, -1.25439832,
       -1.36162732, -1.37771662, -0.15966279, -1.14957802,  0.62862368,
        0.68600867, -0.30729471,  0.72433571,  0.42501674, -1.3581228 ,
        0.572812  ,  0.65770335,  0.68324742, -1.22689351,  0.71984692,
        0.28612479, -1.23416629, -0.52777081,  0.6076521 , -1.42547559,
        0.42410433,  0.87019689,  0.57700591,  0.37257976,  0.27429953,
       -1.44246359, -1.19404481, -1.36162732,  0.59084669, -0.615924  ,
       -0.44729423, -0.39981545,  0.46483061,  0.62405114,  0.66867138,
        0.3050768 ,  0.32317353,  0.41503696,  0.34773586, -1.21

In [33]:
clf.get_params()

{'algorithm': 'SAMME.R',
 'base_estimator': None,
 'learning_rate': 1,
 'n_estimators': 50,
 'random_state': None}

In [36]:
clf.predict_log_proba(X_test[0:2])

array([[-0.19241265, -1.74277719],
       [-0.91962652, -0.50860793]])

In [39]:
clf.predict_proba(X_test[0:1])

array([[0.82496638, 0.17503362]])

In [42]:
list(clf.staged_predict(X_test[0:1]))

[array([-1]),
 array([-1]),
 array([-1]),
 array([-1]),
 array([-1]),
 array([-1]),
 array([-1]),
 array([-1]),
 array([-1]),
 array([-1]),
 array([-1]),
 array([-1]),
 array([-1]),
 array([-1]),
 array([-1]),
 array([-1]),
 array([-1]),
 array([-1]),
 array([-1]),
 array([-1]),
 array([-1]),
 array([-1]),
 array([-1]),
 array([-1]),
 array([-1]),
 array([-1]),
 array([-1]),
 array([-1]),
 array([-1]),
 array([-1]),
 array([-1]),
 array([-1]),
 array([-1]),
 array([-1]),
 array([-1]),
 array([-1]),
 array([-1]),
 array([-1]),
 array([-1]),
 array([-1]),
 array([-1]),
 array([-1]),
 array([-1]),
 array([-1]),
 array([-1]),
 array([-1]),
 array([-1]),
 array([-1]),
 array([-1]),
 array([-1])]

In [55]:
list(clf.staged_predict(X_train))

[array([ 1,  1,  1, -1, -1,  1,  1,  1, -1,  1, -1,  1, -1,  1, -1, -1,  1,
         1,  1,  1,  1, -1,  1,  1,  1,  1,  1,  1,  1, -1,  1,  1,  1, -1,
         1,  1,  1, -1,  1,  1,  1, -1,  1,  1,  1,  1,  1,  1,  1,  1, -1,
        -1,  1,  1,  1,  1,  1, -1,  1, -1,  1,  1,  1,  1,  1,  1,  1,  1,
         1, -1, -1, -1,  1,  1,  1,  1,  1, -1, -1,  1, -1,  1,  1,  1, -1,
        -1, -1, -1,  1,  1,  1, -1,  1,  1, -1,  1,  1,  1,  1,  1,  1,  1,
         1,  1, -1,  1,  1,  1,  1, -1, -1,  1,  1,  1, -1, -1,  1, -1,  1,
         1,  1,  1,  1,  1, -1,  1, -1, -1,  1, -1,  1,  1, -1,  1,  1,  1,
         1, -1,  1,  1,  1,  1,  1,  1,  1,  1, -1,  1,  1, -1,  1,  1,  1,
         1,  1, -1,  1,  1,  1,  1, -1, -1,  1, -1,  1,  1, -1, -1,  1,  1,
        -1, -1, -1,  1,  1,  1,  1,  1,  1, -1,  1, -1, -1,  1,  1,  1,  1,
         1,  1, -1, -1,  1,  1,  1,  1, -1, -1,  1,  1,  1, -1,  1,  1,  1,
         1,  1, -1,  1, -1,  1,  1, -1,  1, -1,  1,  1,  1,  1, -1, -1,  1,
         1, 

In [46]:
list(clf.staged_score(X_test, y_test))

[0.9122807017543859,
 0.9122807017543859,
 0.956140350877193,
 0.9649122807017544,
 0.9649122807017544,
 0.9736842105263158,
 0.9736842105263158,
 0.9649122807017544,
 0.9649122807017544,
 0.9736842105263158,
 0.9649122807017544,
 0.9649122807017544,
 0.9649122807017544,
 0.9736842105263158,
 0.9649122807017544,
 0.9649122807017544,
 0.9649122807017544,
 0.9736842105263158,
 0.9649122807017544,
 0.9649122807017544,
 0.9649122807017544,
 0.9649122807017544,
 0.9649122807017544,
 0.9649122807017544,
 0.9736842105263158,
 0.9736842105263158,
 0.9736842105263158,
 0.9736842105263158,
 0.9736842105263158,
 0.9736842105263158,
 0.9736842105263158,
 0.9736842105263158,
 0.9736842105263158,
 0.9736842105263158,
 0.9736842105263158,
 0.9736842105263158,
 0.9736842105263158,
 0.9736842105263158,
 0.9736842105263158,
 0.9736842105263158,
 0.9736842105263158,
 0.9736842105263158,
 0.9736842105263158,
 0.9736842105263158,
 0.9736842105263158,
 0.9736842105263158,
 0.9736842105263158,
 0.97368421052

## Display

How was the AdaBoost score changing during the trenning?

In [52]:
y = list(clf.staged_score(X_train, y_train))

fig = go.Figure(data=go.Scatter(y=y, mode='markers+lines'))
fig.update_layout(title='AdaBoost score during trenning')
fig.update_xaxes(title='Model iteration (added new weak learner)')
fig.update_yaxes(title='Model score')
fig.show()

How was the AdaBoost prediction changing for a specific element during trenning?

In [91]:
i = 3
y = np.array(list(clf.staged_predict(X_train[i:i+1]))).flatten()

fig = go.Figure(data=go.Scatter(y=y, mode='markers+lines'))
fig.update_layout(title=f'AdaBoost prediction for x[{i}] during trenning')
fig.update_xaxes(title='Model iteration (added new weak learner)')
fig.update_yaxes(title='Model prediction')
fig.show()