In [8]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold

from tqdm import tqdm_notebook as tqdm

In [9]:
# code is based on
# https://github.com/eriklindernoren/ML-From-Scratch/blob/master/mlfromscratch/supervised_learning/adaboost.py
# with modification

from __future__ import division, print_function
import numpy as np
import math
from sklearn import datasets
import matplotlib.pyplot as plt
import pandas as pd

# Decision stump used as weak classifier
class DecisionStump():
    def __init__(self):
        # Determines if sample shall be classified as -1 or 1 given threshold
        self.polarity = 1
        # The index of the feature used to make classification
        self.feature_index = None
        # The threshold value that the feature should be measured against
        self.threshold = None
        # Value indicative of the classifier's accuracy
        self.alpha = None

class Adaboost():
    """Boosting method that uses a number of weak classifiers in 
    ensemble to make a strong classifier. This implementation uses decision
    stumps, which is a one level Decision Tree. 
    Parameters:
    -----------
    n_clf: int
        The number of weak classifiers that will be used. 
    """
    def __init__(self, n_clf=5):
        self.n_clf = n_clf

    def fit(self, X, y, loss='exponential'):
        assert(loss in ['exponential', 'logistic'])
        n_samples, n_features = np.shape(X)

        # Initialize weights to 1/N
        w = np.full(n_samples, (1 / n_samples))
        
        self.clfs = []
        # Iterate through classifiers
        for _ in range(self.n_clf):
            clf = DecisionStump()
            # Minimum error given for using a certain feature value threshold
            # for predicting sample label
            min_error = float('inf')
            # Iterate throught every unique feature value and see what value
            # makes the best threshold for predicting y
            for feature_i in range(n_features):
                feature_values = np.expand_dims(X[:, feature_i], axis=1)
                unique_values = np.unique(feature_values)
                # Try every unique feature value as threshold
                for threshold in unique_values:
                    p = 1
                    # Set all predictions to '1' initially
                    prediction = np.ones(np.shape(y))
                    # Label the samples whose values are below threshold as '-1'
                    prediction[X[:, feature_i] < threshold] = -1
                    # Error = sum of weights of misclassified samples
                    error = sum(w[y != prediction])
                    
                    # If the error is over 50% we flip the polarity
                    if error > 0.5:
                        error = 1 - error
                        p = -1

                    # If this threshold resulted in the smallest error we save the
                    # configuration
                    if error < min_error:
                        clf.polarity = p
                        clf.threshold = threshold
                        clf.feature_index = feature_i
                        min_error = error
            
            clf.alpha = 0.5 * math.log((1.0 - min_error) / (min_error + 1e-10))     
            self.clfs.append(clf)
            # print(clf.feature_index, clf.alpha)
            
            if loss == 'exponential':
                predictions = np.ones(np.shape(y)) # initialize
                negative_idx = (clf.polarity * X[:, clf.feature_index] < clf.polarity * clf.threshold)
                predictions[negative_idx] = -1
                w *= np.exp(-clf.alpha * y * predictions)
                # the same as: w = np.exp(-y * predictions)
            elif loss == 'logistic':
                sign, predictions = self.predict(X)
                w = 1/(1+np.exp(y * predictions))
            # Normalize to one
            w /= np.sum(w)

    def predict(self, X):
        n_samples = np.shape(X)[0]
        y_pred = np.zeros((n_samples, 1))
        # For each classifier => label the samples
        for clf in self.clfs:
            # Set all predictions to '1' initially
            predictions = np.ones(np.shape(y_pred))
            # The indexes where the sample values are below threshold
            negative_idx = (clf.polarity * X[:, clf.feature_index] < clf.polarity * clf.threshold)
            # Label those as '-1'
            predictions[negative_idx] = -1
            # Add predictions weighted by the classifiers alpha
            # (alpha indicative of classifier's proficiency)
            y_pred += clf.alpha * predictions

        # Return sign of prediction sum
        y_pred_sign = np.sign(y_pred).flatten()

        return y_pred_sign, y_pred.flatten()
    
    def score(self, y_pred, y_true):
        assert(y_pred.shape == y_true.shape)
        return np.sum(y_pred==y_true, axis=0)/y_pred.shape[0]

# Load data

In [10]:
data = pd.read_csv("abalone.data",names=['sex','length','diameter','height','whole weight','shucked weight',
                                        'viscera weight','shell weight','rings'])
data = data.assign(sex=data.sex.apply(lambda x: 1 if x=='M' else (-1 if x=='F' else 0)))
data = data.assign(rings=data.rings.apply(lambda x: 1 if x <=9 else -1))

In [11]:
train_val = data.values[:3133]
test = data.values[3133:]
train_val.shape, test.shape

((3133, 9), (1044, 9))

In [12]:
train_val_x = train_val[:,:-1]
train_val_y = train_val[:,-1]

test_x = test[:,:-1]
test_y = test[:,-1]

# Cross validation

In [13]:
kf = KFold(n_splits=5)
best_score = -1
best_T = 0

for T in tqdm([10, 20, 30, 40, 50, 100, 200, 300, 400]):
    scores = []
    
    for train_ind, val_ind in kf.split(train_val):
        
        train = train_val[train_ind]
        val = train_val[val_ind]
        
        train_x = train[:,:-1]
        train_y = train[:,-1]
        val_x = val[:,:-1]
        val_y = val[:,-1]
        
        clf = Adaboost(n_clf=T)
        clf.fit(train_x, train_y, loss='logistic')
        y_pred, y_pred_raw = clf.predict(val_x)
        
        score = clf.score(y_pred, val_y)
        scores.append(score)
        
    avg_score = sum(scores)/5
    print(f"{T} {avg_score} {[round(s, 4) for s in scores]}")
    if avg_score > best_score:
        best_score = avg_score
        best_T = T

print(f"Number of classifiers: {best_T}\nCross validation error: {best_score}")

HBox(children=(IntProgress(value=0, max=9), HTML(value='')))

10 0.7353837687451275 [0.7257, 0.748, 0.7751, 0.7284, 0.6997]
20 0.7318755063668465 [0.7257, 0.7496, 0.7544, 0.73, 0.6997]
30 0.7376263050888913 [0.7257, 0.7496, 0.7544, 0.7572, 0.7013]
40 0.7366693672898482 [0.7209, 0.7496, 0.7544, 0.7572, 0.7013]
50 0.7366693672898482 [0.7209, 0.7496, 0.7544, 0.7572, 0.7013]
100 0.7376273241919786 [0.7209, 0.7512, 0.7544, 0.7604, 0.7013]


KeyboardInterrupt: 

In [None]:
kf = KFold(n_splits=5)
best_score = -1
best_T = 0

for T in tqdm([100, 200, 300, 400]):
    scores = []
    
    for train_ind, val_ind in kf.split(train_val):
        
        train = train_val[train_ind]
        val = train_val[val_ind]
        
        train_x = train[:,:-1]
        train_y = train[:,-1]
        val_x = val[:,:-1]
        val_y = val[:,-1]
        
        clf = Adaboost(n_clf=T)
        clf.fit(train_x, train_y, loss='logistic')
        y_pred, y_pred_raw = clf.predict(val_x)
        
        score = clf.score(y_pred, val_y)
        scores.append(score)
        
    avg_score = sum(scores)/5
    print(f"{T} {avg_score} {[round(s, 4) for s in scores]}")
    if avg_score > best_score:
        best_score = avg_score
        best_T = T

print(f"Number of classifiers: {best_T}\nCross validation error: {best_score}")

HBox(children=(IntProgress(value=0, max=4), HTML(value='')))

100 0.742106179764982 [0.7234, 0.7337, 0.7692]
200 0.742106179764982 [0.7234, 0.7337, 0.7692]
300 0.742106179764982 [0.7234, 0.7337, 0.7692]


In [None]:
clf = Adaboost(n_clf=100)
clf.fit(train_val_x, train_val_y, loss='logistic')
y_pred, y_pred_raw = clf.predict(test_x)
score = clf.score(y_pred, test_y)
print(f'Test accuracy: {score}')

In [None]:
kf = KFold(n_splits=5)
best_score = -1
best_T = 0

for T in tqdm([10, 20, 30, 40, 50, 100, 200, 300, 400]):
    scores = []
    
    for train_ind, val_ind in kf.split(train_val):
        
        train = train_val[train_ind]
        val = train_val[val_ind]
        
        train_x = train[:,:-1]
        train_y = train[:,-1]
        val_x = val[:,:-1]
        val_y = val[:,-1]
        
        clf = Adaboost(n_clf=T)
        clf.fit(train_x, train_y, loss='exponential')
        y_pred, y_pred_raw = clf.predict(val_x)
        
        score = clf.score(y_pred, val_y)
        scores.append(score)
        
    avg_score = sum(scores)/5
    print(f"{T} {avg_score} {[round(s, 4) for s in scores]}")
    if avg_score > best_score:
        best_score = avg_score
        best_T = T

print(f"Number of classifiers: {best_T}\nCross validation error: {best_score}")

HBox(children=(IntProgress(value=0, max=9), HTML(value='')))

10 0.7363412160957142 [0.7257, 0.7496, 0.7767, 0.7284, 0.7013]
20 0.7395422188931521 [0.7257, 0.7496, 0.7576, 0.7284, 0.7364]
30 0.7395422188931521 [0.7257, 0.7496, 0.7576, 0.7284, 0.7364]


In [None]:
clf = Adaboost(n_clf=100)
clf.fit(train_val_x, train_val_y, loss='exponential')
y_pred, y_pred_raw = clf.predict(test_x)
score = clf.score(y_pred, test_y)
print(f'Test accuracy: {score}')