<a href="https://colab.research.google.com/github/KhaledBadran/FairBoost/blob/initial_design/notebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.tree import DecisionTreeClassifier
import random

# from FairBoost import FairBoost

In [None]:
def function1(instance):
    return instance

def function2(instance):
    return instance * [[random.random()] for _ in range(len(instance))]

preprocessing1= lambda data:function1(data)
preprocessing2= lambda data:function2(data)
preprocessing = (preprocessing1, preprocessing2)

In [None]:
data = load_breast_cancer()
X    = data.data
y    = data.target
model = DecisionTreeClassifier(class_weight='balanced')

## declare an ensemble instance with default parameters ##

data = {'X': X, 'y': y}
ens = FairBoost(data, model, preprocessing)

## train the ensemble & view estimates for prediction error ##
ens.train_models()
ens.predict()

0.7017543859649122 0.9777777777777777 0.5714285714285714


array([0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1,
       0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1,
       1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1,
       0, 0, 0, 0])

In [59]:
## imports ##
import numpy as np
from sklearn.base import clone
from sklearn.metrics import accuracy_score, precision_score, recall_score
import scipy.spatial.distance as dist


from enum import Enum
class Bootstrap_type(Enum):
    NONE=1
    DEFAULT=2
    CUSTOM=3

## Note: FairBoost tries to replicate sklearn API
class FairBoost(object):
    def __init__(self, model, preprocessing_functions, bootstrap_type=Bootstrap_type.DEFAULT, bootstrap_size=0.63):
        self.model = model
        self.preprocessing_functions = preprocessing_functions
        self.n_elements = len(preprocessing_functions)
        self.bootstrap_size = bootstrap_size
        self.bootstrap_type = bootstrap_type

        # The trained models
        self.models = []
        # TODO: consider other distance functions
        self.dist_func = dist.cosine
        # ipdb.set_trace(context=6)
        

    # Generates all "cleaned" data sets
    # Returns an array of (X,y)
    def __preprocess_data(self, X, y):
        pp_data = []
        for ppf in self.preprocessing_functions:
            pp_data.append(ppf(X, y))
        return pp_data

    def __get_avg_dist_arr(self, data):
        # Swap the first two dimensions so we iterate over instances instead of data sets
        data = data.transpose([1, 0, 2])
        # Initializing the average distances array
        dist_arr = np.zeros(
            shape=(len(data), len(self.preprocessing_functions)))
        # Fill the avg distances array
        for i, pp_instances in enumerate(data):
            for j, pp_instance_j in enumerate(pp_instances):
                distances = []
                for k, pp_instance_k in enumerate(pp_instances):
                    d = self.dist_func( pp_instance_j, pp_instance_k)
                    d = np.abs(d)
                    distances.append(d)
                dist_arr[i, j] = np.mean(distances)
        dist_arr = dist_arr.transpose([1, 0])
        # Normalize
        n_dist_arr = []
        for arr in dist_arr:
            s = np.sum(arr)
            n = arr/s
            n_dist_arr.append(n)
        return n_dist_arr

    # Adds y to the last column of X for a list of (X,y)
    def __merge_Xy(self, datasets):
        res = []
        for dataset in datasets:
            X, y = dataset[0], np.expand_dims(dataset[1], axis=-1)
            m = np.concatenate([X, y], axis=-1)
            res.append(m)
        return np.array(res)

    # Generate the boostrap data sets
    # Returns a list of (X,y)
    def __bootstrap_datasets(self, X, y):
        datasets = self.__preprocess_data(X, y)
        datasets = self.__merge_Xy(datasets)
        # If we do the custom bootstrapping, we must define a custom PDF
        if self.bootstrap_type == Bootstrap_type.CUSTOM:
            dist_arrays = self.__get_avg_dist_arr(datasets)
        else:
            dist_arrays = [None for _ in range(len(datasets))]

        bootstrap_datasets = []
        for dataset, dist_arr in zip(datasets, dist_arrays):
            indexes = [i for i in range(len(dataset))]
            size = int(self.bootstrap_size*len(dataset))
            indexes = np.random.choice(indexes, size=size, replace=True, p=dist_arr)
            bootstrap_datasets.append((dataset[indexes,:-1], dataset[indexes,-1]))

        return bootstrap_datasets

    def fit(self, X, y):
        datasets = self.__bootstrap_datasets(X,y)
        for X_bootstrap, y_bootstrap in datasets:
            model = clone(self.model)
            model.fit(X_bootstrap, y_bootstrap)
            self.models.append(model)
        return self

    def evaluate_models(self):
        accs = np.array([])
        pres = np.array([])
        recs = np.array([])
        for i in range(len(self.models)):
            yp = self.models[i].predict(self.preprocessed_data[i]['test'][0])
            acc = accuracy_score(self.preprocessed_data[i]['test'][1], yp)
            pre = precision_score(self.preprocessed_data[i]['test'][1], yp)
            rec = recall_score(self.preprocessed_data[i]['test'][1], yp)
            # store the error metrics
            accs = np.concatenate((accs, acc.flatten()))
            pres = np.concatenate((pres, pre.flatten()))
            recs = np.concatenate((recs, rec.flatten()))
        print(accs, pres, recs)

    # def predict(self):
    #     predictions = []
    #     for i in range(len(self.models)):
    #         yp = self.models[i].predict(self.preprocessed_data[i]['test'][0])
    #         predictions.append(yp.reshape(-1, 1))
    #     ypred = np.round(np.mean(np.concatenate(
    #         predictions, axis=1), axis=1)).astype(int)
    #     acc = accuracy_score(self.preprocessed_data[0]['test'][1], ypred)
    #     pre = precision_score(self.preprocessed_data[0]['test'][1], ypred)
    #     rec = recall_score(self.preprocessed_data[0]['test'][1], ypred)
    #     print(acc, pre, rec)
    #     return (ypred)

    def predict(self, X):
        y_pred = []
        for i in range(len(self.models)):
            y_pred.append(self.models[i].predict(X))
        # Computing a soft majority voting
        y_pred = np.array(y_pred).transpose()
        y_pred = np.mean(y_pred, axis=-1).astype(int)
        return y_pred

In [3]:
!pip install ipdb
import  ipdb

Collecting ipdb
  Downloading ipdb-0.13.9.tar.gz (16 kB)
Collecting ipython>=7.17.0
  Downloading ipython-7.32.0-py3-none-any.whl (793 kB)
[K     |████████████████████████████████| 793 kB 9.0 MB/s 
[?25hCollecting toml>=0.10.2
  Downloading toml-0.10.2-py2.py3-none-any.whl (16 kB)
Collecting prompt-toolkit!=3.0.0,!=3.0.1,<3.1.0,>=2.0.0
  Downloading prompt_toolkit-3.0.28-py3-none-any.whl (380 kB)
[K     |████████████████████████████████| 380 kB 46.5 MB/s 
Building wheels for collected packages: ipdb
  Building wheel for ipdb (setup.py) ... [?25l[?25hdone
  Created wheel for ipdb: filename=ipdb-0.13.9-py3-none-any.whl size=11648 sha256=18f1bbb16e15ae28bbc9c8f9533412623e58ebb336aa9a0fd8e98f731625ec20
  Stored in directory: /root/.cache/pip/wheels/65/cd/cc/aaf92acae337a28fdd2aa4d632196a59745c8c39f76eaeed01
Successfully built ipdb
Installing collected packages: prompt-toolkit, toml, ipython, ipdb
  Attempting uninstall: prompt-toolkit
    Found existing installation: prompt-toolkit 1.

In [4]:
%pdb on

Automatic pdb calling has been turned ON


In [14]:
def function1(instance, label):
    return instance, label

def function2(instance, label):
    return instance * [[random.random()] for _ in range(len(instance))], label

preprocessing = (function1, function2)

In [71]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression

data = load_breast_cancer()
X    = data.data
y    = data.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
model = LogisticRegression()

ens = FairBoost(model, preprocessing)
ens = ens.fit(X_train,y_train)
y_pred = ens.predict(X_test)
accuracy_score(y_test, y_pred)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


0.9468085106382979

In [72]:
# Baseline
model = DecisionTreeClassifier(class_weight='balanced')
model = model.fit(X_train, y_train)
y_pred = model.predict(X_test)
accuracy_score(y_test, y_pred)

0.9042553191489362