# 1. Imports
#

In [1]:
from random import randint

from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import BaseDecisionTree
import pandas as pd
import numpy as np


# 2. Classes and functions
#

In [2]:
class Bagging:
    def __init__(
            self,
            dataset: pd.DataFrame,
            y_name: str,
            tree: BaseDecisionTree,
            size: int = 10
            ) -> None:

        self.trees: list[BaseDecisionTree] = []
        self.tree = tree
        self.df = dataset
        self.y_name = y_name
        self.size = size

        self._set_criterion()

        self._boostrap()

    def _set_criterion(self) -> None:
        if self.tree == DecisionTreeClassifier:
            self.criterion = "entropy"
        elif self.tree == DecisionTreeRegressor:
            self.criterion = "squared_error"
        else:
            msg = "Invalid Tree type! Available are: (DecisionTreeClassifier, "
            msg += f"DecisionTreeRegressor) but was given '{type(self.tree)}'."
            raise Exception(msg)

    def _single_boostrap(self) -> BaseDecisionTree:
        x = self.df.drop([self.y_name], axis=1)
        y = self.df[self.y_name]

        new_x = []
        new_y = []
        records: int = len(self.df)
        for _ in range(records):
            index = randint(0, records-1)

            new_x.append(x.iloc[index])
            new_y.append(y.iloc[index])

        new_x = pd.DataFrame(new_x, columns=list(x.columns))
        new_y = pd.DataFrame(new_y, columns=[self.y_name])

        new_tree = self.tree(criterion=self.criterion)
        new_tree.fit(new_x, new_y)

        return new_tree

    def _boostrap(self) -> None:
        for _ in range(self.size):
            new_tree = self._single_boostrap()
            self.trees.append(new_tree)

    def _predict_classifier(self, x) -> int:
        values = []
        for tree in self.trees:
            pred = tree.predict(x)
            values.append(pred)

        if sum(values) > len(values)/2:
            return 1
        return 0

    def _predict_regresor(self, x) -> float:
        values = []
        for tree in self.trees:
            pred = tree.predict(x).astype(float)[0]
            values.append(pred)
        return np.mean(np.array(values))

    def predict(self, x) -> float:
        if self.tree == DecisionTreeClassifier:
            return self._predict_classifier(x)
        elif self.tree == DecisionTreeRegressor:
            return self._predict_regresor(x)


# 3. Test on 'SAheart.data' dataset
#

### 3.1. Bagging test

In [3]:
df = pd.read_csv("data/SAheart.data")
df = df.sample(frac=1)  # shuffle

df = pd.get_dummies(df, columns=["famhist"])

train_size = int(len(df)*0.7)

df_train = df.iloc[:train_size, :]
df_test = df.iloc[train_size:, :]

x_test = df_test.drop(["chd"], axis=1)

columns = list(x_test.columns)

y_test = df_test["chd"]

print(df.shape)
print(df_train.shape)
print(df_test.shape)


(462, 12)
(323, 12)
(139, 12)


In [4]:
my_bagging = Bagging(df_train, "chd", DecisionTreeClassifier, 30)

good = 0
all_ = 0

for i in range(len(x_test)):
    x = x_test.iloc[i].values.reshape(1, -1)
    x = pd.DataFrame(x, columns=columns)
    y = y_test.iloc[i]

    y_pred = my_bagging.predict(x)

    if y_pred == y:
        good += 1
    all_ += 1

print("Bagging accuracu:", good/all_*100)


Bagging accuracu: 66.18705035971223


### 3.2. Single tree test

In [5]:
df = pd.read_csv("data/SAheart.data")
df = df.sample(frac=1)  # shuffle

df = pd.get_dummies(df, columns=["famhist"])

train_size = int(len(df)*0.7)

df_train = df.iloc[:train_size, :]
df_test = df.iloc[train_size:, :]

x_train = df_train.drop(["chd"], axis=1)
y_train = df_train["chd"]

columns = list(x_test.columns)

y_test = df_test["chd"]


In [6]:
tree = DecisionTreeClassifier(criterion="entropy")

tree.fit(x_train, y_train)

good = 0
all_ = 0

for i in range(len(df_test)):
    x = x_test.iloc[i].values.reshape(1, -1)
    x = pd.DataFrame(x, columns=columns)
    y = y_test.iloc[i]

    y_pred = my_bagging.predict(x)

    if y_pred == y:
        good += 1
    all_ += 1

print("Single tree accuracy:", good/all_*100)


Single tree accuracy: 56.83453237410072


# 3. Test on 'fitness.txt' dataset
#

Draw...

In [7]:
df = pd.read_csv("data/fitness.txt", sep="\\s+")
df = df.sample(frac=1)  # shuffle

train_size = int(len(df)*0.7)

df_train = df.iloc[:train_size, :]
df_test = df.iloc[train_size:, :]

# print(df.shape)
# print(df_train.shape)
# print(df_test.shape)

my_bagging = Bagging(df_train, "Oxygen", DecisionTreeRegressor, 30)

x_test = df_test.drop(["Oxygen"], axis=1)

columns = list(x_test.columns)

y_test = df_test["Oxygen"]

good = 0
all_ = 0

for i in range(len(x_test)):
    x = x_test.iloc[i].values.reshape(1, -1)
    x = pd.DataFrame(x, columns=columns)
    y = y_test.iloc[i]

    y_pred = my_bagging.predict(x)
    print(y_pred, y)

    if y_pred == y:
        good += 1
    all_ += 1

print(good/all_*100)


48.08366666666666 47.273
45.060333333333325 45.681
45.35176666666666 46.08
49.27763333333334 50.545
45.028666666666666 44.609
49.155 46.672
55.33139999999999 60.055
49.38743333333333 45.441
40.89769999999999 39.407
48.42846666666666 50.541
0.0
