# 1. Data Preparation

In [13]:
from random import sample
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
def loadDataSet():
    train = pd.read_table("horseColicTraining.txt", header=None)
    X_train, y_train = train.iloc[:, :-1], train.iloc[:, -1]
    test = pd.read_table("horseColicTest.txt", header=None)
    X_test, y_test = test.iloc[:, :-1], test.iloc[:, -1]
    X_train = np.array(X_train, dtype=np.float64)
    X_test = np.array(X_test, dtype=np.float64)
    y_train = np.array(y_train, dtype=np.int32)
    y_test = np.array(y_test, dtype=np.int32)
    return X_train, y_train, X_test, y_test

# 2. Criterion -- Gini Index

In [14]:
def calculate_weighted_counts(y, sample_weight, classes_):
    '''
    the function used to calculate the summation of weights of samples from each class. Generally speaking,
    the weights are all set as one. But for Adaboost, each sample has different values.
    '''
    class_counts = np.zeros(shape=classes_.shape[0], dtype=np.float64)
    for i, label in enumerate(classes_):
        idx = y == label
        if idx.sum() > 0:
            class_counts[i] = sample_weight[idx].sum()
        else:
            class_counts[i] = 0
    return class_counts

def gini(y, sample_weight):
    classes_ = np.unique(y)
    class_counts = calculate_weighted_counts(y, sample_weight, classes_)
    if class_counts.sum() > 0:
        pk = class_counts / class_counts.sum()
        pk = pk[pk > 0]
        return 1 - np.sum(pk**2)
    else:
        return 0

def gini_index(X, y, feat, point, sample_weight):
    '''
    calculate the difference of gini index before and after splitting
    '''
    S = gini(y, sample_weight)
    new_S = 0
    n = sample_weight.sum()
    assert n > 0
    idx1 = X[:, feat] < point
    nv = sample_weight[idx1].sum()
    if nv > 0:
        new_S += nv / n * gini(y[idx1], sample_weight[idx1])
    idx2 = X[:, feat] >= point
    nv = sample_weight[idx2].sum()
    if nv > 0:
        new_S += nv / n * gini(y[idx2], sample_weight[idx2])
    return S - new_S

# 3. Decision Trees for Classification

Different from the classification tree implemented in the last tutorial:
1. Each internal node has two child nodes regardless of values of the splitting feature are continuous or discrete.
2. Add the parameter `max_depth` for providing another condition to stop splitting procedures.
3. Add the parameter `max_features` to use the subset of features to build decision tree.

Options 2&3 are designed for constructing trees in the random forest implemented in Section 5. 

If you'd like to build the classification tree, you can ignore options 2&3 by setting `max_depth = None` and `max_features = None`. Then we can combine it with the pre-pruning or post-pruning technique implemented in Section 4 to prevent overfitting.

In [15]:
class DecisionTreeClassifier(object):
    '''
    This class is for classification tree

    Attributes:
        - criterion: a function used as the criterion of classification tree
        - tree: a nested dictionary representing the decision tree structure.
        - max_depth: the parameter to control the depth of tree. If the depth is larger than max_depth, we will stop splitting.
        - max_feature: the number of selected features to build decision tree
    '''
    def __init__(self,
                 criterion=gini_index,
                 max_depth=None,
                 max_features=None,
                 random_seed=None):
        self.f_criterion = criterion
        self.max_depth = max_depth
        if self.max_depth is None:
            self.max_depth = 2**10
        self.max_features = max_features
        self.random_seed = random_seed

    def fit(self, X, y, sample_weight=None):
        np.random.seed(self.random_seed)
        num_samples, num_features = X.shape
        if self.max_features is None:
            self.max_features = num_features
        elif self.max_features == "sqrt":
            self.max_features = np.int(np.round(np.sqrt(num_features)))
        self.classes_ = np.unique(y)
        if sample_weight is None:
            sample_weight = np.ones(num_samples, dtype=np.float64)
        # build the decision tree
        self.tree = self.create_tree(X, y, sample_weight, depth=0)

    def create_tree(self, X, y, sample_weight, depth):
        Tree = {}
        Tree["depth"] = depth
        class_counts = calculate_weighted_counts(y, sample_weight, self.classes_)
        # create a leaf node if all samples belong to the same class
        if (class_counts != 0).sum() == 1:
            Tree["is_leaf"] = True
            Tree["pred"] = self.classes_[class_counts != 0]
        # using the majority vote to get the prediction at each node
        majority_class = self.classes_[np.argmax(class_counts)]
        Tree["pred"] = majority_class
        # create a leaf node if feature set is empty
        feat, point = self.choose_best_split(X, y, sample_weight)
        if feat is None or depth == self.max_depth:
            Tree["is_leaf"] = True
            return Tree
        # otherwise, create an internal node
        Tree["is_leaf"] = False
        Tree["split_feat"] = feat
        Tree["split_point"] = point
        # build the left subtree
        idx = X[:, feat] < point
        Tree["left"] = self.create_tree(X[idx], y[idx], sample_weight[idx],
                                        depth + 1)
        # build the right subtree
        idx = X[:, feat] >= point
        Tree["right"] = self.create_tree(X[idx], y[idx], sample_weight[idx],
                                         depth + 1)
        return Tree

    def choose_best_split(self, X, y, sample_weight):
        # initialization
        best_feat, best_point = None, None
        best_score = 0.0
        # search for each candidate feature
        num_features = X.shape[1]
        if self.max_features < num_features:
            candidate_feat = np.random.permutation(
                num_features)[:self.max_features]
        else:
            candidate_feat = np.arange(num_features)
        for feat in candidate_feat:
            # if all values of this feature are equal, do not split this feature
            X_feat_value = np.unique(X[:, feat])
            if len(X_feat_value) == 1:
                continue
            # search for each possible split point
            for i in range(len(X_feat_value) - 1):
                # divide the dataset into two parts according to the split
                point = (X_feat_value[i] + X_feat_value[i + 1]) / 2.0
                # calculate score to evaluate the quality of a split
                score = self.f_criterion(X, y, feat, point, sample_weight)
                if score > best_score:
                    best_feat = feat
                    best_point = point
                    best_score = score
        return best_feat, best_point

    def predict(self, X):
        '''
        function used to fit the decision tree classifier

        Args:
            X - features of test samples, a pandas dataframe with shape (n, d)

        Returns:
            y - predictions of test samples, a pandas series with shape (n,)
        '''
        n = X.shape[0]
        y = []
        for i in range(n):
            y.append(DecisionTreeClassifier.predict_each(X[i], self.tree))
        y = np.array(y, dtype=np.int32)
        return y

    @staticmethod
    def predict_each(x, tree):
        '''
        for each sample, get the prediction of decision tree classifier in a recursive manner.

        Args:
            x - features of a sample, a pandas Series with shape (d,)
            tree - a nested dictionary representing the decision tree structure.

        Returns:
            the prediction of the sample `x`
        '''
        if tree["is_leaf"] is True:
            # if the `tree` is a leaf node, get the prediction at the leaf node
            return tree["pred"]
        else:
            # the 'tree' is a nested dictionary
            # get the value of the feature used to split
            feat = tree["split_feat"]
            point = tree["split_point"]
            # get the value of the feature for the sample `x`
            value = x[feat]
            if value < point:
                return DecisionTreeClassifier.predict_each(x, tree["left"])
            else:
                return DecisionTreeClassifier.predict_each(x, tree["right"])

In [16]:
X_train, y_train, X_test, y_test = loadDataSet()
model = DecisionTreeClassifier(criterion=gini_index,
                                   max_depth=None,
                                   max_features=None,
                                   random_seed=None)
model.fit(X_train, y_train)
y_train_hat = model.predict(X_train)
y_test_hat = model.predict(X_test)
acc_train = (y_train == y_train_hat).mean()
acc_test = (y_test == y_test_hat).mean()
print("The accuracy of training data is:", acc_train)
print("The accuracy of test data is:", acc_test)

The accuracy of training data is: 0.9966555183946488
The accuracy of test data is: 0.6268656716417911


You maybe find that the accuracy of training data is almost one but the accuracy of test data is low. The reason is that the decision tree overfits to the training data. To prevent overfitting, we introduce the post-pruning technique in the next section.

# 4. Classification Tree with the Pruning Technique

In [17]:
def pruning(tree, classes_, X_valid, y_valid):
    '''
    the function used to post-prune the decision tee

    Args:
        tree - a nested dictionary representing the decision tree structure.
        classes_ - names of all classes 
        X_valid - the features of the validation samples
        y_valid - the labels of the validation samples
    Returns:
        the tree structure after pruning
   '''
    if X_valid.shape[0] == 0:
        new_tree = {}
        new_tree["is_leaf"] = True
        new_tree["pred"] = tree["pred"]
        return new_tree
    if tree["is_leaf"] is True:
        return tree
    feat = tree["split_feat"]
    point = tree["split_point"]
    idx1 = X_valid[:, feat] < point
    tree["left"] = pruning(tree["left"], classes_, X_valid[idx1],
                           y_valid[idx1])
    idx2 = X_valid[:, feat] >= point
    tree["right"] = pruning(tree["right"], classes_, X_valid[idx2],
                            y_valid[idx2])
    if tree["left"]["is_leaf"] is True and tree["right"]["is_leaf"] is True:
        FLAG = True
    else:
        FLAG = False
    if FLAG:
        # check validation accuracy gap
        valid_y_true = []
        valid_y_pred = []
        # make prediction and calculate validation accuracy of the tree before merging
        child_majority_class = tree["left"]["pred"]
        idx1 = X_valid[:, feat] < point
        if idx1.sum() > 0:
            valid_y_true.append(y_valid[idx1])
            valid_y_pred.append([child_majority_class] * idx1.sum())
        child_majority_class = tree["right"]["pred"]
        idx2 = X_valid[:, feat] >= point
        if idx2.sum() > 0:
            valid_y_true.append(y_valid[idx2])
            valid_y_pred.append([child_majority_class] * idx2.sum())
        valid_y_true = np.concatenate(valid_y_true)
        valid_y_pred = np.concatenate(valid_y_pred)
        valid_acc_before = np.mean(valid_y_true == valid_y_pred)
        # make prediction and calculate validation accuracy of the tree after merging
        majority_class = tree["pred"]
        valid_y_pred = np.array([majority_class] * X_valid.shape[0])
        valid_acc_after = np.mean(valid_y_true == valid_y_pred)
        # if the validation accuracy after merging is larger, we will prune
        if valid_acc_after > valid_acc_before:
            new_tree = {}
            new_tree["is_leaf"] = True
            new_tree["pred"] = tree["pred"]
            return new_tree
        else:
            return tree
    else:
        return tree

In [18]:
X_train, y_train, X_test, y_test = loadDataSet()
X_train, X_valid, y_train, y_valid = train_test_split(X_train,
                                                      y_train,
                                                      test_size=0.3,
                                                      stratify=y_train,
                                                      random_state=3147)
model = DecisionTreeClassifier(criterion=gini_index,
                                   max_depth=None,
                                   max_features=None,
                                   random_seed=None)
model.fit(X_train, y_train)
# without pruning
y_train_hat = model.predict(X_train)
y_valid_hat = model.predict(X_valid)
y_test_hat = model.predict(X_test)
acc_train = (y_train == y_train_hat).mean()
acc_valid = (y_valid == y_valid_hat).mean()
acc_test = (y_test == y_test_hat).mean()
print("Training accuracy of the classification tree Without pruning is:", acc_train)
print("Validation accuracy of the classification tree Without pruning is:", acc_valid)
print("Testing accuracy of the classification tree Without pruning is:", acc_test, '\n')
# with pruning
model.tree = pruning(model.tree, model.classes_, X_valid, y_valid)
y_train_hat = model.predict(X_train)
y_valid_hat = model.predict(X_valid)
y_test_hat = model.predict(X_test)
acc_train = (y_train == y_train_hat).mean()
acc_valid = (y_valid == y_valid_hat).mean()
acc_test = (y_test == y_test_hat).mean()
print("Training accuracy of the classification tree With pruning is:", acc_train)
print("Validation accuracy of the classification tree With pruning is:", acc_valid)
print("Testing accuracy of the classification tree With pruning is:", acc_test)

Training accuracy of the classification tree Without pruning is: 1.0
Validation accuracy of the classification tree Without pruning is: 0.6111111111111112
Testing accuracy of the classification tree Without pruning is: 0.6716417910447762 

Training accuracy of the classification tree With pruning is: 0.9521531100478469
Validation accuracy of the classification tree With pruning is: 0.6777777777777778
Testing accuracy of the classification tree With pruning is: 0.7164179104477612


# 5. Random Forest for Classification

In this section, we implement the random forest where each tree is built with the class DecisionTreeClassifier(). In our model, the values of parameters are listed below. 
1. The number of trees $T$ is set as ``num_estimators = 20``
2. the number of subsampled features for each tree is $k =\sqrt{d}$, which corresponds to ``max_features = "sqrt"`` in the code.
3. The maximum depth of each tree is ``max_depth = 6``.

We will not use the pruning technique for each tree in the random forest.

In [19]:
class RandomForestClassifier(object):
    '''
    This class is for random forest classification

    Attributes:
        - criterion: a function used as the criterion of classification tree
        - num_estimators: the number of trees in the random forest 
        - tree: a nested dictionary representing the decision tree structure
        - max_depth: the parameter to control the depth of tree. If the depth is larger than max_depth, we will stop splitting.
        - max_feature: the number of selected features to build decision tree
    '''
    def __init__(self,
                 num_estimators,
                 random_state,
                 criterion=gini_index,
                 max_depth=None,
                 max_features="sqrt"):
        self.num_estimators = num_estimators
        self.random_state = random_state
        self.criterion = criterion
        self.max_depth = max_depth
        self.max_features = max_features

    def fit(self, X, y):
        '''
        function used to fit all trees in the random forest
        
        Args:
            X - the features of the training samples
            y - the labels of the training samples
        Returns:
            self.model_list - the model list containing `num_estimators` tree models
        '''
        n, d = X.shape
        RandomState = np.random.RandomState(self.random_state)
        self.model_list = []
        for t in range(self.num_estimators):
            random_seed = RandomState.randint(0, np.iinfo(np.int32).max)
            ### draw a bootstrapped dataset from X
            sample_index = RandomState.choice(np.arange(n), size=n, replace=True)
            X_sampled = X[sample_index, :]
            y_sampled = y[sample_index]
            ### initialize the tree model by using DecisionTreeClassifier()
            model = DecisionTreeClassifier(criterion=self.criterion,
                                           max_depth=self.max_depth,
                                           max_features=self.max_features,
                                           random_seed=random_seed)
            ### fit the tree model to the bootstrapped dataset by using DecisionTreeClassifier.fit()
            model.fit(X_sampled, y_sampled)
            self.model_list.append(model)
        return self.model_list

    def predict(self, X):
        '''
        function used to predict the labels of X
        
        Args:
            X - the features of the test samples
        Returns:
            y_pred_label - the predicted labels of test samples
        '''
        n = X.shape[0]
        y_pred = np.zeros([self.num_estimators, n], dtype=np.int32)
        y_pred_label = np.zeros(n, dtype=np.int32)
        ### use T tree classifiers to make predictions by using DecisionTreeClassifier.predict()
        for i in range(self.num_estimators):
            model_i = self.model_list[i]
            y_pred[i, :] = model_i.predict(X)
        ### take the majority vote 
        for i in range(n):
            classes, count = np.unique(y_pred[:, i], return_counts=True)
            y_pred_label[i] = classes[np.argmax(count)]
        return y_pred_label

In [20]:
X_train, y_train, X_test, y_test = loadDataSet()
model = RandomForestClassifier(num_estimators=20,
                                   random_state=101,
                                   criterion=gini_index,
                                   max_depth=6)
model.fit(X_train, y_train)
y_train_hat = model.predict(X_train)
y_test_hat = model.predict(X_test)
acc_train = (y_train == y_train_hat).mean()
acc_test = (y_test == y_test_hat).mean()
print("Training accuracy of the random forest is:", acc_train)
print("Testing accuracy of the random forest is:", acc_test)

Training accuracy of the random forest is: 0.9431438127090301
Testing accuracy of the random forest is: 0.8059701492537313


# 6. Adaboost

In the Adaboost, we use the same parameter values as in random forest, that is, ``num_estimators = 20`` and ``max_depth = 6``. 

In [21]:
class Adaboost(object):
    '''
    This class is for random forest classification

    Attributes:
        - criterion: a function used as the criterion of classification tree
        - num_estimators: the number of iterations in the Adaboost
        - tree: a nested dictionary representing the decision tree structure
        - max_depth: the parameter to control the depth of tree. If the depth is larger than max_depth, we will stop splitting.
        - model weight : a vector to store the weights of each model
    '''
    def __init__(self,
                 num_estimators,
                 criterion=gini_index,
                 max_depth=None):
        self.num_estimators = num_estimators
        self.criterion = criterion
        self.max_depth = max_depth

    def fit(self, X, y):
        '''
        the function used to fit the decision tree, calculate the model $\alpha_t$ and update the sample weights in 
        each iteration.
        
        Args:
            X - the features of the training samples
            y - the labels of the training samples
        Returns:
            self.model_list - the model list containing `num_estimators` tree models
        '''
        n, d = X.shape
        sample_weight = np.ones(n) / n
        self.model_weight = np.ones(self.num_estimators)
        self.model_list = []
        for t in range(self.num_estimators):
            ### initialize the tree model by using DecisionTreeClassifier()
            model_t = DecisionTreeClassifier(criterion=self.criterion,
                                             max_depth=self.max_depth)
            ### fit the tree model to the weighted samples by using DecisionTreeClassifier.fit()
            model_t.fit(X, y, sample_weight=sample_weight)
            ### make predictions by using DecisionTreeClassifier.predict()
            y_pred_t = model_t.predict(X)
            ### add the fitted model to the "model_list"
            self.model_list.append(model_t)
            ### calculate the weighted misclassification error $\epsilon_t$
            mis_classify_flag = np.where(y != y_pred_t)[0]
            epsilon_t = np.sum(sample_weight[mis_classify_flag]) / np.sum(sample_weight)
            ### calculate the model weight $\alpha_t$
            self.model_weight[t] = 0.5 * np.log(1 / epsilon_t - 1)
            ### update the sample weight w_i
            mis_classify_vec = np.zeros(n)
            mis_classify_vec[mis_classify_flag] = 1
            sample_weight = sample_weight * np.exp(self.model_weight[t] * mis_classify_vec)


    def predict(self, X):
        '''
        function used to predict the labels of X
        
        Args:
            X - the features of the test samples
        Returns:
            y_pred_label - the predicted labels of test samples
        '''
        y_pred = np.zeros([self.num_estimators, X.shape[0]])
        for t in range(self.num_estimators):
            model = self.model_list[t]
            y_pred_temp = model.predict(X)
            y_pred_temp = y_pred_temp * self.model_weight[t]
            y_pred[t, :] = y_pred_temp
        y_pred_res = np.sum(y_pred, axis=0)
        y_pred_label = -np.ones(X.shape[0])
        y_pred_label[np.where(y_pred_res > 0)[0]] = 1
        return y_pred_label

In [22]:
X_train, y_train, X_test, y_test = loadDataSet()
y_train[np.where(y_train == 0)[0]] = -1
y_test[np.where(y_test == 0)[0]] = -1
model = Adaboost(num_estimators=20, criterion=gini_index, max_depth=6)
model.fit(X_train, y_train)
y_train_hat = model.predict(X_train)
y_test_hat = model.predict(X_test)
acc_train = (y_train == y_train_hat).mean()
acc_test = (y_test == y_test_hat).mean()
print("Training accuracy of Adaboost is:", acc_train)
print("Testing accuracy of Adaboost is:", acc_test)

Training accuracy of Adaboost is: 0.9966555183946488
Testing accuracy of Adaboost is: 0.7761194029850746
