In [29]:
# Useful starting lines
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import time
%load_ext autoreload
%autoreload 2
from proj1_helpers import *
from implementations import *

# Loading the training data
# load_csv_data and standardize are not defined in this notebook; presumably
# they come from the star imports above (proj1_helpers / implementations).
# NOTE(review): the later cells treat y as -1/+1 labels and tX as a 2-D
# feature matrix; what `ids` contains is not visible here - confirm.
y, tX, ids = load_csv_data('data/train.csv')
# tX is rebound to the standardized matrix, so the raw features are no longer
# reachable under this name after this line.
tX = standardize(tX)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [86]:
import numpy as np

class nodes:
    """One vertex of the binary tree grown by DecisionTreeClassifier.

    A freshly created node has every attribute set to None; the tree
    builder fills them in afterwards.
    """

    def __init__(self):
        # left / right     : child nodes (stay None on a leaf)
        # feature/threshold: the split rule chosen for this node
        # loss             : impurity of that split
        # layer            : depth of the node, the root being 0
        # prediction       : class label, set only on leaves
        for field in ("left", "right", "feature", "threshold",
                      "loss", "layer", "prediction"):
            setattr(self, field, None)

class DecisionTreeClassifier:
    """Binary decision tree for labels in {-1, +1}, grown greedily on Gini impurity.

    At every node a random subset of the observed feature values is tried as
    split thresholds and the one with the lowest weighted Gini impurity wins.

    Note that ``fit`` takes ``(y, tX)`` - labels first - which is the reverse
    of scikit-learn's ``fit(X, y)``.

    Parameters
    ----------
    max_layers : int
        Nodes at this depth (root = 0) are never split.
    min_samples_leaf : int
        A candidate split is rejected when either side would hold fewer
        samples than this. The number of thresholds tried per feature is
        ``2 * min_samples_leaf`` (capped at the number of rows in the node).
    split_batch_size : int
        Stored on the instance but not read anywhere in this class.
        NOTE(review): it looks like this was meant to control the number of
        candidate thresholds - confirm before wiring it in.
    min_loss_decrease : float
        A node is split only if the split's impurity is lower than the
        impurity passed down from its parent by more than this amount.
    """

    def __init__(self, max_layers = 10,
                 min_samples_leaf = 20,
                 split_batch_size = 20,
                 min_loss_decrease = 0.0):

        self.min_samples_leaf = min_samples_leaf
        self.max_layers = max_layers
        self.split_batch_size = split_batch_size
        self.min_loss_decrease = min_loss_decrease
        self.fitted_tree = None  # root node, set by fit()

    @staticmethod
    def _gini(y, N):
        """Return the Gini impurity of the ``N`` labels in ``y`` (0 when N == 0)."""

        # An empty partition has no impurity; also avoids dividing by zero.
        if N == 0:
            return 0.0

        N_1 = np.count_nonzero(y == 1)

        return 1 - (N_1/N)**2 - ((N - N_1)/N)**2

    def _loss_of_split(self, y_left, y_right, N_left, N_right):
        """Return the size-weighted mean Gini impurity of the two partitions."""

        gini_left = self._gini(y_left, N_left)
        gini_right = self._gini(y_right, N_right)

        return (N_left*gini_left + N_right*gini_right) / (N_left + N_right)

    def _best_split(self, y, tX):
        """Search every feature for the lowest-impurity threshold split.

        Returns
        -------
        tuple
            ``(feature, threshold, gini, left, right)`` where ``left`` and
            ``right`` are ``np.where`` index tuples selecting the rows with
            ``tX[:, feature] < threshold`` and ``>= threshold``.
            If no candidate leaves at least ``min_samples_leaf`` rows on both
            sides, all five entries are ``None``.
        """

        best_gini = 1
        best_feature = best_threshold = best_left = best_right = None
        N = y.shape[0]
        # Sampling without replacement cannot draw more values than exist.
        n_candidates = min(2*self.min_samples_leaf, N)

        for feature in range(tX.shape[1]):

            column = tX[:, feature]

            for threshold in np.random.choice(column, n_candidates, replace=False):

                left = np.where(column < threshold)
                right = np.where(column >= threshold)

                N_left = left[0].shape[0]
                N_right = right[0].shape[0]

                if (N_left < self.min_samples_leaf) or (N_right < self.min_samples_leaf):

                    continue

                gini_split = self._loss_of_split(y[left], y[right], N_left, N_right)

                if gini_split < best_gini:

                    best_gini = gini_split
                    best_feature = feature
                    best_threshold = threshold
                    best_left = left
                    best_right = right

        if best_feature is None:

            return None, None, None, None, None

        return best_feature, best_threshold, best_gini, best_left, best_right

    def _add_new_layer(self, y, tX, layer=0, current_loss=1):
        """Recursively build the subtree for the samples ``(y, tX)``.

        ``layer`` is the depth of the node being created and ``current_loss``
        the impurity of the split that produced it (1 for the root). Prints
        the class counts of every node and the prediction of every leaf.
        Returns the created node.
        """

        node = nodes()
        node.feature, node.threshold, node.loss, left, right = self._best_split(y, tX)
        node.layer = layer

        n_positive = np.count_nonzero(y == 1)

        print("----------------")
        print("Layer ", layer)
        print("y = -1: ", np.count_nonzero(y == -1))
        print("y = 1: ", n_positive)

        # `left is not None` must come first: without a valid split there is
        # no loss and there are no partitions to inspect.
        if ((left is not None) and
            (node.layer < self.max_layers) and
            (current_loss - node.loss > self.min_loss_decrease) and
            (left[0].shape[0] > 2*self.min_samples_leaf) and
            (right[0].shape[0] > 2*self.min_samples_leaf)):

            node.left = self._add_new_layer(y[left], tX[left], node.layer + 1, node.loss)
            node.right = self._add_new_layer(y[right], tX[right], node.layer + 1, node.loss)

        else:

            # Leaf: strict majority vote, ties go to -1.
            node.prediction = 1 if (n_positive > y.shape[0] / 2) else -1
            print("Prediction: ", node.prediction)

        return node

    def _goto_next_layer(self, node, sample):
        """Walk ``sample`` down from ``node`` and return the leaf prediction."""

        if node.prediction is not None:

            return node.prediction

        # Use the rule of the node currently being visited, not the root's.
        elif sample[node.feature] < node.threshold:

            return self._goto_next_layer(node.left, sample)

        else:

            return self._goto_next_layer(node.right, sample)

    def fit(self, y, tX):
        """Grow the tree on labels ``y`` and features ``tX`` (labels first)."""

        self.fitted_tree = self._add_new_layer(y, tX)

    def predict(self, tX):
        """Return a float array with one predicted label per row of ``tX``."""

        y_pred = np.empty(tX.shape[0])

        for index in range(tX.shape[0]):

            y_pred[index] = self._goto_next_layer(self.fitted_tree, tX[index])

        return y_pred

In [56]:
# Grow a shallow tree on the first 100000 rows ...
reg = DecisionTreeClassifier(max_layers=2, min_samples_leaf=20, split_batch_size=20)
reg.fit(y[:100000], tX[:100000])

# ... and predict the following 100000 rows as a hold-out set.
y_pred = reg.predict(tX[100000:200000])

----------------
Layer  0
y = -1:  65830
y = 1:  34170
----------------
Layer  1
y = -1:  34814
y = 1:  4477
----------------
Layer  2
y = -1:  25882
y = 1:  2087
Prediction:  -1
----------------
Layer  2
y = -1:  8932
y = 1:  2390
Prediction:  -1
----------------
Layer  1
y = -1:  31016
y = 1:  29693
Prediction:  1


In [57]:
# Hold-out accuracy, then the predicted and the true class counts (-1, +1).
print(np.mean(y_pred == y[100000:200000]))
print(np.count_nonzero(y_pred == -1), np.count_nonzero(y_pred == 1))
print(np.count_nonzero(y[100000:200000] == -1), np.count_nonzero(y[100000:200000] == 1))

0.64731
39303 60697
65666 34334


In [None]:
from sklearn.tree import DecisionTreeClassifier as DTC

# scikit-learn baseline on the same train / hold-out split.
# The min_samples_leaf argument was left unfinished in this cell, which made it
# a syntax error. 20 is the value used for the hand-written tree on this data.
# NOTE(review): confirm 20 is the intended setting.
reg = DTC(max_depth = 3, min_samples_leaf = 20)

# scikit-learn takes features first, labels second.
reg.fit(tX[:100000], y[:100000])

y_pred = reg.predict(tX[100000:200000])

# Hold-out accuracy, then the predicted and the true class counts (-1, +1).
print(sum(y_pred == y[100000:200000]) / y_pred.shape[0])
print(sum(y_pred == -1), sum(y_pred == 1))
print(sum(y[100000:200000] == -1), sum(y[100000:200000] == 1))

In [87]:
from sklearn.datasets import make_classification as mc
from sklearn.tree import DecisionTreeClassifier as DTC

# Sanity check on synthetic data. Swap in the two commented lines to run
# scikit-learn's tree instead of the hand-written one.
reg = DecisionTreeClassifier(max_layers=3, min_samples_leaf=10)
#reg = DTC(max_depth = 3, min_samples_leaf = 10)

# Note: this rebinds the notebook-level tX and y to the synthetic data.
tX, y = mc(n_samples=100000, n_features=10)
# make_classification labels the classes 0/1; the tree expects -1/+1.
y[y == 0] = -1

# First 70000 rows for training, the remainder for evaluation.
reg.fit(y[:70000], tX[:70000])
#reg.fit(tX[:70000], y[:70000])

y_pred = reg.predict(tX[70000:])

# Accuracy, then the predicted and the true class counts (-1, +1).
print(np.mean(y_pred == y[70000:]))
print(np.count_nonzero(y_pred == -1), np.count_nonzero(y_pred == 1))
print(np.count_nonzero(y[70000:] == -1), np.count_nonzero(y[70000:] == 1))

----------------
Layer  0
y = -1:  35024
y = 1:  34976
----------------
Layer  1
y = -1:  31108
y = 1:  2471
----------------
Layer  2
y = -1:  28355
y = 1:  1256
----------------
Layer  3
y = -1:  5520
y = 1:  702
Prediction:  -1
----------------
Layer  3
y = -1:  22835
y = 1:  554
Prediction:  -1
----------------
Layer  2
y = -1:  2753
y = 1:  1215
Prediction:  -1
----------------
Layer  1
y = -1:  3916
y = 1:  32505
----------------
Layer  2
y = -1:  2741
y = 1:  5274
Prediction:  1
----------------
Layer  2
y = -1:  1175
y = 1:  27231
----------------
Layer  3
y = -1:  585
y = 1:  4616
Prediction:  1
----------------
Layer  3
y = -1:  590
y = 1:  22615
Prediction:  1
0.9089666666666667
14422 15578
15009 14991
