In [3]:
import pandas as pd
import numpy as np
import sklearn as sk
import numpy as np

import matplotlib.pyplot as plt
from sklearn import preprocessing


test = pd.read_csv ('df_test.csv', sep = ',')
train = pd.read_csv('df_train.csv', sep = ',')


x_train = train.drop(columns = ['type'])
x_train = preprocessing.scale(x_train)

y_train = train['type']
x_test = test.drop(columns = ['type'])
x_test = preprocessing.scale(x_test)

y_test = test['type']

In [4]:
class Node:
    def __init__(self, gini, samples, samples_pr_class, predict_class):
        self.gini = gini
        self.samples = samples
        self.samples_pr_class = samples_pr_class
        self.predict_class = predict_class
        self.feature_i = 0
        self.split_value = 0
        self.left = None
        self.right = None

In [176]:

X = x_train[:,:]
y = y_train

class DescTree: 
    def __init__(self, max_depth = 10):
        """Takes a max_depth hyperparameter to ensure no overfitting happens"""
        self.max_depth = max_depth #max depth of tree
        self.target_val = 0 #target value that bool_check compares to
        self.best_q = None #best value to spit the dataset on
        self.best_feature = None #best feature to split the dataset in

    def bool_check(self, test_val):
        """Checks if a value is greater or equal to a target value, returns True/False"""
        return test_val >= self.target_val

    def split(self, X):
        """Splits data into left and right arrays (True/False) on a condition checked in our bool_check function """
        left_data, right_data = [], []
        for row in X:
            if self.bool_check(row) == True:
                left_data.append(row)
            else: 
                right_data.append(row)
        return left_data, right_data

    def calculate_gini(self,y):
        """ Calculates a gini_score  """
        classes, count = np.unique(y, return_counts=True)
        gini = 1 
        for p in range(len(classes)):
            pik = count[p]/np.sum(count)
            gini -= pik**2
        return gini
        
    def info_gain(self, left_data, right_data, gini):
        """Calculates an info_gain_score by calling the calculate_gini function"""
        prior = float(len(left_data)) / (len(left_data) + len(right_data))
        info = gini - (prior * self.calculate_gini(left_data)) - ((1-prior)* self.calculate_gini(right_data))
        return info

    def best_split(self, X,y):
        """Finds the best value to split on by itterating through each row in each feature, 
        saves the best info gain, best value and best feature while itterating and 
        returns the best feature + value that gives the highest info gain.  """
        best_info_gain = -10 
        best_bool_check = None
        best_feature = None
        rows, features = X.shape
        for feature in range(features):
            unique = np.unique(X[:,feature])
            for u in unique:
                impurity = self.calculate_gini(y)
                self.target_val = u
                LD, RD = self.split(X[:,feature])
                if len(LD) == 0 or len(RD) == 0:
                    continue
                info = self.info_gain(LD, RD, impurity)

                if info >= best_info_gain:
                    
                    best_info_gain, best_bool_check, best_feature = info, self.target_val, feature

        self.best_feature = best_feature
        self.best_q = best_bool_check
        return self.best_q, self.best_feature, best_info_gain

    def fit(self, X, y):
        """Fits the data, calls all our functions inside this class"""
        self.n_features = X.shape[1] #9 features
        self.n_classes = len(set(y)) #6 classes
        self.tree = self.create_tree(X,y)


    
    def create_tree(self, X, y, depth = 0):
        """ A recursive function that creates a decision tree"""
        cl , samples_pr_class = np.unique(y, return_counts=True)
        yL = list(y)
        most_class = max(set(yL), key=yL.count)

        
        node = Node(gini = self.calculate_gini(y), samples = y.size, samples_pr_class = samples_pr_class, predict_class = most_class)


        #Print statements for debugging
        # print("Node gini", node.gini)
        # print("Node.samples", node.samples)
        # print("Depth Current: ", depth)
        # print("Node samples pr class:", node.samples_pr_class)
        # print("Node.predict_class", node.predict_class)
        # print()
      
        #Making use of depth parameter
        if depth < self.max_depth:
            value, feat, info_gain = self.best_split(X,y)
            # print("info_gain", info_gain)
            # print("split on:" ,value,"in feature:", feat)
            # print() 

            
            
            if feat != None:

                index = X[:,feat] < value


                X_left, y_left = X[index], y[index]
                X_right, y_right = X[~index], y[~index]


                node.feature_i = feat
                node.split_value = value 
                
                node.left = self.create_tree(X_left, y_left, depth + 1)
                
                node.right = self.create_tree(X_right, y_right, depth +1)

        return node

    def predict(self, X):
        """Calls another function that predicts the class for each row in a test dataset """
        predictions = []
        for values in X:
            node = self.tree

            while node.left:
                if values[node.feature_i] < node.split_value:
                    # print("Node.feature_i:", node.feature_i)
                    # print("Values indexed", values[node.feature_i])
                    # print(node.split_value)
                    # print()
                    node = node.left
                    
                else:
                    node = node.right
            predictions.append(node.predict_class)
        return predictions




In [177]:
dt = DescTree(max_depth = 10)
dt.fit(X,y)


split on: -0.25350704215776937 in feature: 7
split on: -0.32483453874467405 in feature: 8
split on: -1.372053373231732 in feature: 2
split on: -0.7484262863508417 in feature: 5
split on: 3.8215647443126586 in feature: 6
split on: 1.2985120877946694 in feature: 6
split on: None in feature: None
split on: 1.5242589044304895 in feature: 6
split on: None in feature: None
split on: None in feature: None
split on: 4.007473887424511 in feature: 6
split on: None in feature: None
split on: None in feature: None
split on: 2.3741292729418118 in feature: 6
split on: 0.8270996177610443 in feature: 6
split on: None in feature: None
split on: 2.2014993543379497 in feature: 6
split on: None in feature: None
split on: None in feature: None
split on: 3.635655601200807 in feature: 6
split on: None in feature: None
split on: None in feature: None
split on: -0.8012355286553132 in feature: 5
split on: 0.6810281481731604 in feature: 6
split on: 0.3756059844894043 in feature: 6
split on: None in feature: None

In [174]:
from sklearn.metrics import accuracy_score
k = dt.predict(x_test)


In [175]:
accuracy_score(k, y_test)

0.5846153846153846

In [170]:
from sklearn.tree import DecisionTreeClassifier
clf = DecisionTreeClassifier(max_depth=10)
clf.fit(X,y)
p = clf.predict(x_test)


[2, 1, 3, 7, 2, 7, 1, 1, 2, 1, 3, 1, 2, 2, 1, 2, 7, 2, 2, 7, 5, 7, 1, 3, 2, 2, 1, 2, 1, 2, 5, 1, 1, 1, 7, 1, 7, 3, 1, 2, 3, 1, 7, 1, 5, 1, 5, 2, 2, 2, 6, 1, 1, 3, 2, 5, 2, 2, 2, 7, 1, 2, 1, 2, 2]


In [168]:
accuracy_score(p, y_test)

0.6461538461538462

In [171]:
print(k)
print(list(p))

[1, 1, 1, 7, 1, 7, 1, 1, 2, 1, 2, 2, 1, 1, 1, 1, 7, 2, 5, 7, 5, 7, 1, 2, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 7, 1, 2, 5, 1, 2, 1, 1, 7, 1, 5, 2, 5, 1, 1, 1, 1, 1, 1, 1, 1, 5, 2, 2, 2, 7, 1, 2, 1, 7, 1]
[2, 1, 3, 7, 2, 7, 1, 1, 2, 1, 3, 1, 2, 2, 1, 2, 7, 2, 2, 7, 5, 7, 1, 3, 2, 2, 1, 2, 1, 2, 5, 1, 1, 1, 7, 1, 7, 3, 1, 2, 3, 1, 7, 1, 5, 1, 5, 2, 2, 2, 6, 1, 1, 3, 2, 5, 2, 2, 2, 7, 1, 2, 1, 2, 2]
