In [29]:
# Importing Necessary modules

import sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,f1_score
import pandas as pd
import numpy as np

In [30]:
# Loading the data

df=pd.read_csv('https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.csv')
df.columns=list('ABCDEFGHI')
df.head(10)

Unnamed: 0,A,B,C,D,E,F,G,H,I
0,1,85,66,29,0,26.6,0.351,31,0
1,8,183,64,0,0,23.3,0.672,32,1
2,1,89,66,23,94,28.1,0.167,21,0
3,0,137,40,35,168,43.1,2.288,33,1
4,5,116,74,0,0,25.6,0.201,30,0
5,3,78,50,32,88,31.0,0.248,26,1
6,10,115,0,0,0,35.3,0.134,29,0
7,2,197,70,45,543,30.5,0.158,53,1
8,8,125,96,0,0,0.0,0.232,54,1
9,4,110,92,0,0,37.6,0.191,30,0


Data is already cleaned, so we can directly give it to the model. We can also do preprocessing, to churn out accuracy

In [51]:
x_train,x_test,y_train,y_test=train_test_split(df.drop('I',axis=1),df["I"],test_size=0.2)

In [52]:
x_train

Unnamed: 0,A,B,C,D,E,F,G,H
668,9,154,78,30,100,30.9,0.164,45
422,2,115,64,22,0,30.8,0.421,21
488,8,194,80,0,0,26.1,0.551,67
504,10,75,82,0,0,33.3,0.263,38
91,7,81,78,40,48,46.7,0.261,42
...,...,...,...,...,...,...,...,...
348,5,0,80,32,0,41.0,0.346,37
215,5,109,62,41,129,35.8,0.514,25
643,3,103,72,30,152,27.6,0.730,27
399,4,95,64,0,0,32.0,0.161,31


In [53]:
x_test

Unnamed: 0,A,B,C,D,E,F,G,H
282,7,161,86,0,0,30.4,0.165,47
255,3,111,56,39,0,30.1,0.557,30
80,2,74,0,0,0,0.0,0.102,22
569,3,78,70,0,0,32.5,0.270,39
36,9,102,76,37,0,32.9,0.665,46
...,...,...,...,...,...,...,...,...
646,0,179,50,36,159,37.8,0.455,22
173,2,75,64,24,55,29.7,0.370,33
288,5,108,72,43,75,36.1,0.263,33
418,3,129,64,29,115,26.4,0.219,28


In [54]:
# Extracting the values

x_train=x_train.values
x_test=x_test.values
y_train=y_train.values
y_test=y_test.values

In [55]:
# Node Class for the tree

class Node:
    def __init__(self,p):
        self.feature=None
        self.threshold=None
        self.pred=p
        self.prediction=np.argmax(p)
        self.left=None
        self.right=None

In [56]:
# Creating our custom Decision tree

class DecisionTree:
    
    # training the model
    def fit(self,x,y):
        self.n_classes=len(set(y))
        self.tree=self.growtree(x,y,0)
        self.n_features=x.shape[1]
        
    # code to find the best feature among all the features    
    def split1(self, X, y):
        m = y.size
        if m <= 1:
            return None, None
        num_parent = [np.sum(y == c) for c in range(self.n_classes)]
        best_gini = 1.0 - sum((n / m) ** 2 for n in num_parent)
        best_idx, best_thr = None, None
        for idx in range(x.shape[1]):
            thresholds, classes = zip(*sorted(zip(X[:, idx], y)))
            num_left = [0] * self.n_classes
            num_right = num_parent.copy()
            for i in range(1, m):
                c = classes[i - 1]
                num_left[c] += 1
                num_right[c] -= 1
                gini_left = 1.0 - sum(
                    (num_left[x] / i) ** 2 for x in range(self.n_classes)
                )
                gini_right = 1.0 - sum(
                    (num_right[x] / (m - i)) ** 2 for x in range(self.n_classes)
                )
                gini = (i * gini_left + (m - i) * gini_right) / m
                if thresholds[i] == thresholds[i - 1]:
                    continue
                if gini < best_gini:
                    best_gini = gini
                    best_idx = idx
                    best_thr = (thresholds[i] + thresholds[i - 1]) / 2
        return best_idx, best_thr
               
    # building the entire tree    
    def growtree(self,x,y,depth):
        n=[np.sum(y==i) for i in range(self.n_classes)]
        node=Node(n)
        ind,t=self.split1(x,y)
        if(depth<200):
            if(ind is not None):
                node.feature=ind
                node.threshold=t
                a=(x[:,ind]<t)
                node.left=self.growtree(x[a],y[a],depth+1)
                node.right=self.growtree(x[~a],y[~a],depth+1)
        return node
    
    # predict function
    def predict(self,x):
        return [self.predict_helper(i) for i in x]
    
    # helper function to predict
    def predict_helper(self,i):
        cur=self.tree
        while(cur.right):
            if(i[cur.feature]>cur.threshold):
                cur=cur.right
            else:
                cur=cur.left
        return cur.prediction       

In [57]:
model=DecisionTree() # Creating the object
model.fit(x_train,y_train) # training it on our data

In [58]:
y_pred=model.predict(x_test)

In [59]:
# Accuracy of the custom coded model on the test-set

print('Accuracy is: ',accuracy_score(y_test,y_pred))

Accuracy is:  0.6558441558441559


In [60]:
# F1-Score of the custom coded model on the test-set

print('F1-Score is: ',f1_score(y_test,y_pred,average='weighted'))

F1-Score is:  0.6572507255434086
