In [1]:
# Importing Necessary modules

import sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,f1_score
import pandas as pd
import numpy as np

In [2]:
# Loading the data

df=pd.read_csv('https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.csv')
df.columns=list('ABCDEFGHI')
df.head(10)

Unnamed: 0,A,B,C,D,E,F,G,H,I
0,1,85,66,29,0,26.6,0.351,31,0
1,8,183,64,0,0,23.3,0.672,32,1
2,1,89,66,23,94,28.1,0.167,21,0
3,0,137,40,35,168,43.1,2.288,33,1
4,5,116,74,0,0,25.6,0.201,30,0
5,3,78,50,32,88,31.0,0.248,26,1
6,10,115,0,0,0,35.3,0.134,29,0
7,2,197,70,45,543,30.5,0.158,53,1
8,8,125,96,0,0,0.0,0.232,54,1
9,4,110,92,0,0,37.6,0.191,30,0


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 767 entries, 0 to 766
Data columns (total 9 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   A       767 non-null    int64  
 1   B       767 non-null    int64  
 2   C       767 non-null    int64  
 3   D       767 non-null    int64  
 4   E       767 non-null    int64  
 5   F       767 non-null    float64
 6   G       767 non-null    float64
 7   H       767 non-null    int64  
 8   I       767 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


In [4]:
df.isnull().sum()

A    0
B    0
C    0
D    0
E    0
F    0
G    0
H    0
I    0
dtype: int64

Data is already cleaned, so we can directly give it to the model. We can also do preprocessing, to churn out accuracy

In [5]:
x_train,x_test,y_train,y_test=train_test_split(df.drop('I',axis=1),df["I"],test_size=0.2)

In [6]:
x_train

Unnamed: 0,A,B,C,D,E,F,G,H
531,1,86,66,52,65,41.3,0.917,29
30,3,158,76,36,245,31.6,0.851,28
463,10,115,98,0,0,24.0,1.022,34
167,4,110,66,0,0,31.9,0.471,29
41,7,106,92,18,0,22.7,0.235,48
...,...,...,...,...,...,...,...,...
584,1,93,56,11,0,22.5,0.417,22
76,5,95,72,33,0,37.7,0.370,27
591,3,132,80,0,0,34.4,0.402,44
621,6,183,94,0,0,40.8,1.461,45


In [7]:
x_test

Unnamed: 0,A,B,C,D,E,F,G,H
180,0,119,64,18,92,34.9,0.725,23
764,5,121,72,23,112,26.2,0.245,30
655,2,101,58,35,90,21.8,0.155,22
126,1,118,58,36,94,33.3,0.261,23
459,9,120,72,22,56,20.8,0.733,48
...,...,...,...,...,...,...,...,...
287,4,96,56,17,49,20.8,0.340,26
125,3,120,70,30,135,42.9,0.452,30
216,6,125,68,30,120,30.0,0.464,32
217,5,85,74,22,0,29.0,1.224,32


In [8]:
# Extracting the values

x_train=x_train.values
x_test=x_test.values
y_train=y_train.values
y_test=y_test.values

In [9]:
# Node Class for the tree

class Node:
    def __init__(self,p):
        self.feature=None
        self.threshold=None
        self.pred=p
        self.prediction=np.argmax(p)
        self.left=None
        self.right=None

In [10]:
# Creating our custom Decision tree

class DecisionTree:
    
    # training the model
    def fit(self,x,y):
        self.n_classes=len(set(y))
        self.tree=self.growtree(x,y,0)
        self.n_features=x.shape[1]
        
    # code to find the best feature among all the features    
    def split1(self, x, y):
        m = y.size
        if m <= 1:
            return None, None
        num_parent = [np.sum(y == c) for c in range(self.n_classes)]
        best_gini = 1.0 - sum((n / m) ** 2 for n in num_parent)
        best_idx, best_thr = None, None
        for idx in range(x.shape[1]):
            thresholds, classes = zip(*sorted(zip(x[:, idx], y)))
            num_left = [0] * self.n_classes
            num_right = num_parent.copy()
            for i in range(1, m):
                c = classes[i - 1]
                num_left[c] += 1
                num_right[c] -= 1
                gini_left = 1.0 - sum(
                    (num_left[x] / i) ** 2 for x in range(self.n_classes)
                )
                gini_right = 1.0 - sum(
                    (num_right[x] / (m - i)) ** 2 for x in range(self.n_classes)
                )
                gini = (i * gini_left + (m - i) * gini_right) / m
                if thresholds[i] == thresholds[i - 1]:
                    continue
                if gini < best_gini:
                    best_gini = gini
                    best_idx = idx
                    best_thr = (thresholds[i] + thresholds[i - 1]) / 2
        return best_idx, best_thr
               
    # building the entire tree    
    def growtree(self,x,y,depth):
        n=[np.sum(y==i) for i in range(self.n_classes)]
        node=Node(n)
        ind,t=self.split1(x,y)
        if(depth<200):
            if(ind is not None):
                node.feature=ind
                node.threshold=t
                a=(x[:,ind]<t)
                node.left=self.growtree(x[a],y[a],depth+1)
                node.right=self.growtree(x[~a],y[~a],depth+1)
        return node
    
    # predict function
    def predict(self,x):
        return [self.predict_helper(i) for i in x]
    
    # helper function to predict
    def predict_helper(self,i):
        cur=self.tree
        while(cur.right):
            if(i[cur.feature]>cur.threshold):
                cur=cur.right
            else:
                cur=cur.left
        return cur.prediction       

In [11]:
model=DecisionTree() # Creating the object
model.fit(x_train,y_train) # training it on our data

In [12]:
y_pred=model.predict(x_test)

In [13]:
# Accuracy of the custom coded model on the test-set

print('Accuracy is: ',accuracy_score(y_test,y_pred))

Accuracy is:  0.7207792207792207


In [14]:
# F1-Score of the custom coded model on the test-set

print('F1-Score is: ',f1_score(y_test,y_pred,average='weighted'))

F1-Score is:  0.7140992978313553
