In [16]:
import pandas as pd
import numpy as np


In [17]:
col_names=['sepal_length','sepal_width','petal_width','petal_length','type']
data=pd.read_csv("../Downloads/Iris_dataset/Iris.csv", skiprows=1, header=None, names=col_names)
data.head(20)

Unnamed: 0,sepal_length,sepal_width,petal_width,petal_length,type
1,5.1,3.5,1.4,0.2,Iris-setosa
2,4.9,3.0,1.4,0.2,Iris-setosa
3,4.7,3.2,1.3,0.2,Iris-setosa
4,4.6,3.1,1.5,0.2,Iris-setosa
5,5.0,3.6,1.4,0.2,Iris-setosa
6,5.4,3.9,1.7,0.4,Iris-setosa
7,4.6,3.4,1.4,0.3,Iris-setosa
8,5.0,3.4,1.5,0.2,Iris-setosa
9,4.4,2.9,1.4,0.2,Iris-setosa
10,4.9,3.1,1.5,0.1,Iris-setosa


<h2>Node Class</h2>

In [34]:
class Node():
    def __init__(self, feature_index=None, threshold=None, left=None, right=None, info_gain=None, value=None):

        #for decision node:
        self.feature_index=feature_index
        self.threshold=threshold
        self.left=left
        self.right=right
        self.info_gain=info_gain

        #for leaf node (majority class/label of leaf node)
        self.value=value
         

In [49]:
class DecisionTreeClassifier():
    def __init__(self, min_samples_split=2, max_depth=2):

        # initialize root of tree
        self.root=None

        # Stopping conditions (to avoid overfitting)
        self.min_samples_split=min_samples_split
        self.max_depth=max_depth

    def build_tree(self,df, curr_depth=0):

        X=df[:,:-1]
        Y=df[:,-1]
        num_of_samples, num_of_features= np.shape(X)

        # split until stopping conditions are met
        if num_of_samples>=self.min_samples_split and curr_depth<=self.max_depth:
            # find the best split
            best_split=self.get_best_split(df,num_of_samples,num_of_features)
            # check if information gain (IG) is +ve
            if best_split['info_gain']>0:
                #recur left
                left_subTree= self.build_tree(best_split['df_left'], curr_depth+1)
                #recure right
                right_subTree=self.build_tree(best_split['df_right'],curr_depth+1)
                # return decision node
                return Node(best_split['feature_index'], best_split['threshold'],left_subTree, right_subTree,best_split['info_gain'])

        # compute leaf node
        leaf_value=self.calculate_leaf_value(Y)
        # return leaf node
        return Node(value=leaf_value)

    def get_best_split(self,df,num_of_samples, num_of_features): 
        # dictionary stores best split
        best_split={}
        max_info_gain=-float("inf")

        # loop over all features
        for feature_index in range(num_of_features):
            feature_values=df[:, feature_index]
            possible_thresholds=np.unique(feature_values)
            # loop over all the feature values present in data
            for threshold in possible_thresholds:
                # get current split
                df_left,df_right= self.split(df,feature_index,threshold)
                # check if childs are not null
                if len(df_left)>0 and len(df_right)>0:
                    y,left_y,right_y=df[:,-1],df_left[:,-1],df_right[:,-1]
                    # compute Information Gain
                    curr_info_gain=self.information_gain(y,left_y,right_y,"gini")
                    # update best split if needed
                    if curr_info_gain>max_info_gain:
                        best_split['feature_index']=feature_index
                        best_split['threshold']=threshold
                        best_split['df_left']=df_left
                        best_split['df_right']=df_right
                        best_split['info_gain']=curr_info_gain
                        max_info_gain=curr_info_gain

        # return best split
        return best_split

    def split(self,df,feature_index,threshold):
        df_left=np.array([row for row in df if row[feature_index]<=threshold])
        df_right=np.array([row for row in df if row[feature_index]>threshold])
        return df_left,df_right

    def information_gain(self,parent,l_child,r_child,mode="entropy"):
        weight_l=len(l_child) / len(parent)
        weight_r=len(r_child) / len(parent)
        if mode=="gini":
            gain=self.gini_index(parent) - (weight_l*self.gini_index(l_child) + weight_r*self.gini_index(r_child))
        else:
            gain=self.entropy(parent) - (weight_l*self.entropy(l_child) + weight_r*self.entropy(r_child))
        return gain

    def entropy(self,y):
        class_labels=np.unique(y)
        entropy=0
        for cls in class_labels:
            p_cls=len(y[y==cls])/len(y)
            entropy += -p_cls * np.log2(p_cls)
        return entropy

    def gini_index(self,y):
        class_labels=np.unique(y)
        gini=0
        for cls in class_labels:
            p_cls= len(y[y==cls])/len(y)
            gini += p_cls**2
        return 1-gini

    def calculate_leaf_value(self,Y):
        Y=list(Y)
        return max(Y,key=Y.count)

    def print_tree(self,tree=None,indent=" "):
        if not tree:
            tree=self.root
        if tree.value is not None:
            print(tree.value)
        else:
            print("X"+str(tree.feature_index), "<=",tree.threshold,"?",tree.info_gain)
            print("%sleft:"%(indent),end=" ")
            self.print_tree(tree.left, indent+indent)
            print("%sright:"%(indent),end=" ")
            self.print_tree(tree.right, indent+indent)

    def fit(self,X,Y):
    # function to train the tree:
            dataset=np.c_[X,Y]
            self.root=self.build_tree(dataset)

    def predict(self,X):
        # function to predict new dataset
            predictions=[self.make_prediction(x,self.root) for x in X]
            return predictions
            
    def make_prediction(self,x,tree):
        # function to predict a single data point
        if tree.value!=None :
            return tree.value
        feature_val=x[tree.feature_index]
        if feature_val<=tree.threshold:
            return self.make_prediction(x, tree.left)
        else:
            return self.make_prediction(x, tree.right)
            

            

<h2>Train-Test Split</h2>

In [45]:
X= data.iloc[:,:-1].values
Y=data.iloc[:,-1].values.reshape(-1,1)

from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test=train_test_split(X,Y, test_size=0.2, random_state=41)

<h2>Fit the model</h2>

In [57]:
classifier=DecisionTreeClassifier(min_samples_split=3, max_depth=3)
classifier.fit(X_train,Y_train)
classifier.print_tree()

X2 <= 1.9 ? 0.33741385372714494
 left: Iris-setosa
 right: X3 <= 1.5 ? 0.427106638180289
  left: X2 <= 4.9 ? 0.05124653739612173
    left: Iris-versicolor
    right: Iris-virginica
  right: X2 <= 5.0 ? 0.019631171921475288
    left: X1 <= 2.8 ? 0.20833333333333334
        left: Iris-virginica
        right: Iris-versicolor
    right: Iris-virginica


<h2>Test the model</h2>

In [58]:
Y_pred=classifier.predict(X_test)
from sklearn.metrics import accuracy_score
accuracy_score(Y_test,Y_pred)

0.9333333333333333