## **Decision Tree** 
### **Author:** Hansal Shah 

# **Loading the dataset and importing the required libraries**


---


In [None]:
from sklearn.datasets import load_iris
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split 
from sklearn.metrics import accuracy_score

# **Building the Decision Tree Regressor from scratch**

---

**Defining the nodes of a tree**

In [None]:
class Node:
  """One node of the decision tree.

  A decision node carries the split description (`feature_index`,
  `threshold`, `info_gain`) together with its two children; a leaf node
  carries only `value`. Every field defaults to None, so callers set just
  the fields relevant to the kind of node they are creating.
  """

  def __init__(self, feature_index=None, threshold=None,
               left_node=None, right_node=None, info_gain=None, value=None):
    # Leaf payload: stays None on decision nodes
    self.value = value

    # Split description used when this is a decision node
    self.feature_index, self.threshold = feature_index, threshold
    self.info_gain = info_gain

    # Children reached by the two outcomes of the threshold test
    self.left_node, self.right_node = left_node, right_node

**Defining the Decision Tree Regressor**

In [None]:
class DecisionTreeRegressor():
  """Binary decision tree grown greedily by maximising information gain.

  Despite its name, the tree behaves as a classifier: splits are scored with
  entropy or the Gini index, and every leaf stores the most frequent target
  value of the samples that reached it.

  Parameters:
    min_samples_split: a node is only considered for splitting when it holds
      at least this many samples.
    max_depth: a node at depth d (root = 0) may be split while
      d <= max_depth, so the deepest leaves sit at depth max_depth + 1.
    mode: "gini" selects the Gini index; any other value selects entropy.
  """

  def __init__(self, min_samples_split=2, max_depth=2, mode="entropy"):
    #Defining the root of the tree
    self.root = None
    #Defining the required parameters of the tree
    self.min_samples_split = min_samples_split
    self.max_depth = max_depth
    self.mode = mode

  def fit(self, X, y):
    """Build the tree from features X (n_samples, n_features) and targets y.

    y may be 1-D (n_samples,) or a single column (n_samples, 1); array-likes
    such as DataFrames are accepted.
    """
    X = np.asarray(X)
    y = np.asarray(y)
    #A 1-D target is turned into a column so it can be appended to X
    if y.ndim == 1:
      y = y.reshape(-1, 1)
    dataset = np.concatenate((X, y), axis=1)
    self.root = self.build_tree(dataset)

  def predict(self, X):
    """Return a list with one predicted target value per row of X."""
    #Converting first so that DataFrames are iterated row by row
    #(iterating a DataFrame directly yields its column labels)
    return [self.sample_prediction(x, self.root) for x in np.asarray(X)]

  def sample_prediction(self, x, tree):
    """Walk from `tree` down to a leaf for the sample x and return its value."""
    node = tree
    #Identity check: only None marks a decision node
    while node.value is None:
      if x[node.feature_index] <= node.threshold:
        node = node.left_node
      else:
        node = node.right_node
    return node.value

  def build_tree(self, dataset, current_depth=0):
    """Recursively grow a subtree from `dataset` (target in the last column)."""
    #Splitting the dataset into target and non-target features
    X, y = dataset[:, :-1], dataset[:, -1]
    samples, features = np.shape(X)

    #Checking whether the leaf node condition has been reached or not
    if samples >= self.min_samples_split and current_depth <= self.max_depth:

      #Calculating the best split for the current dataset
      best_split = self.get_best_split(dataset, samples, features)
      #Checking for a positive information gain. The dict is empty when no
      #threshold separates the samples (e.g. all feature columns constant),
      #in which case the node becomes a leaf instead of raising KeyError.
      if best_split.get('info_gain', 0) > 0:
        #Creating a left and right child nodes for the current node
        left_subtree = self.build_tree(best_split['left_subtree'], current_depth+1)
        right_subtree = self.build_tree(best_split['right_subtree'], current_depth+1)
        return Node(feature_index=best_split['feature_index'],
                    threshold=best_split['threshold'],
                    left_node=left_subtree, right_node=right_subtree,
                    info_gain=best_split['info_gain'])

    leaf_value = self.calc_leaf_value(y)
    return Node(value=leaf_value)

  def calc_leaf_value(self, y):
    """Return the most frequent value in y (first one encountered on ties)."""
    y = list(y)
    return max(y, key=y.count)

  def get_best_split(self, dataset, samples, features):
    """Search every feature/threshold pair for the highest information gain.

    Returns a dict with keys 'feature_index', 'threshold', 'left_subtree',
    'right_subtree' and 'info_gain', or an empty dict when no threshold
    leaves samples on both sides.
    """
    #Defining the best split and the maximum info gain
    best_split = {}
    max_info_gain = -float("inf")
    #The parent targets do not depend on the candidate split
    y = dataset[:, -1]

    #Searching through all features and thresholds for maximum info gain
    for feature_index in range(features):
      possible_thresholds = np.unique(dataset[:, feature_index])

      for threshold in possible_thresholds:
        #Splitting the dataset at this point
        dataset_left, dataset_right = self.split(dataset, feature_index, threshold)
        if len(dataset_left) > 0 and len(dataset_right) > 0:
          #Calculating the information gain for this split
          current_info_gain = self.information_gain(
              y, dataset_left[:, -1], dataset_right[:, -1])

          #Updating the gain if required
          if current_info_gain > max_info_gain:
            best_split = {
                'feature_index': feature_index,
                'threshold': threshold,
                'left_subtree': dataset_left,
                'right_subtree': dataset_right,
                'info_gain': current_info_gain,
            }
            max_info_gain = current_info_gain

    return best_split

  def information_gain(self, parent, left_child, right_child):
    """Return parent impurity minus the size-weighted impurity of the children."""
    weight_left = len(left_child)/len(parent)
    weight_right = len(right_child)/len(parent)

    #Picking the impurity measure selected at construction time
    impurity = self.gini_index if self.mode == "gini" else self.entropy
    return (impurity(parent) -
            ((weight_left*impurity(left_child)) +
             (weight_right*impurity(right_child))))

  def gini_index(self, y):
    """Return the Gini index of y: 1 - sum(p_k**2) over the distinct values."""
    _, counts = np.unique(y, return_counts=True)
    total = len(y)
    gini = 0
    for count in counts.tolist():
      prob_label = count/total
      gini += prob_label**2
    return (1-gini)

  def entropy(self, y):
    """Return the Shannon entropy (base 2) of the values in y."""
    _, counts = np.unique(y, return_counts=True)
    total = len(y)
    entropy = 0
    for count in counts.tolist():
      prob_label = count/total
      entropy += -prob_label*np.log2(prob_label)
    return entropy

  def split(self, dataset, feature_index, threshold):
    """Partition the rows of dataset on `feature <= threshold`.

    Returns (left, right): rows with feature value <= threshold and rows with
    feature value > threshold. Both are always 2-D, even when empty.
    """
    dataset = np.asarray(dataset)
    column = dataset[:, feature_index]
    #Boolean masks replace the per-row Python loop; the two comparisons are
    #kept explicit so rows that satisfy neither (NaN) are dropped as before
    left = dataset[column <= threshold]
    right = dataset[column > threshold]
    return left, right

# **Making predictions on Iris dataset**

---



**Loading the dataset**

In [None]:
# Load the Iris dataset bundled with scikit-learn
iris = load_iris()
# Feature matrix as a DataFrame, with the dataset's feature names as column labels
data = pd.DataFrame(iris.data, columns=iris.feature_names)
# Targets wrapped in a one-column DataFrame
target = pd.DataFrame(iris.target)

In [None]:
# Display the (rows, columns) dimensions of the feature table
data.shape

(150, 4)

**Splitting it into training and testing datasets**

In [None]:
# Hold out 20% of the samples for testing; random_state=9 fixes the shuffle so the split is reproducible
X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.2, random_state=9)

In [None]:
# Preview the first rows of the training features
X_train.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
112,6.8,3.0,5.5,2.1
138,6.0,3.0,4.8,1.8
23,5.1,3.3,1.7,0.5
67,5.8,2.7,4.1,1.0
79,5.7,2.6,3.5,1.0


In [None]:
# Preview the first rows of the test features
X_test.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
135,7.7,3.0,6.1,2.3
90,5.5,2.6,4.4,1.2
145,6.7,3.0,5.2,2.3
147,6.5,3.0,5.2,2.0
60,5.0,2.0,3.5,1.0


**Using the Decision Tree Regressor made from scratch to fit the training data and to make predictions on the test data**

In [None]:
# A node is split only if it holds at least 5 samples; mode is left at its default ("entropy")
model = DecisionTreeRegressor(min_samples_split=5, max_depth=5)
# y_train is a one-column DataFrame, which matches the column-wise concatenation done in fit()
model.fit(X_train, y_train)
# X_test is converted to a NumPy array so that predict() iterates over its rows
predictions = model.predict(np.array(X_test))

In [None]:
# Fraction of test samples whose predicted label equals the true label
print(accuracy_score(y_test, predictions))

1.0
