In [160]:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from metrics import *
from tree.base import DecisionTree,Node

from calendar import c
from typing import Literal

np.random.seed(42)

In [161]:
def accuracy(y_hat: pd.Series, y: pd.Series):
    assert y_hat.size == y.size

    correct_predictions = sum(a == p for a, p in zip(y, y_hat))
    total_predictions = len(y)
    
    accuracy = correct_predictions / total_predictions
    return accuracy
    

def precision(y_hat: pd.Series, y: pd.Series, cls: Union[int, str]):
    true_positives=y_hat[y == cls].value_counts().get(cls,0)
    total_prediction=len([y_hat == cls])

    print(y_hat)
    print("true_postivi" , true_positives)

    if total_prediction == 0:
        return 0.0  # Avoid division by zero

    precision = true_positives / total_prediction
    return precision


def recall(y_hat: pd.Series, y: pd.Series, cls: Union[int, str]):
    true_positives=y_hat[y == cls].value_counts().get(cls,0)
    total_actual=len([y == cls])

    if total_actual == 0:
        return 0.0
    recall = true_positives / total_actual
    return recall


def rmse(y_hat: pd.Series, y: pd.Series):
    assert ValueError("Input lists must have the same length.")

    squared_errors = (y - y_hat) ** 2 
    mean_squared_error = squared_errors.mean()
    rmse = np.sqrt(mean_squared_error)
    return rmse


def mae(y_hat: pd.Series, y: pd.Series):
    assert ValueError("Input lists must have the same length.")

    squared_errors = (y - y_hat) ** 2 
    return squared_errors.mean()


In [162]:
def check_ifreal(y: pd.Series):
    if y.dtype.name == "category":
        return False
    return True

def entropy(Y: pd.Series):
    unique_class, class_counts = np.unique(Y, return_counts=True)
    probability = class_counts/len(Y)
    return -np.sum(probability * np.log2(probability))

def gini_index(Y: pd.Series):
    unique_class, class_counts = np.unique(Y, return_counts=True)
    probability = class_counts/len(Y)
    return 1-np.sum(probability ** 2)

def MSE(target : pd.Series,mean):
    squared_diff = (target - mean).apply(lambda x: x**2)
    mse = squared_diff.mean()
    return mse


#returning the minimum loss from the splits of the feature
def information_gain(attr: pd.DataFrame, Y: pd.Series, feature_idx):
    loss=float('inf')
    final_thresholder=0
    threshold=0
    for i in range(2,X.shape[0]):
        threshold=(X[feature_idx][i-1] + X[feature_idx][i])/2
        
        r1_x=X[X[feature_idx] < threshold]
        r1_y=Y[X[feature_idx] < threshold]

        r2_x=X[X[feature_idx] > threshold]
        r2_y=Y[X[feature_idx] > threshold]

        # print("hello")
        # print(Y[X[feature_idx] < threshold])

        new_loss=MSE(r1_y,r1_y.mean())/len(r1_x) + \
                MSE(r2_y, r2_y.mean()) /len(r2_x)

        print("loss" , new_loss)
        if new_loss < loss:
            loss=new_loss
            final_thresholder=threshold
        # break
    return  [final_thresholder,loss]


def opt_split_attribute(X: pd.DataFrame, Y: pd.Series, criterion="information_gain"):
    """
    Function to find the optimal attribute to split about.
    If needed you can split this function into 2, one for discrete and one for real valued features.
    You can also change the parameters of this function according to your implementation.

    features: pd.Series is a list of all the attributes we have to split upon

    return: attribute to split upon
    """

    # According to wheather the features are real or discrete valued and the criterion, find the attribute
    #  from the features series with the maximum information gain
    #  (entropy or varinace based on the type of output) or minimum gini index (discrete output).

    if criterion == "information_gain":
        max_info_gain=0
        best_feature=0
        threshold=0
        for column_name in X.columns:
            X.sort_values(by=column_name)
            new_info_gain=information_gain(X,Y,column_name)
            if new_info_gain[1] < max_info_gain:
                max_info_gain=new_info_gain
                best_feature=column_name
                threshold=new_info_gain[0]

    return [best_feature,threshold]


def split_data(X: pd.DataFrame, y: pd.Series, attribute, value):
    """
    Funtion to split the data according to an attribute.
    If needed you can split this function into 2, one for discrete and one for real valued features.
    You can also change the parameters of this function according to your implementation.

    attribute: attribute/feature to split upon
    value: value of that attribute to split upon

    return: splitted data(Input and output)
    """


    # Split the data based on a particular value of a particular attribute. 
    # You may use masking as a tool to split the data.

    

In [163]:
class Node:
    def __init__(self, data=None, feature_value=None,target=None,child=None,result=None,threshold=0):
        self.data = data  # data corresponding to the node [matrix]
        self.target = target  # y data
        self.children = child  # child names & objects
        self.feature = feature_value  # value at node
        self.result = result
        self.threshold=threshold

    def plot(self, level=0):
        indent = "  " * level
        print(f"{indent}|- {self.feature}: {self.result}")
        if self.children is not None:
            for child in self.children:
                self.children[child].plot(level + 1)

class DecisionTree:
    # criterion won't be used for regression
    criterion: Literal["information_gain", "gini_index"]
    max_depth: int  # The maximum depth the tree can grow to
    root = None

    def __init__(self, criterion, max_depth=5):
        self.criterion = criterion
        self.max_depth = max_depth

    # def fit_DI_DO(self, data_frame: pd.DataFrame, target: pd.Series, depth=0,criterion="information_gain"):
    #     return self.id3(data_frame,target,depth)

    def id3(self,data_frame: pd.DataFrame, target: pd.Series, depth=0,criterion="information_gain"):
            # If all instances have the same target value, create a leaf node

            #no split of data set is possible when we have 2 or lesser samples
            if target.size <= 2:
                mean_value = target.mean()
                return Node(result=mean_value)

            # If there are no more features to split on, create a leaf node with the majority class
            if len(list(data_frame.columns)) == 0:
                mean_value = target.mean()
                return Node(result=mean_value)

            # If maximum depth is reached, create a leaf node with the majority class
            if self.max_depth is not None and depth == self.max_depth:
                mean_value = target.mean()
                return Node(result=mean_value)

            # Choose the best feature to split on based on information gain considering only single level in tree
            best_attribute=opt_split_attribute(data_frame,target,criterion=criterion)
            root=Node(data=data_frame,target=target,feature_value=best_attribute[0],child={},threshold=best_attribute[1])

            print("donee ")
            
            r1_x=data_frame[data_frame[best_attribute[0]] < best_attribute[1]]
            r1_y=target[data_frame[best_attribute[0]] < best_attribute[1]]

            r2_x=data_frame[data_frame[best_attribute[0]] >= best_attribute[1]]
            r2_y=target[data_frame[best_attribute[0]] >= best_attribute[1]]

            if r1_y.size > 0 :
                root.children[0]=self.id3(r1_x, r1_x,depth=depth+1)

            if r2_y.size > 0 :
                root.children[1]=self.id3(r2_x, r2_y,depth=depth+1)

            return root

    def predict_multiway(self,tree :Node, sample):
        # Reached a leaf node which contains the result value, return the result
        if tree.result is not None:
            return tree.result

        # Not a leaf node, continue traversing the tree
        feature_value = sample[tree.feature]

        if feature_value < tree.threshold:
            child_node = tree.children[0]
        else:
            child_node = tree.children[1]
        return self.predict_multiway(child_node, sample)


    def fit(self, X: pd.DataFrame, y: pd.Series):
        # if check_ifreal(y):
    #     return self.id3(X,y,depth=0,criterion="MSE")     
    # else:
    #      return self.id3(X,y,depth=0,criterion=self.criterion)  
        return self.id3(X,y,depth=0,criterion=self.criterion)   

    def predict(self, tree,X):
        return self.predict_multiway(tree,X)

    def plot(self):
        """
        Function to plot the tree

        Output Example:
        ?(X1 > 4)
            Y: ?(X2 > 7)
                Y: Class A
                N: Class B
            N: Class C
        Where Y => Yes and N => No
        """
        if self.root is not None:
            self.root.plot()

In [164]:
N = 30
P = 5
X = pd.DataFrame(np.random.randn(N, P))
y = pd.Series(np.random.randn(N))


# for criteria in ["information_gain", "gini_index"]:
tree = DecisionTree(criterion="information_gain")  # Split based on Inf. Gain
root=tree.fit(X, y)
# y_hat = root.predict(X)
# tree.plot()
# print("Criteria :", "information_gain")
# print("RMSE: ", rmse(y_hat, y))
# print("MAE: ", mae(y_hat, y))


loss 0.12445271335215562
loss 0.12501209695213814
loss 0.14418203231899676
loss 0.10293405389244553
loss 0.12445271335215562
loss 0.08836298040712562
loss 0.12445271335215562
loss 0.1131176683324782
loss 0.1131176683324782
loss 0.1652900748199381
loss 0.11567458587902008
loss 0.14418203231899676
loss 0.1519385908516112
loss 0.1652900748199381
loss 0.11881143078107803
loss 0.12445271335215562
loss 0.1131176683324782
loss 0.09454284184488979
loss 0.07286213791449493
loss 0.12501209695213814
loss 0.08836298040712562
loss 0.08836298040712562
loss 0.1652900748199381
loss 0.0799587439622042
loss 0.1255466669200042
loss 0.1131176683324782
loss 0.1519385908516112
loss 0.1652900748199381
loss 0.17616604593053273
loss 0.1742646505443559
loss 0.1742646505443559
loss 0.1742646505443559
loss 0.14662203506784777
loss 0.20991259267443535
loss 0.12836417409296108
loss 0.12209711992657662
loss 0.14553139436703202
loss 0.1742646505443559
loss 0.1742646505443559
loss 0.11664765468416241
loss 0.1761660459

  r1_y=Y[X[feature_idx] < threshold]
  r2_y=Y[X[feature_idx] > threshold]


ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().