In [1]:
import math
import numpy as np
import pandas as pd
import seaborn as sns
from collections import deque
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score

In [6]:
def entropy(data):
    """Return the Shannon entropy (base 2) of the labels in ``data``.

    ``data`` is a sized iterable of hashable labels, e.g. a list of "+" / "-"
    markers or the label column of a DataFrame. An empty input yields 0.
    """
    # Tally how often each distinct label occurs, in first-seen order.
    tally = {}
    for label in data:
        tally[label] = tally.get(label, 0) + 1

    # Accumulate -p * log2(p) over the observed labels.
    result = 0
    for occurrences in tally.values():
        share = occurrences / len(data)
        result -= share * math.log2(share)

    return result

def maxGain(df, gainState):
    """Find the feature whose split reduces label impurity the most.

    The last column of ``df`` is treated as the label; every other column is a
    candidate feature. For each feature the frame is partitioned by the
    feature's distinct values and the size-weighted impurity of the partitions
    is subtracted from the impurity of the whole label column.

    Parameters:
        df: DataFrame whose last column holds the class labels.
        gainState: "Ent" to measure impurity with entropy(), "Gini" to use
            gini_index().

    Returns:
        A ``(feature_name, gain)`` tuple for the best feature (the earliest
        column wins ties), or None when ``gainState`` is neither "Ent" nor
        "Gini".
    """
    # Select the impurity measure once; both criteria share the loop below.
    if gainState == "Ent":
        impurity = entropy
    elif gainState == "Gini":
        impurity = gini_index
    else:
        return None

    baseline = impurity(df.iloc[:, -1])

    gains = {}
    for feature in df.columns[:-1].tolist():
        weighted_impurity = 0
        for value in df[feature].unique():
            mask = df[feature] == value
            subset = df[mask].iloc[:, -1]
            # Weight each partition by the fraction of rows it holds.
            weighted_impurity += len(subset) / len(df) * impurity(subset)
        gains[feature] = baseline - weighted_impurity

    best_feature = max(gains, key=gains.get)
    return best_feature, gains[best_feature]

def gini_index(series):
    """Return the Gini impurity ``1 - sum(p_i ** 2)`` of the labels in ``series``.

    Parameters:
        series: a pandas Series of class labels, or any other iterable of
            labels (coerced to a Series so this accepts the same inputs as
            entropy()).

    Returns:
        The impurity as a float; 0.0 for an empty input, matching what
        entropy() returns when there is nothing to measure.
    """
    if not isinstance(series, pd.Series):
        # dtype=object avoids dtype inference surprises on empty/mixed input.
        series = pd.Series(list(series), dtype=object)

    # Without this guard an empty input would report 1.0 (maximal impurity),
    # because the sum of squared probabilities over no classes is 0.
    if len(series) == 0:
        return 0.0

    class_probabilities = series.value_counts() / len(series)
    return 1 - np.sum(class_probabilities ** 2)

In [7]:
class Node:
    """One node of a decision tree: an internal split or a leaf.

    Attributes:
        splitFeature: feature name tested at this node (None by default).
        children: child links (None by default); DecisionTree.build stores
            them as a list of ``[edge value, child node]`` pairs.
        id: sequential identifier taken from the class-wide counter.
        leafValue: predicted label when the node is a leaf (None by default).
        majorityValue: most frequent label seen at this node (None by default).
    """

    # Next identifier to hand out; shared by all Node instances.
    staticId = 0

    def __init__(self, splitFeature = None, children = None, leafValue = None, majorityValue = None):
        self.splitFeature, self.children = splitFeature, children

        # Claim the current counter value, then advance it for the next node.
        self.id = Node.staticId
        Node.staticId = self.id + 1

        self.leafValue, self.majorityValue = leafValue, majorityValue


In [8]:
class _TreeNotFittedError(ValueError, AttributeError):
    """Raised when a DecisionTree is used before fit() has built its root.

    It is also an AttributeError so that code which caught the error formerly
    raised by dereferencing a missing root keeps working.
    """


class DecisionTree:
    """Multiway decision tree that splits on exact feature values.

    The last column of the training frame is treated as the label. Each
    internal node splits on one feature and has one child per distinct value
    of that feature in the rows that reached the node.

    Parameters:
        maxDepth: nodes at this depth or deeper are always leaves.
        minSample: a node is only split when it holds more rows than this.
        threshold: a split is only made when the best gain exceeds this.
        gainState: impurity criterion forwarded to maxGain ("Ent" or "Gini").
    """

    def __init__(self, maxDepth = 20, minSample = 2, threshold = 0.05, gainState = "Ent"):
        self.root = None
        self.maxDepth = maxDepth
        self.minSample = minSample
        self.threshold = threshold
        self.gainState = gainState

    def fit(self, data):
        """Build the tree from ``data`` (features first, label in the last column)."""
        self.root = self.build(data, 0)

    def build(self, data, currentDepth):
        """Recursively build and return the subtree for the rows in ``data``.

        Child nodes are constructed before their parent, so node ids are
        assigned children-first.

        Raises:
            ValueError: if maxGain does not recognise ``self.gainState``.
        """
        max_occuring_value = data.iloc[:, -1].value_counts().idxmax()
        sample_count = len(data)

        if currentDepth < self.maxDepth and sample_count > self.minSample:
            best_split = maxGain(data, self.gainState)
            if best_split is None:
                # maxGain signals an unknown criterion with None; fail with a
                # clear message instead of an opaque unpacking error.
                raise ValueError(
                    f"Unsupported gainState {self.gainState!r}; expected 'Ent' or 'Gini'."
                )
            max_gain_feature, max_gain = best_split

            if max_gain > self.threshold:
                children = []
                for value in data[max_gain_feature].unique():
                    newData = data[data[max_gain_feature] == value]
                    # Values that never compare equal to themselves (e.g. NaN)
                    # select no rows; skip them.
                    if newData.shape[0] <= 0:
                        continue
                    child = self.build(newData, currentDepth + 1)
                    children.append([value, child])
                return Node(max_gain_feature, children, majorityValue = max_occuring_value)

        # Leaf node: predict the most frequent label among the rows seen here.
        return Node(leafValue=max_occuring_value)

    def _fittedRoot(self):
        """Return the root node, or raise if fit() has not been called yet."""
        if self.root is None:
            raise _TreeNotFittedError(
                "This DecisionTree is not fitted yet; call fit() before using it."
            )
        return self.root

    def printTree(self):
        """Print the whole tree, one node per line, indented by depth."""
        self.print(self._fittedRoot(), 0, None)

    def print(self, node, depth, edge):
        """Print ``node`` and its descendants; ``edge`` is the value leading to ``node``."""
        indent = '  ' * depth
        print(indent + f"ID: {node.id}, Feature: {node.splitFeature}, Edge: {edge}, Value: {node.leafValue}")
        if node.children is not None:
            for child_edge, child in node.children:
                self.print(child, depth+1, child_edge)

    def predict(self, data):
        """Predict a label for every row of the feature frame ``data``."""
        return data.apply(self.predRow, axis=1)

    def predRow(self, row):
        """Predict the label for a single row by walking the tree from the root.

        When the row's value for a split feature was not seen at that node
        during training, the node's majority label is returned.
        """
        node = self._fittedRoot()
        while node.leafValue is None:
            # Look the feature up once per node rather than once per child.
            observed = row[node.splitFeature]
            for value, child in node.children:
                if value == observed:
                    node = child
                    break
            else:
                return node.majorityValue
        return node.leafValue

In [9]:
# Toy "play tennis" training set: one record per observation, with the label
# (PlayTennis) as the last key so it becomes the last DataFrame column.
dataset = [
    dict(zip(("Outlook", "Temperature", "Humidity", "PlayTennis"), record))
    for record in (
        ("Sunny", "Hot", "High", "No"),
        ("Sunny", "Hot", "High", "No"),
        ("Overcast", "Hot", "High", "Yes"),
        ("Rainy", "Mild", "High", "Yes"),
        ("Rainy", "Cool", "Normal", "Yes"),
        ("Rainy", "Cool", "Normal", "No"),
        ("Overcast", "Cool", "Normal", "Yes"),
        ("Sunny", "Mild", "High", "No"),
        ("Sunny", "Cool", "Normal", "Yes"),
        ("Rainy", "Mild", "Normal", "Yes"),
        ("Sunny", "Mild", "Normal", "Yes"),
        ("Overcast", "Mild", "High", "Yes"),
        ("Overcast", "Hot", "Normal", "Yes"),
        ("Rainy", "Mild", "High", "No"),
    )
]

df = pd.DataFrame(dataset)

# Fit a tree with the default hyper-parameters on the toy data.
model = DecisionTree()
model.fit(df)

In [10]:
# Print the tree fitted on the toy data, one node per line, indented by depth.
model.printTree()

ID: 5, Feature: Outlook, Edge: None, Value: None
  ID: 2, Feature: Humidity, Edge: Sunny, Value: None
    ID: 0, Feature: None, Edge: High, Value: No
    ID: 1, Feature: None, Edge: Normal, Value: Yes
  ID: 3, Feature: None, Edge: Overcast, Value: Yes
  ID: 4, Feature: None, Edge: Rainy, Value: Yes


In [113]:
# Hold-out rows for the toy tennis tree; the last key is the true label.
test = [
    dict(zip(("Outlook", "Temperature", "Humidity", "PlayTennis"), record))
    for record in (
        ("Rainy", "Cool", "Normal", "Yes"),
        ("Rainy", "Cool", "Normal", "No"),
        ("Overcast", "Cool", "Normal", "Yes"),
        ("Sunny", "Mild", "High", "No"),
        ("Sunny", "Cool", "Normal", "Yes"),
    )
]

testdf = pd.DataFrame(test)
# Features are every column but the last; the last column is the label.
testdfx, testdfy = testdf.iloc[:, :-1], testdf.iloc[:, -1]
y_pred = model.predict(testdfx)

# Encode the string labels as integers so a numeric metric can be applied.
# NOTE(review): the encoder is fitted on the true test labels only — confirm
# that transform() copes with a predicted label absent from testdfy.
label_encoder = LabelEncoder()
y_true = label_encoder.fit_transform(testdfy)
y_pred = label_encoder.transform(y_pred)

# NOTE(review): with two classes this presumably equals the misclassification
# rate; confirm whether accuracy_score was the intended metric.
mean_squared_error(y_true, y_pred)

0.2

In [114]:
# Load the preprocessed dataset and preview its first rows.
# Note: this rebinds `df`, which previously held the toy tennis frame.
df = pd.read_csv("preprocessedData.csv")
df.head()

Unnamed: 0,age,job,marital,education,balance,housing,loan,day,month,duration,campaign,previous,y
0,"[30,45]",technician,single,secondary,underMean,yes,no,low,may,one,1,0,0
1,"[30,45]",admin,married,secondary,underMean,yes,yes,low,may,one,1,0,0
2,45<,services,married,unknown,aboveMean,yes,no,low,may,one,1,0,0
3,"[30,45]",unemployed,single,unknown,underMean,no,no,low,may,one,1,0,0
4,"[30,45]",admin,married,tertiary,underMean,yes,no,low,may,one,1,0,0


In [115]:
# Hold out 25% of the rows as the test split; random_state=42 seeds the split.
Train, Test = train_test_split(df, test_size=0.25, random_state=42)

In [116]:
# Entropy-criterion tree: at most 5 levels of splits, and a node is only split
# when it holds more than 100 rows. Rebinds `model` (previously the toy tree).
model = DecisionTree(maxDepth = 5, minSample=100, gainState = "Ent")
model.fit(Train)

In [117]:
# Print the structure of the entropy tree, one node per line, indented by depth.
model.printTree()

ID: 39, Feature: duration, Edge: None, Value: None
  ID: 36, Feature: month, Edge: one, Value: None
    ID: 6, Feature: None, Edge: feb, Value: 0
    ID: 9, Feature: previous, Edge: jun, Value: None
      ID: 7, Feature: None, Edge: 0, Value: 0
      ID: 8, Feature: None, Edge: 1, Value: 0
    ID: 12, Feature: previous, Edge: aug, Value: None
      ID: 10, Feature: None, Edge: 0, Value: 0
      ID: 11, Feature: None, Edge: 1, Value: 0
    ID: 15, Feature: housing, Edge: apr, Value: None
      ID: 13, Feature: None, Edge: yes, Value: 0
      ID: 14, Feature: None, Edge: no, Value: 0
    ID: 16, Feature: None, Edge: may, Value: 0
    ID: 19, Feature: day, Edge: nov, Value: None
      ID: 17, Feature: None, Edge: high, Value: 0
      ID: 18, Feature: None, Edge: low, Value: 0
    ID: 20, Feature: None, Edge: jul, Value: 0
    ID: 23, Feature: day, Edge: jan, Value: None
      ID: 21, Feature: None, Edge: high, Value: 0
      ID: 22, Feature: None, Edge: low, Value: 0
    ID: 24, Feature: 

In [118]:
# Separate the hold-out features (all but the last column) from the labels
# (last column), then predict with the entropy tree.
TestX, TestY = Test.iloc[:, :-1], Test.iloc[:,-1]
y_pred = model.predict(TestX)

In [119]:
# Accuracy and precision of the entropy tree's predictions on the hold-out split.
print(accuracy_score(TestY, y_pred), precision_score(TestY, y_pred))

0.8879745155295992 0.5241691842900302


In [120]:
# Same depth and minimum-sample settings with the Gini criterion and a gain
# threshold of 0.02 (the class default is 0.05); evaluated on the same hold-out
# split. Rebinds `y_pred` to the Gini tree's predictions.
model2 = DecisionTree(maxDepth = 5, minSample=100, gainState = "Gini", threshold = 0.02)
model2.fit(Train)
y_pred = model2.predict(TestX)
print(accuracy_score(TestY, y_pred), precision_score(TestY, y_pred))

0.8915140253074949 0.5417633410672854
