In [110]:
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np
import pandas as pd
import math

df_spam = pd.read_csv("spambase.csv", header=None)

df_iris = pd.read_csv("iris.csv", header=None)
mapping = {'Iris-setosa': 0, 'Iris-versicolor': 1, 'Iris-virginica': 2}
df_iris[df_iris.columns[-1]] = df_iris[df_iris.columns[-1]].map(mapping)
features_iris = df_iris.iloc[:, :-1].values.T  # Transpose to match your original structure
target_iris = df_iris.iloc[:, -1].values

features_spam = df_spam.iloc[:, :-1].values.T  # Transpose to match your original structure
target_spam = df_spam.iloc[:, -1].values


#cols is [4,150] and target is [150]
#X_train, X_test, y_train, y_test = train_test_split(features.T, target, test_size=0.2, random_state=42)
#X_train, X_test = X_train.T, X_test.T

In [111]:
class node:
    def __init__(self, target, feature,threshold, feature_index) -> None:
        self.left = None
        self.right = None
        self.target = target
        self.threshold = threshold
        self.feature = feature
        self.feature_index = feature_index


    def set_values(self, num_class, min, total):
        self.values = [0 for _ in range(num_class)]
        unique, count = np.unique(self.target, return_counts=True)
        for i in range(len(unique)):
            self.values[unique[i]] = count[i]
        if len(unique) == 1 or len(self.target) <= total*(min/100):
            self.is_leaf = True
        else:
            self.is_leaf = False

class tree:
    def __init__(self, num_class, min, total_points) -> None:
        self.root = None
        self.num_class = num_class
        self.min = min
        self.total_points = total_points
        self.done = []
    
    def calc_entropy(self,y):
        unique,unique_count = np.unique(y, return_counts=True)
        sum = 0
        for i in range(len(unique)):
            p_i = unique_count[i]/len(y)
            sum += -1 * p_i * np.log2(p_i) 
        return sum

    def calc_IG(self,feature, threshold,y): 
        left = np.where(feature<=threshold)
        right = np.where(feature>threshold)
        IG = self.calc_entropy(y) - \
        (self.calc_entropy(y[left]) * (len(y[left])/len(y)) +
         self.calc_entropy(y[right]) * (len(y[right])/len(y))        
        )
        return IG

    def best_threshold(self, feature, y):
        unique = np.unique(feature)
        IG_array = np.zeros(unique.shape)
        for i, threshold in enumerate(unique):
            IG_array[i] = self.calc_IG(feature, threshold, y)

        best_i = np.argmax(IG_array)
        best_IG = IG_array[best_i]
        best_threshold = unique[best_i]
        return best_threshold, best_IG

    def best_feature(self, cols, y):
        feature_best_threshold = [0 for i in range(len(cols))]
        feature_best_IG = [0 for i in range(len(cols))]
        for i, feature in enumerate(cols):
            feature_best_threshold[i],feature_best_IG[i] = self.best_threshold(feature,y)
        best_i = np.argmax(feature_best_IG)        
        feature = cols[best_i]
        return feature, feature_best_threshold[best_i], best_i

    def find_root(self, features, y):
        feature, best_thresh, feature_index = self.best_feature(features,y)
        temp_node = node(y, feature, best_thresh, feature_index)
        temp_node.set_values(self.num_class, self.min, self.total_points)
        if self.root == None:
            self.root = temp_node 

        if temp_node.is_leaf:
            return temp_node

        left = np.where(feature<=best_thresh)
        right = np.where(feature>best_thresh)
        temp_node.left = self.find_root(features[:, left[0]], y[left])
        temp_node.right= self.find_root(features[:, right[0]], y[right])
        return temp_node

def gothrough(root, value):
    if root.is_leaf != True:
        if value[root.feature_index] <= root.threshold:
            return gothrough(root.left, value)
        else:
            return gothrough(root.right, value)
    else:
        return np.argmax(root.values)


def test_train(features, target):
    pairs = {5*x:[] for x in range(1,6)}
    for key in pairs.keys():
        kf = KFold(n_splits=10)
        values = []
        for train,test in kf.split(target):
            X_train = features[:, train]
            y_train = target[train]
            a = tree(len(X_train), key, len(y_train))
            a.find_root(X_train, y_train)

            X_test = features[:, test]
            y_test = target[test]
            predict = []
            for i in range(len(y_test)):
                x_0 = X_test[:, i]
                predict.append(gothrough(a.root, x_0))
            values.append(accuracy_score(y_test, predict))
        pairs[key] = values
    return pairs

In [113]:
pairs = test_train(features_iris, target_iris)

print("## Table for IRIS dataset:")
print("| n_min | Avg Accuracy | Standard Deviation |                               Accuracy")
print("|:-----:|:-----------:|:------------------:--------------------------------------------------------------------------|")

for i in sorted(pairs.keys()):  # Ensure the keys are sorted if order matters
    avg_accuracy = np.average(pairs[i])
    std_dev = np.std(pairs[i]).round(2)
    # Using f-string with formatting for alignment and precision
    print(f"| {i:^5} | {avg_accuracy:^13.2f} | {std_dev:^18} | {np.round(pairs[key],4)}")




## Table for IRIS dataset:
| n_min | Avg Accuracy | Standard Deviation |                               Accuracy
|:-----:|:-----------:|:------------------:--------------------------------------------------------------------------|
|   5   |     0.93      |        0.08        | [1.     1.     1.     0.9333 0.9333 0.8    1.     0.8667 0.8    1.    ]
|  10   |     0.93      |        0.08        | [1.     1.     1.     0.9333 0.9333 0.8    1.     0.8667 0.8    1.    ]
|  15   |     0.93      |        0.08        | [1.     1.     1.     0.9333 0.9333 0.8    1.     0.8667 0.8    1.    ]
|  20   |     0.93      |        0.08        | [1.     1.     1.     0.9333 0.9333 0.8    1.     0.8667 0.8    1.    ]
|  25   |     0.93      |        0.08        | [1.     1.     1.     0.9333 0.9333 0.8    1.     0.8667 0.8    1.    ]


In [114]:
pairs = test_train(features_spam, target_spam)

print("## Table for IRIS dataset:")
print("| n_min | Avg Accuracy | Standard Deviation |                               Accuracy")
print("|:-----:|:-----------:|:------------------:--------------------------------------------------------------------------|")

for i in sorted(pairs.keys()):  # Ensure the keys are sorted if order matters
    avg_accuracy = np.average(pairs[i])
    std_dev = np.std(pairs[i]).round(2)
    # Using f-string with formatting for alignment and precision
    print(f"| {i:^5} | {avg_accuracy:^13.2f} | {std_dev:^18} | {np.round(pairs[key],4)}")




## Table for IRIS dataset:
| n_min | Avg Accuracy | Standard Deviation |                               Accuracy
|:-----:|:-----------:|:------------------:--------------------------------------------------------------------------|
|   5   |     0.87      |        0.06        | [0.6941 0.8326 0.8478 0.6109 0.8739 0.7543 0.8804 0.9217 0.7826 0.637 ]
|  10   |     0.87      |        0.07        | [0.6941 0.8326 0.8478 0.6109 0.8739 0.7543 0.8804 0.9217 0.7826 0.637 ]
|  15   |     0.84      |        0.07        | [0.6941 0.8326 0.8478 0.6109 0.8739 0.7543 0.8804 0.9217 0.7826 0.637 ]
|  20   |     0.82      |        0.1         | [0.6941 0.8326 0.8478 0.6109 0.8739 0.7543 0.8804 0.9217 0.7826 0.637 ]
|  25   |     0.78      |        0.1         | [0.6941 0.8326 0.8478 0.6109 0.8739 0.7543 0.8804 0.9217 0.7826 0.637 ]
