In [28]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

def gini_impurity (value_counts):
    n = value_counts.sum()
    p_sum = 0
    for key in value_counts.keys():
        p_sum = p_sum  +  (value_counts[key] / n ) * (value_counts[key] / n ) 
    gini = 1 - p_sum
    return gini

In [130]:
class leaf_node:
    def __init__(self, data, labels, gini):
        self.samples = self.set_samples(data)
        self.value = self.set_value(labels)
        self.gini = gini      
        print("leaf_node_created", self.gini, self.samples)
    def set_samples(self, data):
        return len(data)
    def set_value(self, labels):
        ls = labels.unique()
        ls = sorted(ls)
        values = []
        for i in ls:
            temp = len(labels[labels == i])
            values.append(temp)
        return values
class decision_node:
    def __init__(self, data, labels,gini_min = 0):

        self.samples = self.set_samples(data)
        self.value = self.set_value(labels)
        self.gini = self.set_gini(labels)
        self.type = -1
        self.colm, self.split_val = self.generate_split(data, labels)
        print("decision_node created",self.gini)
        rhs_data = data[data[self.colm] < self.split_val]

        rhs_label = labels[data[self.colm] < self.split_val]
        rhs_gini =  gini_impurity(rhs_label.value_counts())
        if (rhs_gini <= gini_min):
            self.right = leaf_node(rhs_data, rhs_label, rhs_gini)
        else:
            self.right = decision_node(rhs_data, rhs_label)
            
        lhs_data = data[data[self.colm] >= self.split_val]

        lhs_label = labels[data[self.colm] >= self.split_val]
        lhs_gini =  gini_impurity(lhs_label.value_counts())
        if (lhs_gini <= gini_min):
            self.left = leaf_node(lhs_data, lhs_label, lhs_gini)
        else:
            self.left = decision_node(lhs_data, lhs_label)
            
    def set_samples(self, data):
        return len(data)
    def set_value(self, labels):
        ls = labels.unique()
        ls = sorted(ls)
        values = []
        for i in ls:
            temp = len(labels[labels == i])
            values.append(temp)
        return values
    def set_gini(self,labels):
        return gini_impurity(labels.value_counts())
    def find_split(self, column, labels):
        if column.dtype != "float64":
            rhs = labels[column < 0.5]
            lhs = labels[column >= 0.5]
            min_gini = gini_impurity(rhs.value_counts()) + gini_impurity(lhs.value_counts())
            return 0.5, min_gini
        else:
            min_gini = 2
            split_values = np.arange(column.min(),column.max(), (column.max() - column.min()) / 50)
            tmp = -1
            for i in split_values[15:35]:
                rhs = labels[column < i]
                lhs = labels[column >= i]
                tmp_gini = gini_impurity(rhs.value_counts()) + gini_impurity(lhs.value_counts())
                if tmp_gini <= min_gini:
                    min_gini = tmp_gini
                    tmp = i
            return tmp, min_gini
            
    def generate_split(self, data, labels):
        running_min = 2
        value = None
        clm = -1
        column = self.find_split(data[15], labels)
        for i in data.columns:
            val, tmp = self.find_split(data[i], labels)
            if tmp < running_min:
                clm = i
                value = val 
                running_min = tmp
                
        return clm, value

In [127]:
#Reading the training data
df_train = pd.read_csv("HW1_data/ann-train.data",sep=' ',header=None)

df_train.drop(columns=df_train.columns[-2:],inplace=True)
df_train_data, df_train_labels = df_train.drop(columns=df_train.columns[-1:]), df_train[21]
#Reading the test data
df_test = pd.read_csv("HW1_data/ann-test.data",sep=' ',header=None)

df_test.drop(columns=df_test.columns[-2:],inplace=True)
df_test_data, df_test_labels = df_test.drop(columns=df_test.columns[-1:]), df_test[21]

#Normalizing datasets
df_train_data_n = df_train_data.copy()
df_test_data_n = df_test_data.copy()
for column in df_train_data.columns:
    if df_train_data[column].dtype == 'float64':
        mean = df_train_data[column].mean()
        std = df_train_data[column].std()
        
        df_train_data_n[column] = (df_train_data_n[column] - mean) / std
        df_test_data_n[column] = (df_test_data_n[column] - mean) / std

In [128]:
X_train, X_val, y_train, y_val = train_test_split(df_train_data, df_train_labels, test_size=0.4, random_state=55)
X_train_n, X_val_n, y_train_n, y_val_n = train_test_split(df_train_data_n, df_train_labels, test_size=0.4, random_state=55)

In [131]:
dn = decision_node(X_train, y_train)
dn.colm

decision_node created 0.14144973540220984
decision_node created 0.13401237074244732
decision_node created 0.1340683158289928
decision_node created 0.13418034567901238
decision_node created 0.13451755682016764
decision_node created 0.13525400307512347
decision_node created 0.1364028097659804
decision_node created 0.1378664360828996
decision_node created 0.13936167228524843
decision_node created 0.1409513571620532
decision_node created 0.14568000111118407
decision_node created 0.14771859999351722
leaf_node_created 0.0 11
decision_node created 0.14847386414614128
decision_node created 0.16267884814049594
decision_node created 0.1456650774213364
decision_node created 0.15013756782767462
decision_node created 0.15511958161025075
decision_node created 0.16174669724815094
decision_node created 0.18824205862946553
decision_node created 0.19235882069743548
decision_node created 0.23631423283495423
leaf_node_created 0.0 13
decision_node created 0.20639875790760276
decision_node created 0.2070898

decision_node created 0.10072430964237222
leaf_node_created 0.0 8
decision_node created 0.10951865873445099
leaf_node_created 0.0 81
leaf_node_created 0.0 5
leaf_node_created 0.0 4
leaf_node_created 0.0 4
leaf_node_created 0.0 3
leaf_node_created 0.0 3
leaf_node_created 0.0 3
leaf_node_created 0.0 2
leaf_node_created 0.0 2
leaf_node_created 0.0 1
decision_node created 0.07328798185941032
leaf_node_created 0.0 101
leaf_node_created 0.0 4
leaf_node_created 0.0 18
decision_node created 0.039666278745758676
leaf_node_created 0.0 5
leaf_node_created 0.0 242
decision_node created 0.045429962141698255
decision_node created 0.046485260770975145
decision_node created 0.04875000000000007
decision_node created 0.05124653739612173
decision_node created 0.054012345679012475
decision_node created 0.05709342560553643
decision_node created 0.060546875
decision_node created 0.06658739595719365
decision_node created 0.07679999999999998
decision_node created 0.08317580340264641
leaf_node_created 0.0 22
l

16

In [53]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2263 entries, 91 to 461
Data columns (total 21 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   0       2263 non-null   float64
 1   1       2263 non-null   int64  
 2   2       2263 non-null   int64  
 3   3       2263 non-null   int64  
 4   4       2263 non-null   int64  
 5   5       2263 non-null   int64  
 6   6       2263 non-null   int64  
 7   7       2263 non-null   int64  
 8   8       2263 non-null   int64  
 9   9       2263 non-null   int64  
 10  10      2263 non-null   int64  
 11  11      2263 non-null   int64  
 12  12      2263 non-null   int64  
 13  13      2263 non-null   int64  
 14  14      2263 non-null   int64  
 15  15      2263 non-null   int64  
 16  16      2263 non-null   float64
 17  17      2263 non-null   float64
 18  18      2263 non-null   float64
 19  19      2263 non-null   float64
 20  20      2263 non-null   float64
dtypes: float64(6), int64(15)
memory usage