In [3]:
import pandas as pd
from treelib import Tree
from sklearn.model_selection import train_test_split

# Load the labelled dataset; the code below treats the last column as the label.
train_set = pd.read_csv('training.csv')

# Hold out 25% of the rows for validation; the fixed seed keeps the split
# reproducible between runs.
train, vad = train_test_split(train_set, random_state=17, test_size=0.25)

# Validation features (every column but the last) and validation labels.
x_vad = vad.iloc[:, :-1]
y_vad = list(vad.iloc[:, -1].values)
# Independent copy of the validation labels.
truth_value = list(y_vad)
print(y_vad)


def Gini(x):
    """Return ``1 - sum(p ** 2 for p in x)``, the Gini impurity of *x*.

    Args:
        x: Iterable of numbers; the callers in this file pass class
            proportions.

    Returns:
        One minus the sum of the squared elements (``1`` for an empty *x*).
    """
    squared_total = 0
    for proportion in x:
        squared_total = squared_total + proportion ** 2
    return 1 - squared_total



def Feature_Decision(Gini_feature, Gini_labels):
    """Choose the feature whose split reduces the Gini impurity the most.

    Args:
        Gini_feature: Mapping of feature name to the Gini impurity obtained
            after splitting on that feature.
        Gini_labels: Gini impurity of the labels before any split.

    Returns:
        The key of *Gini_feature* with the largest
        ``Gini_labels - Gini_feature[key]``; on ties the first key in the
        mapping's iteration order wins.
    """
    def impurity_drop(feature):
        return Gini_labels - Gini_feature[feature]

    return max(Gini_feature, key=impurity_drop)



def Feature_Labels_Gini(data):
    """Compute the Gini impurity of the labels and of every candidate split.

    The last column of *data* is used as the class label; every other column
    is used as a categorical feature.

    Args:
        data: pandas DataFrame whose last column holds the labels.

    Returns:
        A tuple ``(Gini_labels, Gini_feature)`` where ``Gini_labels`` is the
        impurity of the label column and ``Gini_feature`` maps each feature
        name to the impurity left after splitting on it (the per-value
        impurities weighted by how often each value occurs).
    """
    labels = data.iloc[:, -1]

    # normalize=True yields the class proportions directly. This handles any
    # number of classes and any label values, and avoids integer keys such as
    # ``counts[0]`` on a Series indexed by label values (positional fallback
    # that pandas has deprecated).
    Gini_labels = Gini(labels.value_counts(normalize=True))

    Gini_feature = {}
    for feature in data.columns[:-1]:
        value_counts = data[feature].value_counts()
        n_counted = value_counts.sum()
        weighted_gini = 0
        for value, count in value_counts.items():
            subset_labels = labels[data[feature] == value]
            # A pure subset contributes 0 because Gini([1.0]) == 0.
            subset_gini = Gini(subset_labels.value_counts(normalize=True))
            weighted_gini += (count / n_counted) * subset_gini
        Gini_feature[feature] = weighted_gini
    return Gini_labels, Gini_feature


# Feature names, in the order Construct_Tree first splits on them.
feature_order = []

# Tree that Construct_Tree populates; it starts with a single node whose
# identifier is 'root'.
tree = Tree()
tree.create_node(identifier='root', tag='root', data=0)


def Construct_Tree(data, parent, max_split_gini=0.4):
    """Recursively grow the module-level ``tree`` below node *parent*.

    Node identifiers follow one scheme throughout: a branch is
    ``<parent>_<feature>_<value>`` and the leaf holding the predicted label
    under any node is ``<node>_result``.

    Args:
        data: pandas DataFrame of string-valued feature columns followed by
            the label column (the last column).
        parent: Identifier of an existing node in ``tree`` to grow from.
        max_split_gini: If the best available split still leaves a weighted
            Gini impurity at or above this value, stop splitting and emit a
            majority-label leaf instead.

    Side effects:
        Adds nodes to the global ``tree``, appends newly used features to the
        global ``feature_order`` and prints the impurities of each split.
    """
    labels = data.iloc[:, -1]

    # Pure node: every remaining row carries the same label.
    if len(labels.unique()) == 1:
        tree.create_node(tag=labels.iloc[0], parent=parent,
                         identifier=parent + '_result', data=labels.iloc[0])
        return

    # Only one feature left: split on it and label each branch by majority.
    if len(data.columns) == 2:
        last_feature = data.columns[0]
        feature_values = data.iloc[:, 0]
        # Record the feature so lookups driven by feature_order reach it.
        if last_feature not in feature_order:
            feature_order.append(last_feature)
        for value in feature_values.value_counts().index:
            # The '_' after parent keeps these ids consistent with the ids
            # built for ordinary splits further down.
            child_id = parent + '_' + last_feature + '_' + value
            majority = labels[feature_values == value].value_counts().idxmax()
            tree.create_node(tag=last_feature + '_' + value, parent=parent,
                             identifier=child_id)
            tree.create_node(tag=majority, parent=child_id,
                             identifier=child_id + '_result', data=majority)
        return

    gini_labels, gini_feature = Feature_Labels_Gini(data)
    print([gini_labels, gini_feature])
    decided_feature = Feature_Decision(gini_feature, gini_labels)

    # Pre-pruning: even the best split is too impure, so stop here. The leaf
    # hangs off the existing parent node and holds the majority label (not
    # the most frequent whole row).
    if gini_feature[decided_feature] >= max_split_gini:
        majority = labels.value_counts().idxmax()
        tree.create_node(tag=majority, parent=parent,
                         identifier=parent + '_result', data=majority)
        return

    if decided_feature not in feature_order:
        feature_order.append(decided_feature)
    for value in data[decided_feature].value_counts().index:
        child_id = parent + '_' + decided_feature + '_' + value
        subset = data[data[decided_feature] == value].drop(columns=decided_feature)
        tree.create_node(tag=decided_feature + '_' + value, identifier=child_id,
                         parent=parent)
        Construct_Tree(subset, child_id, max_split_gini)


Construct_Tree(train, 'root')
tree.show()

# Label predicted when a validation row follows a branch the tree never saw.
default_label = 'acc'

# Walk the tree once per validation row. At each node, use its '_result' leaf
# if there is one; otherwise descend into whichever '<feature>_<value>' child
# matches the row. Probing the row's own columns (instead of one global
# feature order) lets different branches split on different features, and
# the for/else guarantees exactly one prediction per row.
pred_list = []
for _, row in x_vad.iterrows():
    item = 'root'
    while True:
        leaf = tree.get_node(item + '_result')
        if leaf is not None:
            pred_list.append(leaf.data)
            break
        for j in x_vad.columns:
            child = item + '_' + j + '_' + row[j]
            if tree.get_node(child) is not None:
                item = child
                break
        else:
            # No child matches this row's values: fall back to the default.
            pred_list.append(default_label)
            break

def Score(pred_result, true_value, positive_label='unacc'):
    """Compute accuracy, precision, recall and F-measure for predictions.

    Args:
        pred_result: Sequence of predicted labels.
        true_value: Sequence of true labels, aligned with *pred_result*.
        positive_label: Label treated as the positive class for precision,
            recall and F-measure.

    Returns:
        Dict with the keys ``'accuracy'``, ``'precise'``, ``'recall'`` and
        ``'f_measure'``. A metric whose denominator is zero (for example no
        positive predictions, or empty input) is reported as ``0.0`` instead
        of raising ZeroDivisionError.

    Raises:
        ValueError: If the two sequences differ in length, since the metrics
            would then compare misaligned items.
    """
    if len(pred_result) != len(true_value):
        raise ValueError(
            'pred_result and true_value must have the same length '
            '(got %d and %d)' % (len(pred_result), len(true_value))
        )

    total = len(true_value)
    mistakes = 0
    true_positives = 0
    predicted_positives = 0
    actual_positives = 0
    for predicted, actual in zip(pred_result, true_value):
        if predicted == positive_label:
            predicted_positives += 1
        if actual == positive_label:
            actual_positives += 1
        if predicted != actual:
            mistakes += 1
        elif predicted == positive_label:
            true_positives += 1

    accuracy = 1 - (mistakes / total) if total else 0.0
    precise = true_positives / predicted_positives if predicted_positives else 0.0
    recall = true_positives / actual_positives if actual_positives else 0.0
    denominator = precise + recall
    f_measure = 2 * precise * recall / denominator if denominator else 0.0
    return {'accuracy': accuracy, 'precise': precise, 'recall': recall,
            'f_measure': f_measure}


# Print the accuracy, precision, recall and F-measure of the predictions
# against the validation labels.
print(Score(pred_result=pred_list, true_value=y_vad))

['acc', 'unacc', 'unacc', 'unacc', 'unacc', 'unacc', 'unacc', 'unacc', 'unacc', 'acc', 'unacc', 'acc', 'unacc', 'unacc', 'unacc', 'unacc', 'unacc', 'unacc', 'unacc', 'unacc', 'unacc', 'acc', 'unacc', 'unacc', 'unacc', 'acc', 'unacc', 'acc', 'unacc', 'unacc', 'unacc', 'unacc', 'unacc', 'unacc', 'unacc', 'unacc', 'unacc', 'unacc', 'unacc', 'unacc', 'unacc', 'unacc', 'unacc', 'unacc', 'acc', 'unacc', 'unacc', 'acc', 'unacc', 'unacc', 'unacc', 'unacc', 'unacc', 'acc', 'unacc', 'acc', 'unacc', 'unacc', 'acc', 'unacc', 'unacc', 'unacc', 'unacc', 'unacc', 'unacc', 'unacc', 'unacc', 'unacc', 'unacc', 'unacc', 'unacc', 'unacc', 'acc', 'acc', 'unacc', 'acc', 'unacc', 'unacc', 'acc', 'unacc', 'acc', 'unacc', 'unacc', 'unacc', 'unacc', 'unacc', 'acc', 'acc', 'unacc', 'unacc', 'unacc', 'acc', 'acc', 'unacc', 'unacc', 'acc', 'acc', 'unacc', 'unacc', 'acc', 'unacc', 'acc', 'unacc', 'unacc', 'unacc', 'unacc', 'acc', 'unacc', 'unacc', 'acc', 'acc', 'unacc', 'unacc', 'unacc', 'unacc', 'unacc', 'acc', 'u