In [62]:
import csv
import random
import math
import copy
from collections import Counter
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder


In [None]:
def run_testing(testing, tree, titles):
    incorrect = {"vgood good": 0, "vgood acc": 0,"vgood unacc": 0,
                  "good vgood": 0, "good acc": 0, "good unacc": 0,
                  "acc vgood": 0, "acc good": 0, "acc unacc": 0,
                 "unacc vgood": 0, "unacc good": 0, "unacc acc": 0}

    correct = {"vgood": 0, "good": 0, "acc": 0, "unacc": 0}

    for i, row in enumerate(testing):
        # for pool of predictions
        # print("Actual: %s. Predicted: %s" % (row[-1], print_leaf(classify([titles, row], tree))))
        predicted = classify([titles, row], tree)
        if predicted is None:
            continue
        print("Actual: %s. Predicted: %s" % (row[-1], predicted))
        if predicted == row[-1]:
            correct[predicted] += 1;
        else:
            if str(row[-1]) + ' ' + predicted in incorrect.keys():
                incorrect[str(row[-1]) + ' ' + predicted] += 1
    return correct, incorrect

In [63]:
def info(data):
    total = 0
    for value in data:
        total += (-value)/sum(data) * math.log(value/sum(data), 2)
    return round(total, 4)

In [64]:
def info_a(X_train, y_train, i):
    # 
    total = 0
    target_column_counts = Counter(X_train.iloc[:,i])
    item_names = [value[0] for value in target_column_counts.items()]
    item_counts = [value[1] for value in target_column_counts.items()]
    for j, item_name in enumerate(item_names):
        categoryClasses = Counter(y for x, y in zip(X_train.iloc[:,3], y_train) if x == item_name)
        test = [value[1] for value in categoryClasses.items()]
        total += item_counts[j]/sum(item_counts) * info(test)
    return total

In [65]:

def gain(dictionary, class_entropy):
    gain = 0
    best_attribute = ""
    for attribute in dictionary:
        if class_entropy - dictionary[attribute] >= gain:
            gain = class_entropy - dictionary[attribute]
            best_attribute = attribute
    return best_attribute

In [66]:
def get_root(X_train, y_train):
    # Counts values of target attribute
    classifier = Counter(y_train)
    # Seperate counts into list of values
    class_values = [value[1] for value in classifier.items()]
    # Calculate entropy for the dataset
    class_entropy = info(class_values)
    dictionary = {}
    for i in range(len(X_train.columns)):
        dictionary[i] = info_a(X_train, y_train, i)
    return gain(dictionary, class_entropy)


In [67]:
class Decision_Node:
    def __init__(self, branches):
        self.branches = branches

In [68]:
class Leaf_Node:
    def __init__(self, X_train, y_train):
        total_count = len(X_train)
        probability = Counter(y_train)
        for key in probability:
            probability[key] = round((probability[key] / total_count), 3)
        self.predictions = probability

In [70]:
def print_tree(node, spacing=""):
    if isinstance(node, Leaf_Node):
        print(spacing + "Predict", node.predictions)
        return

    #print(spacing + "CHECK: " + str(node.best_attr))


    for branch in node.branches:
        print(spacing + "--> IF " + branch)
        print_tree(node.branches[branch], spacing + "  ")

In [71]:
def print_leaf(counts):
    total = sum(counts.values()) * 1.0
    probs = {}
    for lbl in counts.keys():
        probs[lbl] = str(int(counts[lbl] / total * 100)) + "%"
    return probs

In [72]:
def classify(data, node):
    # Base case: we've reached a leaf
    if isinstance(node, Leaf_Node):
        # Uncomment if you would like to see pool of prediction
        #return node.predictions

        # Most likely prediction
        temp = sorted(node.predictions.items(), key=lambda x: x[1], reverse=True)
        return temp[0][0]

    #recursion through each dicision node
    for branch in node.branches:
        decision = branch.split()
        index = data[0].index(decision[0])
        if data[1][index] == decision[1]:
            return classify(data, node.branches[branch])

In [73]:
def accuracy(correct, testSize):
    return correct / testSize * 100

In [79]:
def main():
    #import data
    df = pd.read_csv("preprocessed_age.csv")
    

    # Attribute for classification
    target_column = "Manner of death"
    # Remove NaN
    df = df.dropna(subset=[target_column])

    df = df.drop(["Unnamed: 0", "Id", "Death year"], axis=1)

    # Covert values to numerical
    le = LabelEncoder()
    for column in df.columns:
        if (df[column].dtype != "int64"):
            df[column] = le.fit_transform(df[column])

    # Split data (train/test)
    y = df[target_column]
    X = df.drop(target_column, axis=1)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.95)

    root = get_root(X_train, y_train)
    print(root)
    titles = X_train.columns
    print(len(X_train))
    print(len(y_train))
    tree = build_tree(X_train, y_train, 6, titles)
    print_tree(tree)
    # accuracy_plot = []
    # size_plot = []
    # #loop through sample sizes - increment 100
    # for i in range(1, int(len(X_train) / 100)):
    #     size = 100 * i
    #     if size > len(X_train):
    #         size = len(X_train)
    #     temp = random.sample(X_train, k=size)
    #     root = get_root(temp)
    #     tree = build_tree(temp, root, titles)
    #     correct, incorrect = run_testing(testing, tree, titles)
    #     accuracy_plot.append(accuracy(sum(correct.values()), len(testing)))
    #     size_plot.append(size)

    # #create graph
    # x1 = size_plot
    # y1 = accuracy_plot

    # plt.plot(x1, y1, marker='o')
    # plt.title("Learning Curve")
    # plt.xlabel("Number of Samples")
    # plt.ylabel("Accuracy %")
    # plt.show()

    # #final tree
    # root = get_root(training)
    # tree = build_tree(training, root, titles)

    # # Uncomment to print tree
    # print_tree(tree)
    # correct, incorrect = run_testing(testing, tree, titles)
    # print("\n--Complete--\n\nTotal accuracy: %.2f" % (accuracy(sum(correct.values()), len(testing))), "%")
    # confusion_matrix(correct, incorrect)
    # precision = calc_precision(correct, incorrect)
    # recall = calc_recall(correct, incorrect)
    # f1_score(correct, precision, recall)



In [78]:
def build_tree(X_train, y_train, x, titles):
    class_values = Counter(y_train)
    class_count = [value[1] for value in class_values.items()]
    class_entropy = info(class_count)

    #adjust value to prune tree
    if class_entropy <= 0.5:
        #create leaf labeelled by majority class in x
        return Leaf_Node(X_train, y_train)

    # Splitting node into attribute values
    attribute_values = Counter(X_train.iloc[:, x])
    subNode = {}
    for branch in attribute_values:

        # Splices data (removes all values that arent branch)

        f_X_train = copy.deepcopy(X_train)
        f_Y_train = copy.deepcopy(y_train)
        f_Y_train = f_Y_train[f_X_train.iloc[:,x] == branch]
        f_X_train = f_X_train[f_X_train.iloc[:,x] == branch]
        
        print("X_train length:", len(f_X_train), "\ny_train length:", len(f_Y_train))
        

        # Counter of classifier for branch
        new_values = Counter(f_Y_train)
        new_values_count = [value[1] for value in new_values.items()]

        split_entropy = info(new_values_count)
        dictionary = {}
        # Drop x column
        f_X_train = f_X_train.drop(f_X_train.columns[x], axis=1)
        n_titles = copy.deepcopy(titles)
        n_titles = n_titles.drop(n_titles[x])
        for i in range(len(titles)):
            dictionary[i] = info_a(X_train, y_train, i)
        subNode[titles[x] + " " + branch] = build_tree(X_train, y_train, gain(dictionary, split_entropy), n_titles)
    return Decision_Node(subNode)

main()

6
2443
2443
X_train length: 36 
y_train length: 36
X_train length: 36 
y_train length: 36


IndexError: index 6 is out of bounds for axis 0 with size 6

In [None]:
df = pd.read_csv("preprocessed_age.csv")
df.dtypes

Unnamed: 0                             int64
Id                                    object
Name                                  object
Gender                                object
Occupation                            object
Birth year                             int64
Death year                             int64
Manner of death                       object
Age of death                           int64
Associated Countries                  object
Associated Country Life Expectancy    object
dtype: object