In [18]:
import pandas as pd
import numpy as np
import sklearn as sk
import matplotlib.pyplot as plt
from os import listdir
from os.path import isfile, join
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier

%matplotlib inline

In [8]:
DATASET_PATH = "datasets/"
datasets = [f for f in listdir(DATASET_PATH) if isfile(join(DATASET_PATH, f)) and f != '.DS_Store']
datasets

['primary-tumor.data.txt',
 'hayes-roth.data.txt',
 'monks-2.test.txt',
 'lymphography.data.txt',
 'soybean-large.data.txt',
 'SPECT.test.txt',
 'breast-cancer.data.txt',
 'balance-scale.data.txt',
 'house-votes-84.data.txt',
 'tic-tac-toe.data.txt']

In [97]:
data_dict_list = []
for dataset in datasets: # datasets[7:-1]:
    get_info_for_data(dataset, data_dict_list, 2)

primary-tumor.data.txt : Gathering info.
   0   1   5   6   7   8   9   10  11  13  14  16  17
0   1   1   2   2   1   2   2   2   2   2   2   2   2
1   1   1   2   2   2   2   2   1   2   2   1   1   2
2   1   1   1   2   2   2   2   2   2   2   2   1   2
3   1   1   1   2   1   1   2   2   2   2   2   1   2
4   1   1   1   2   1   1   2   2   2   2   2   1   2
Accuracy:  0.2658959537572254
Accuracy:  0.2774566473988439
Accuracy:  0.25301204819277107
Accuracy:  0.2469879518072289
hayes-roth.data.txt : Gathering info.
     0  1  2  3  4  5
0   92  2  1  1  2  1
1   10  2  1  3  2  2
2   83  3  1  4  1  3
3   61  2  4  2  2  3
4  107  1  1  3  4  3




ValueError: n_splits=2 cannot be greater than the number of members in each class.

In [94]:
def get_info_for_data(dataset_name, data_dict_list, n_splits=10):
    print(dataset_name, ": Gathering info.")
    data_dict = {}
    
    data = pd.read_csv(DATASET_PATH + dataset_name, header = None)
    data = data.replace('?', np.NaN)
    data = data.dropna(axis=1)
    
    print(data.head())

    data_dict = {"name": dataset_name, "num_instances": data.shape[0], "num_classes": data.shape[1]}

    # number of attributes, Random average height, and Random average accuracy
    
    y_col = 0
    X = data.drop(y_col, 1)
    y = data[y_col]
    skf = StratifiedKFold(n_splits=n_splits)
    accuracies = []
    tree_depths = []
    accuracies2 = []
    tree_depths2 = []
    data_split = skf.split(X, y)
    for train, test in data_split:
        
        # C4.5 Tree:
        
        model = DecisionTreeClassifier(criterion='entropy')
        model.fit(X.loc[train] , y.loc[train])
        y_predict = model.predict(X.loc[test])
        accuracy = accuracy_score(y.loc[test], y_predict)
        accuracies.append(accuracy)
        print("Accuracy: ", accuracy)
        tree_depth = dectree_max_depth(model.tree_)
        tree_depths.append(tree_depth)
        
        # Random Tree:
        
        model2 = ExtraTreeClassifier(criterion='entropy', max_features=1)
        model2.fit(X.loc[train] , y.loc[train])
        y_predict = model2.predict(X.loc[test])
        accuracy = accuracy_score(y.loc[test], y_predict)
        accuracies2.append(accuracy)
        print("Accuracy: ", accuracy)
        tree_depth = dectree_max_depth(model2.tree_)
        tree_depths2.append(tree_depth)
        
    data_dict["C4.5 Avg Accuracy"] = sum(accuracies) / len(accuracies)
    data_dict["C4.5 Avg Height"] = sum(tree_depths) / len(tree_depths)
    data_dict["Random Avg Accuracy"] = sum(accuracies2) / len(accuracies2)
    data_dict["Random Avg Height"] = sum(tree_depths2) / len(tree_depths2)
    data_dict_list.append(data_dict)

In [72]:
def dectree_max_depth(tree):
    n_nodes = tree.node_count
    children_left = tree.children_left
    children_right = tree.children_right

    def walk(node_id):
        if (children_left[node_id] != children_right[node_id]):
            left_max = 1 + walk(children_left[node_id])
            right_max = 1 + walk(children_right[node_id])
            return max(left_max, right_max)
        else: # leaf
            return 1

    root_node_id = 0
    return walk(root_node_id)
# Use: t.tree_
# From: https://stackoverflow.com/questions/26602369/how-to-find-out-the-size-of-a-sklearn-decision-tree-in-python

### Sources

http://www.cs.uvm.edu/~icdm/algorithms/10Algorithms-08.pdf

"CART uses the Gini diversity index to rank tests, whereas C4.5 uses information-based
criteria"

https://scikit-learn.org/stable/modules/tree.html#tree-algorithms-id3-c4-5-c5-0-and-cart

"CART (Classification and Regression Trees) is very similar to C4.5, but it differs in that it supports numerical target variables (regression) and does not compute rule sets. "
scikit-learn uses an optimised version of the CART algorithm