In [3]:
import numpy as np
import pandas as pd
import random
from numpy.random import randint
from collections import Counter
%config IPCompleter.greedy=True

In [7]:
df = pd.read_csv('../data/dataset_31_credit-g.csv', nrows=20)
wine_df = pd.read_csv('../data/dataset_191_wine.csv')

In [8]:
a = df[df['installment_commitment'] == 2]

In [9]:
groups = a.groupby(df['installment_commitment'] > df['installment_commitment'].mean())

In [10]:
import json
f = open('../data/dataset_31_credit-g.json', 'r')
credits_kinds = json.load(f)
f.close()

f = open('../data/dataset_191_wine.json', 'r')
wine_kinds = json.load(f)
f.close()

In [11]:
def _random_k_folds(df, k, add_remaining, seed):
    fold_size = len(df) // k

    folds = []
    for i in range(k):
        sample = df.sample(n=fold_size, random_state=seed)
        df = df.drop(sample.index, errors='ignore')
        folds.append(sample)

    # add remaining elements to folds
    if add_remaining:
        for i in range(len(df)):
            folds[i] = pd.concat([folds[i], df.iloc[i:i+1]])

    return folds


def _stratified_k_folds(df, k, add_remaining, seed):
    groups = df.groupby('class')
    folds_by_groups = [_random_k_folds(g, k, add_remaining, seed) for c, g in groups]
    folds = [pd.concat(folds_by_groups[x][y] for x in range(len(folds_by_groups))) for y in range(k)]
    return folds


def generate_k_folds(df, k, sampling='stratified', add_remaining=True, seed=randint(10000)):
    if sampling == 'random':
        return _random_k_folds(df, k, add_remaining, seed)
    elif sampling == 'stratified':
        return _stratified_k_folds(df, k, add_remaining, seed)
    else:
        raise Exception("Sampling parameter must be one of [stratified, random]")

In [12]:
def generate_bootstraps(df, n, seed=randint(10000)):
    bootstraps = []
    for i in range(n):
        sample = df.sample(frac=1, replace=True, random_state=seed+i)
        bootstraps.append(sample)

    return bootstraps

In [13]:
def generate_splits(folds):
    sets = []
    for i, fold in enumerate(folds):
        sets.append((pd.concat(folds[:i] + folds[i + 1:]), folds[i]))

    return sets

In [14]:
import time

def timeit(method):
    def func(*args, **kw):
        start = time.time()
        result = method(*args, **kw)
        end = time.time()

        print("{}: {} seconds".format(method.__name__, (start - end)))
        return result
    return func

In [15]:
def tree_to_string(tree, depth=0):
    DEFAULT_COLOR = '\033[39m'
    ATTR_COLOR = '\033[33m'
    NOMINAL_COLOR = '\033[34m'
    NUMERIC_COLOR = '\033[31m'
    CLASS_COLOR = '\033[32m'

    spacing = "    " * depth

    if tree.target_class:
        string = CLASS_COLOR + str(tree.target_class) + DEFAULT_COLOR
        return string

    if depth == 0:
        string = ATTR_COLOR + str(tree.attribute) + DEFAULT_COLOR + ":\n"
    else:
        string = ATTR_COLOR + "\n" + spacing + str(tree.attribute) + DEFAULT_COLOR + ":\n"

    if tree.kind == "nominal":
        for option in tree.options.items():
            attr_name = NOMINAL_COLOR + str(option[0]) + DEFAULT_COLOR + ": "
            string = string + spacing + attr_name + tree_to_string(option[1], depth + 1) + "\n"
    else:
        for option in tree.options.items():
            signal = " > " if option[0] else " < "
            attr_name = NUMERIC_COLOR + str(f"{tree.cut:.4f}") + DEFAULT_COLOR + signal \
                + NOMINAL_COLOR + tree.attribute + DEFAULT_COLOR + ": "
            string = string + spacing + attr_name + tree_to_string(option[1], depth + 1) + "\n"

    string = string[:-1]
    return string

## Gain

In [16]:
from math import log2

def group_by_attribute(attribute, df):
    if attribute[1] == "nominal":
        return df.groupby(attribute[0])
    else:
        return df.groupby(df[attribute[0]] > df[attribute[0]].mean())

def info(df):
    total = len(df)
    class_counts = df['class'].value_counts().to_list()
    total_info = 0

    for c in class_counts:
        x = c / total
        total_info = total_info - x * log2(x)

    return total_info


def info_attribute(attribute, df):
    instances_by_attribute = group_by_attribute(attribute, df)
    df_size = len(df)
    total_info = 0

    for _, group in instances_by_attribute:
        group_size = len(group)
        total_info = total_info + group_size / df_size * info(group)

    return total_info


def gain(attr, df):
    return info(df) - info_attribute(attr, df)

In [17]:
from numba import jit
import random

@jit(nopython=True)
def np_info(counts, group_size, total_size):
    total_info = 0
    for c in counts:
        x = c / group_size
        total_info = total_info - x * np.log2(x)
        
    return group_size / total_size * total_info

def info_attributes_calc(data, attributes, column_index):
    class_pos = column_index['class']
    total_size = len(data)
    
    infos = []
    for attr, kind in attributes:
        if kind == "nominal":
            unique = np.unique(data[:, column_index[attr]])
            groups = [data[data[:, column_index[attr]] == u] for u in unique]
        else:
            mean = data[:, column_index[attr]].mean()
            groups = [data[data[:, column_index[attr]] <= mean], data[data[:, column_index[attr]] > mean]]
            
        attr_info = 0
        for group in groups:
            group_size = len(group)
            class_column = group[:, class_pos]
            classes, counts = np.unique(class_column, return_counts=True)
            
            attr_info = attr_info + np_info(counts, group_size, total_size)
        
        infos.append(attr_info)
    return infos

def info_attributes(df, attributes):
    column_index = {v: i for i, v in enumerate(df.columns.values)}
    data = df.values
    return info_attributes_calc(data, attributes, column_index)

In [15]:
%%timeit
test_attributes = take_m(test_kinds, 3)
info_attributes(test_df, test_attributes)

661 µs ± 43.6 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [14]:
%%timeit
test_attributes = take_m(test_kinds, 3)
info_attributes(test_df, test_attributes)

628 µs ± 16.5 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [16]:
%%timeit
info_attribute(("age", "nominal"), test_df)
info_attribute(("income", "numerical"), test_df)

18.1 ms ± 3.88 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [18]:
def _most_frequent_class(df):
    return df['class'].value_counts().idxmax()

def take_m(attributes, m):
    attribute_list = list(attributes.items())
    if m and len(attribute_list) >= m:
        attribute_list = random.sample(attribute_list, m)
    return attribute_list
    
def _choose_best_attribute(attributes, df, m):
    attribute_list = take_m(attributes, m)
    results = info_attributes(df, attribute_list)
    choice = attribute_list[results.index(min(results))]
    return choice


class Tree:
    def __init__(self, options=None, target_class=None, attribute=None, kind=None, cut=None):
        self.target_class = target_class
        self.options = options
        self.attribute = attribute
        self.kind = kind
        self.cut = cut

    def __str__(self):
        return tree_to_string(self)

    @classmethod
    def generate(cls, df, attributes, m=None):
        if df['class'].nunique() == 1:
            return Tree(target_class=df['class'].iloc[0])

        elif not attributes:
            return Tree(target_class=_most_frequent_class(df))

        else:
            best_attribute = _choose_best_attribute(attributes, df, m)
            name, kind = best_attribute

            groups = group_by_attribute(best_attribute, df)
            new_attributes = {k: v for k, v in attributes.items() if k != name}

            def gen_options():
                return {c: cls.generate(group, new_attributes, m) for c, group in groups}

            cut = df[name].mean() if kind == "numeric" else None

            return Tree(
                attribute=name,
                kind=kind,
                options=gen_options(),
                cut=cut
            )


def predict(tree, instance):
    if tree.target_class:
        return tree.target_class

    try:
        if tree.kind == "nominal":
            sub_tree = tree.options[instance[tree.attribute]]
        else:
            sub_tree = tree.options[instance[tree.attribute] > tree.cut]
    except KeyError:
        print("Instance attribute has no class in tree node, using first option available")
        sub_tree = next(iter(tree.options.values()))

    return predict(sub_tree, instance)

In [19]:
tree_predict = predict

class Forest:
    def __init__(self, trees):
        self.trees = trees

    @classmethod
    def generate(cls, train_set, attributes, ntree, m=None, pool=None):
        bootstraps = generate_bootstraps(train_set, ntree)
        wraped_bootstraps = [(b, attributes, m) for b in bootstraps]
        if pool:
            trees = pool.starmap(Tree.generate, wraped_bootstraps)
        else:
            trees = [Tree.generate(b, attributes, m) for b in bootstraps]

        return Forest(trees)

    def predict(self, instance, pool=None):
        wraped_trees = [(tree, instance) for tree in self.trees]
        if pool:
            results = pool.starmap(tree_predict, wraped_trees)
        else:
            results = [tree_predict(tree, instance) for tree in self.trees]

        data = Counter(results)
        result = max(results, key=data.get)
        return result

    def predict_df(self, instances):
        instances['predicted'] = instances.apply(lambda x: self.predict(x), axis=1)
        return instances[['class', 'predicted']]

In [20]:
k_folds = generate_k_folds(wine_df, 10)
splits = generate_splits(k_folds)
train, test = splits[1]

In [21]:
wine_forest = Forest.generate(train, wine_kinds, 5)

In [22]:
results = wine_forest.predict_df(splits[7][1])
results2 = wine_forest.predict_df(splits[2][1])
print("Total: {}, classified correctly: {}".format(len(results), len(results[results['predicted'] == results['class']])))
print("Total: {}, classified correctly: {}".format(len(results2), len(results2[results2['predicted'] == results2['class']])))

Total: 18, classified correctly: 18
Total: 18, classified correctly: 18


In [23]:
df.values.shape[0]

20

In [24]:
values, counts = np.unique(np.array(['c', 'a', 'b', 'b', 'c', 'c']), return_counts=True)
values[counts.argmax()]

'c'

In [25]:
class ConfusionMatrix:
    def __init__(self, results):
        self._results = results
        self._total = len(results)
        self._correct = len(results[results['predicted'] == results['class']])
        self._cm = pd.crosstab(results['class'], results['predicted'], rownames=['actual'])

    def __add__(self, val):
        new_results = pd.concat([self._results, val.results])
        return ConfusionMatrix(new_results)

    def __str__(self):
        return self._cm.__repr__()

    def print(self):
        print(self._cm)

    def true_positives(self):
        tps = pd.Series(np.diag(self._cm), index=self._cm.index)
        return tps.rename_axis('TruePositives')

    def true_negatives(self):
        tns = [self._cm.drop(index=[c], columns=[c]).values.sum() for c in self._cm.index]
        return pd.Series(tns, index=self._cm.index).rename_axis('TrueNegatives')

    def false_positives(self):
        fps = [self._cm.loc[c].sum() - self._cm[c][c] for c in self._cm.index]
        return pd.Series(fps, index=self._cm.index).rename_axis('FalsePositives')

    def false_negatives(self):
        fns = [self._cm[c].sum() - self._cm[c][c] for c in self._cm.index]
        return pd.Series(fns, index=self._cm.index).rename_axis('FalseNegatives')

    def accuracy(self):
        return self._correct / self._total

    def error(self):
        return 1 - self.accuracy()

    def recalls(self):
        tps = self.true_positives()
        fns = self.false_negatives()
        recalls = tps / (tps + fns)
        return recalls.rename_axis('Recall')

    def precisions(self):
        tps = self.true_positives()
        fps = self.false_positives()
        precisions = tps / (tps + fps)
        return precisions.rename_axis('Precision')

    def specificities(self):
        tns = self.true_negatives()
        fps = self.false_positives()
        specificities = tns / (tns + fps)
        return specificities.rename_axis('Specificity')

    def f_measures(self, b):
        prec = self.precisions()
        rec = self.recalls()
        f_measures = ((1 + b**2) * prec * rec / (b**2 * prec + rec))
        return f_measures.rename_axis('F-measure')

    def macro_recall(self):
        return self.recalls().mean()

    def macro_precision(self):
        return self.precisions().mean()

    def macro_specificity(self):
        return self.specificities().mean()

    def macro_f_measure(self, b):
        return self.f_measures(b).mean()

    def micro_recall(self):
        tps = self.true_positives().sum()
        fns = self.false_negatives().sum()
        recall = tps / (tps + fns)
        return recall

    def micro_precision(self):
        tps = self.true_positives().sum()
        fps = self.false_positives().sum()
        precision = tps / (tps + fps)
        return precision

    def micro_f_measure(self, b):
        prec = self.precisions().sum()
        rec = self.recalls().sum()
        f_measure = ((1 + b**2) * prec * rec / (b**2 * prec + rec))
        return f_measure

    def show(self, verbose=False):
        print(f"Accuracy: {self.accuracy():.3f} [Total: {self._total}, Correct: {self._correct}]")
        print(f"Macro Recall: {self.macro_recall():.3f}")
        if verbose:
            for k, v in self.recalls().items():
                print(f"  Recall for class {k}: {v:.3f}")
        print(f"Macro Precision: {self.macro_precision():.3f}")
        if verbose:
            for k, v in self.precisions().items():
                print(f"  Precision for class {k}: {v:.3f}")
        print(f"Macro Specificity: {self.macro_specificity():.3f}")
        if verbose:
            for k, v in self.specificities().items():
                print(f"  Specificity for class {k}: {v:.3f}")
        for b in [0.5, 1, 2]:
            print(f"Macro F-measure (ß = {b}): {self.macro_f_measure(b):.3f}")
            if verbose:
                for k, v in self.f_measures(b).items():
                    print(f"  F-measure (ß = {b}) for class {k}: {v:.3f}")
            
cm = ConfusionMatrix(results)
cm.show(verbose=True)

Accuracy: 1.000 [Total: 18, Correct: 18]
Macro Recall: 1.000
  Recall for class 1: 1.000
  Recall for class 2: 1.000
  Recall for class 3: 1.000
Macro Precision: 1.000
  Precision for class 1: 1.000
  Precision for class 2: 1.000
  Precision for class 3: 1.000
Macro Specificity: 1.000
  Specificity for class 1: 1.000
  Specificity for class 2: 1.000
  Specificity for class 3: 1.000
Macro F-measure (ß = 0.5): 1.000
  F-measure (ß = 0.5) for class 1: 1.000
  F-measure (ß = 0.5) for class 2: 1.000
  F-measure (ß = 0.5) for class 3: 1.000
Macro F-measure (ß = 1): 1.000
  F-measure (ß = 1) for class 1: 1.000
  F-measure (ß = 1) for class 2: 1.000
  F-measure (ß = 1) for class 3: 1.000
Macro F-measure (ß = 2): 1.000
  F-measure (ß = 2) for class 1: 1.000
  F-measure (ß = 2) for class 2: 1.000
  F-measure (ß = 2) for class 3: 1.000


In [26]:
cm = ConfusionMatrix(results)

In [41]:
str(np.unique(cm._results.values[:,0]))

'[1 2 3]'

In [51]:
str(cm.recalls().to_list())

'[1.0, 1.0, 1.0]'

In [44]:
test_tree = Tree.generate(test_df, test_kinds)
wine_tree = Tree.generate(wine_df, wine_kinds)

In [45]:
vars(test_tree.options['senior'])

{'target_class': None,
 'options': {False: <__main__.Tree at 0x10d655e10>,
  True: <__main__.Tree at 0x10d5b7550>},
 'attribute': 'income',
 'kind': 'numeric',
 'cut': 6370.0}

In [54]:
print(wine_tree)

[33mFlavanoids[39m:
[31m2.0293[39m < [34mFlavanoids[39m: [33m
    Hue[39m:
    [31m0.8378[39m < [34mHue[39m: [33m
        Color_intensity[39m:
        [31m7.3911[39m < [34mColor_intensity[39m: [33m
            OD280%2FOD315_of_diluted_wines[39m:
            [31m1.6965[39m < [34mOD280%2FOD315_of_diluted_wines[39m: [32m3[39m
            [31m1.6965[39m > [34mOD280%2FOD315_of_diluted_wines[39m: [33m
                Alcohol[39m:
                [31m12.9350[39m < [34mAlcohol[39m: [33m
                    Malic_acid[39m:
                    [31m3.3925[39m < [34mMalic_acid[39m: [32m3[39m
                    [31m3.3925[39m > [34mMalic_acid[39m: [32m2[39m
                [31m12.9350[39m > [34mAlcohol[39m: [32m3[39m
        [31m7.3911[39m > [34mColor_intensity[39m: [32m3[39m
    [31m0.8378[39m > [34mHue[39m: [33m
        Malic_acid[39m:
        [31m1.9500[39m < [34mMalic_acid[39m: [32m2[39m
        [31m1.9500[39m > [34mM