# Comparing hierarchical (hyr) classification with flat classification

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import random
from sklearn import datasets, neighbors, metrics, tree, svm, preprocessing, model_selection, ensemble
from sklearn.base import clone as sklearn_clone
from sklearn.model_selection import StratifiedKFold
from pprint import pprint

## Loading the data
This data has been preprocessed in pickle.ipynb and stored in .pkl files,
which makes loading much faster.

In [None]:
%%time
# Load the preprocessed tables; they are used below as X (`df`) and y (`labels`).
# NOTE: read_pickle unpickles arbitrary objects, so only load files you trust.
df = pd.read_pickle("../data/Lauren/bam_df.pkl")
labels = pd.read_pickle("../data/Lauren/bam_labels.pkl")

CPU times: user 0 ns, sys: 2.63 s, total: 2.63 s
Wall time: 3.95 s


## Metrics

In [None]:
def calc_metrics(y_test, y_pred, f1_file_name=None):
    """Print and return accuracy and averaged F1 scores for one prediction run.

    Per-class F1 scores are computed for every label that occurs in
    ``y_test`` (in order of first appearance). When ``f1_file_name`` is given
    they are written to that path as a CSV with the columns ``class`` and
    ``f1_per_class``.

    Args:
        y_test: true labels.
        y_pred: predicted labels, in the same order as ``y_test``.
        f1_file_name: optional CSV path for the per-class F1 scores.

    Returns:
        A one-row DataFrame with the columns ``accuracy``,
        ``F1 micro-average``, ``F1 macro-average`` and ``F1 weighted-average``.
    """
    # Labels present in the ground truth; they also fix the order of the
    # per-class scores returned by f1_score.
    seen_classes = pd.DataFrame(pd.Series(y_test).unique())
    per_class_f1 = pd.DataFrame(
        metrics.f1_score(y_test, y_pred, average=None, labels=seen_classes[0])
    )

    acc = metrics.accuracy_score(y_test, y_pred)
    print(f"accuracy: {acc}")

    f1_micro, f1_macro, f1_weighted = (
        metrics.f1_score(y_test, y_pred, average=mode)
        for mode in ("micro", "macro", "weighted")
    )
    print(f"F1 micro-average: {f1_micro}")
    print(f"F1 macro-average: {f1_macro}")
    print(f"F1 weighted-average: {f1_weighted}")
    print()

    # Pair every class with its own F1 score.
    f1_labeled = pd.concat(
        [seen_classes[0], per_class_f1[0]],
        axis=1,
        keys=['class', 'f1_per_class'],
    )

    # Persist the per-class scores only when a target file was requested.
    if f1_file_name is not None:
        f1_labeled.to_csv(f1_file_name, index=False)

    summary = {
        "accuracy": [acc],
        "F1 micro-average": [f1_micro],
        "F1 macro-average": [f1_macro],
        "F1 weighted-average": [f1_weighted],
    }
    return pd.DataFrame(summary)
    

## Flat classification

In [None]:
def train_flat(clf, df, labels, class_column_name):
    """Fit a fresh copy of ``clf`` on ``df`` against one column of ``labels``.

    Args:
        clf: estimator used as a template; it is cloned, not fitted itself.
        df: the feature data (X).
        labels: DataFrame with several label columns, one of which is y.
        class_column_name: name of the column in ``labels`` to use as y.

    Returns:
        The fitted clone of ``clf``.
    """
    # Keep only the target column and flatten it to a 1-D array.
    is_target = labels.columns == class_column_name
    y_train = labels.loc[:, is_target].values.ravel()

    # Work on a clone so the estimator passed in is left untouched; the
    # hierarchical trainer reuses the same template for every tree node.
    model = sklearn_clone(clf)
    model.fit(df, y_train)

    return model
    

In [None]:
def flat(clf, X, y, on_label="cluster", folds=2, folds_random_state=1337):
    """Cross-validate a flat classifier and return the mean metrics.

    For every stratified fold a clone of ``clf`` is trained on the training
    part and evaluated on the test part. The per-class F1 scores of fold
    ``k`` are written to ``../results/flat_clf_f1-{k}.csv``.

    Args:
        clf: estimator template, cloned for each fold by ``train_flat``.
        X: feature DataFrame.
        y: label DataFrame; rows correspond positionally to the rows of ``X``.
        on_label: column of ``y`` to predict and to stratify the folds on.
        folds: number of cross-validation folds.
        folds_random_state: seed for shuffling the folds; ``None`` disables
            shuffling.

    Returns:
        A Series with the mean of accuracy and the averaged F1 scores over
        all folds.
    """
    skf = StratifiedKFold(
        n_splits=folds,
        random_state=folds_random_state,
        shuffle=folds_random_state is not None,
    )

    # Collect one metrics row per fold. DataFrame.append was removed in
    # pandas 2.0, so the rows are concatenated once at the end instead.
    fold_metrics = []
    for k, (train_index, test_index) in enumerate(skf.split(X, y[on_label])):
        # Split positionally into train and test set.
        X_train, X_test = X.take(train_index), X.take(test_index)
        y_train, y_test = y.take(train_index), y.take(test_index)

        print(f"Start training   fold {k}")
        flat_clf = train_flat(clf, X_train, y_train, on_label)

        print(f"Start predicting fold {k}")
        y_pred_flat = flat_clf.predict(X_test)

        fold_metrics.append(
            calc_metrics(y_test[on_label], y_pred_flat, f"../results/flat_clf_f1-{k}.csv")
        )

    return pd.concat(fold_metrics).mean()

In [None]:
%%time

# Cross-validate the flat classifier with a linear SVM, using the defaults of
# flat() (2 folds, predicting the "cluster" column).
acc = flat(svm.LinearSVC(max_iter=2000), df, labels)

# `acc` holds the mean of every metric over the folds.
print("\nAVERAGE ACCURACY AND F1 SCORES:")
print(acc)

Start training   fold 0
Start predicting fold 0
accuracy: 0.9023626975434205
F1 micro-average: 0.9023626975434204
F1 macro-average: 0.8430137881777549
F1 weighted-average: 0.9004396258585396

Start training   fold 1
Start predicting fold 1
accuracy: 0.9009389671361502
F1 micro-average: 0.9009389671361502
F1 macro-average: 0.8412668929414026
F1 weighted-average: 0.8988779275416249


AVERAGE ACCURACY AND F1 SCORES:
accuracy               0.901651
F1 micro-average       0.901651
F1 macro-average       0.842140
F1 weighted-average    0.899659
dtype: float64
CPU times: user 12min 1s, sys: 31.5 s, total: 12min 32s
Wall time: 13min 23s


## Hierarchical (hyr) classification

In [None]:
class Node:
    """One node of the class hierarchy tree.

    Attributes are plain instance fields:
      parent     -- the parent Node, or None for the root
      class_name -- the label this node stands for
      clf        -- classifier that assigns samples to the children
                    (None until one is set)
      children   -- maps a label produced by ``clf`` to the child Node
                    for that label
    """

    def __init__(self, parent, class_name):
        self.class_name = class_name
        self.parent = parent

        # Filled in during training.
        self.children = {}
        self.clf = None

    def __str__(self):
        # The root has no parent; every other node shows its ancestry.
        is_root = self.parent is None
        return "Root" if is_root else f"(class_name: {self.class_name}, parent: {self.parent})"

    def __repr__(self):
        # Same text as str() so containers of nodes stay readable.
        return str(self)

In [None]:
## hyr (hierarchical) training function
## this builds the hyr tree by calling the flat training function train_flat once per inner node
## the hyr tree consists of nodes which each (except leaf nodes) have a flat classifier that classifies into its children

def filter_data_on_class_name(df, labels, class_name, class_column_name):
    """Keep only the rows whose label in ``class_column_name`` equals ``class_name``.

    The same row selection is applied to the feature data and to the labels.

    Returns:
        A ``(df, labels)`` tuple holding the selected rows of each.
    """
    # Boolean row mask derived from the labels.
    row_mask = labels[class_column_name] == class_name
    selected_df = df[row_mask]
    selected_labels = labels[row_mask]
    return (selected_df, selected_labels)

def train_hyr(clf, df, labels, node, parent_class=None, parent_class_column=None):
    """Recursively build the hierarchical classifier tree below ``node``.

    The columns of ``labels`` define the hierarchy from left to right: the
    first column is the top level and each following column refines the one
    before it. For ``node`` a flat classifier is trained that assigns samples
    to the classes of the next level, then the function recurses into one
    child node per class.

    Args:
        clf: estimator template; ``train_flat`` clones it for every node.
        df: feature DataFrame.
        labels: label DataFrame with one column per hierarchy level.
        node: the Node to fill in (``clf`` and ``children``).
        parent_class: class represented by ``node``; ``None`` for the root.
        parent_class_column: column of ``labels`` that holds ``parent_class``;
            ``None`` for the root.

    Returns:
        None. The tree is built in place on ``node``.
    """
    if parent_class is None or parent_class_column is None:
        # Root call: classify into the classes of the first label column.
        child_class_column = labels.columns[0]
    else:
        # The child level is the column right after the parent's column.
        child_class_column_index = list(labels.columns).index(parent_class_column) + 1
        if child_class_column_index >= len(labels.columns):
            # Leaf of the hyr tree: there are no further child classes.
            # Checked before filtering so leaves do not build a filtered
            # copy of the data only to discard it.
            return None
        child_class_column = labels.columns[child_class_column_index]

        # Shrink the data to the entries that belong to the parent class.
        df, labels = filter_data_on_class_name(df, labels, parent_class, parent_class_column)

    unique_labels = pd.Series(labels[child_class_column]).unique()
    if len(unique_labels) == 1:
        # Only one possible subclass: nothing to decide, so no classifier.
        node.clf = None
    else:
        # Flat classifier that assigns entries to the child classes.
        node.clf = train_flat(clf, df, labels, child_class_column)

    # Recursive step: one child node per class of the next level.
    for child_class in unique_labels:  # todo: parallelize
        child_node = Node(node, child_class)
        train_hyr(clf, df, labels, child_node, child_class, child_class_column)
        node.children[child_class] = child_node


In [None]:
def predict_hyr(node, X_test):
    """Predict the deepest-level class of every row by walking the hyr tree.

    Starting at ``node``, the samples are classified into its children and
    each child then recursively classifies the samples assigned to it.

    Args:
        node: tree node to start from (normally the root). It needs the
            attributes ``clf`` and ``children`` as filled in by training.
        X_test: feature DataFrame of the samples to classify.

    Returns:
        A DataFrame with a single column ``0`` holding the predicted label
        and the index labels of ``X_test``. The rows come back grouped by
        branch, so they are generally NOT in the order of ``X_test``.
    """
    # Nothing to classify. Classifiers typically reject empty input, so do
    # not call predict at all for a branch that received no samples.
    if len(X_test) == 0:
        return pd.DataFrame(index=X_test.index, columns=[0])

    children = list(node.children.values())

    if node.clf is None:
        # The tree goes further down, but there is only one subclass, so
        # every sample gets that child's label without running a classifier.
        y_test = pd.DataFrame({0: children[0].class_name}, index=X_test.index)
    else:
        # Keep the original index labels so rows can be matched up later.
        y_test = pd.DataFrame(node.clf.predict(X_test), index=X_test.index)

    # The children are leaves when they have no children themselves; they
    # cannot classify any further, so this level's result is final.
    if not children[0].children:
        return y_test

    # Otherwise hand every child the samples that were assigned to it.
    predictions = []
    for label, child_node in node.children.items():
        subset = X_test[y_test[0] == label]
        if len(subset) == 0:
            # No sample was routed to this child.
            continue
        predictions.append(predict_hyr(child_node, subset))

    if not predictions:
        # No predicted label matched a child; return an empty result
        # rather than failing in pd.concat on an empty list.
        return pd.DataFrame(index=X_test.index[:0], columns=[0])
    return pd.concat(predictions)
    
    

In [None]:
def hyr(clf, X, y, on_label="cluster", folds=2, folds_random_state=1337):
    """Cross-validate the hierarchical classifier and return the mean metrics.

    For every stratified fold a hyr tree is trained on the training part and
    evaluated on the test part. The per-class F1 scores of fold ``k`` are
    written to ``../results/hyr_clf_f1-{k}.csv``.

    Args:
        clf: estimator template, cloned for every tree node during training.
        X: feature DataFrame.
        y: label DataFrame with one column per hierarchy level; rows
            correspond positionally to the rows of ``X``.
        on_label: column of ``y`` to evaluate against and to stratify on.
        folds: number of cross-validation folds.
        folds_random_state: seed for shuffling the folds; ``None`` disables
            shuffling.

    Returns:
        A Series with the mean of accuracy and the averaged F1 scores over
        all folds.
    """
    skf = StratifiedKFold(
        n_splits=folds,
        random_state=folds_random_state,
        shuffle=folds_random_state is not None,
    )

    # Collect one metrics row per fold. DataFrame.append was removed in
    # pandas 2.0, so the rows are concatenated once at the end instead.
    fold_metrics = []
    for k, (train_index, test_index) in enumerate(skf.split(X, y[on_label])):
        # Split positionally into train and test set.
        X_train, X_test = X.take(train_index), X.take(test_index)
        y_train, y_test = y.take(train_index), y.take(test_index)

        # predict_hyr returns rows grouped by branch. Label the test rows
        # 0..n-1 so that sorting the predictions by index restores exactly
        # the order of y_test, whatever index the input data carries.
        # take() returned a new frame, so X itself is not modified.
        X_test.index = pd.RangeIndex(len(X_test))

        # Train the hierarchical classifier.
        print("Start training")
        root = Node(None, "")
        train_hyr(clf, X_train, y_train, root)

        print(root)
        print("Start predicting")
        y_pred_hyr = predict_hyr(root, X_test).sort_index(ascending=True)

        fold_metrics.append(
            calc_metrics(y_test[on_label], y_pred_hyr[0], f"../results/hyr_clf_f1-{k}.csv")
        )

    return pd.concat(fold_metrics).mean()

In [None]:
%%time

# Cross-validate the hierarchical classifier with a linear SVM per tree node,
# using the defaults of hyr() (2 folds, evaluated on the "cluster" column).
accs = hyr(svm.LinearSVC(max_iter=2000), df, labels)

# `accs` holds the mean of every metric over the folds.
print("\nAVERAGE ACCURACY AND F1 SCORES:")
print(accs)

Start training
Root
Start predicting
accuracy: 0.9018932874354562
F1 micro-average: 0.9018932874354562
F1 macro-average: 0.8465664717429424
F1 weighted-average: 0.8997948064272631

Start training
Root
Start predicting
accuracy: 0.9020344287949922
F1 micro-average: 0.9020344287949922
F1 macro-average: 0.8420932014001663
F1 weighted-average: 0.9003196202455704


AVERAGE ACCURACY AND F1 SCORES:
accuracy               0.901964
F1 micro-average       0.901964
F1 macro-average       0.844330
F1 weighted-average    0.900057
dtype: float64
CPU times: user 8min 30s, sys: 1min 16s, total: 9min 46s
Wall time: 10min 52s
