# Comparing hyr classification with flat classification

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import random
from sklearn import datasets, neighbors, metrics, tree, svm, preprocessing, model_selection, ensemble
from sklearn.base import clone as sklearn_clone
from sklearn.model_selection import StratifiedKFold
from pprint import pprint

## Loading the data
This data has been preprocessed in pickle.ipynb and stored in .pkl files.
This makes for much faster loading

In [2]:
%%time
df = pd.read_pickle("../data/Lauren/covid_df.pkl")
labels = pd.read_pickle("../data/Lauren/covid_labels.pkl")


CPU times: user 0 ns, sys: 1.88 s, total: 1.88 s
Wall time: 1.92 s


## Metrics

In [3]:
def calc_metrics(y_test, y_pred, f1_file_name=None):
    unique_labels_df = pd.DataFrame(pd.Series(y_test).unique())
    f1 = pd.DataFrame(metrics.f1_score(y_test, y_pred, average=None, labels = unique_labels_df[0]))
    acc = metrics.accuracy_score(y_test, y_pred)

    print(f"accuracy: {acc}")
    f1_micro = metrics.f1_score(y_test, y_pred, average='micro')
    f1_macro = metrics.f1_score(y_test, y_pred, average='macro')
    f1_weighted = metrics.f1_score(y_test, y_pred, average='weighted')
    print(f"F1 micro-average: {f1_micro}")
    print(f"F1 macro-average: {f1_macro}")
    print(f"F1 weighted-average: {f1_weighted}")
    print()
    
    # seperate f1 score for each label
    f1_labeled = pd.concat([unique_labels_df[0], f1[0]], axis=1, keys=['class', 'f1_per_class'])
    # print(f1_labeled)
    
    # save seperate f1 scores
    if f1_file_name is not None:
        f1_labeled.to_csv(f1_file_name, index=False)
    
    return pd.DataFrame([[acc, f1_micro, f1_macro, f1_weighted]], columns=["accuracy", "F1 micro-average", "F1 macro-average", "F1 weighted-average"])
    

## Flat classification

In [4]:
def flatify(multi_y):
    try:
        if multi_y.shape[1] == 1:
            return multi_y
    except:
        return multi_y

    def most_specific_class(row):
        empty_found = False
        for i in range(len(row)):
            if row.iloc[i] is None or row.iloc[i] == "":
                empty_found = True
                break
        return row.iloc[i-1] if empty_found else row.iloc[i]
    return multi_y.apply(lambda row: most_specific_class(row), axis=1)

In [5]:
# train a flat multi label classifier
# df: X
# labels: multiple columns
# class_column_name: the name of the column in labels that will be y
def train_flat(clf, X_train, labels, class_column_name=None): # todo: give linear classifier as argument
    if class_column_name is None:
        y_train = flatify(labels)
    else:
        # only keep the needed column
        drop_columns = filter(lambda col: col != class_column_name , labels.columns)
        y_train = labels.drop(columns = drop_columns).values.ravel()
    
    # copy the model, important for hierarchical
    clf = sklearn_clone(clf)
    clf.fit(X_train, y_train)
    
    return clf
    

In [6]:
def flat(clf, X, y, folds=2, folds_random_state=1337):
# use this to split dataset in [folds] parts, test and train
# folds = 5
# folds_random_state = 1337
# y = labels
# X = df
# clf = svm.LinearSVC(max_iter=2000)

    skf = StratifiedKFold(n_splits=folds, random_state=folds_random_state, shuffle=folds_random_state is not None)

    k = 0
    accs = pd.DataFrame()
    y_flat = flatify(y)
    for train_index, test_index in skf.split(X, y_flat):

        # get train and test set
        X_train, X_test = X.take(train_index), X.take(test_index)
        y_train, y_test = y_flat.take(train_index), y_flat.take(test_index)

        # train the flat classifier
        print(f"Start training   fold {k}")
        flat_clf = train_flat(clf, X_train, y_train)

        # predicting
        print(f"Start predicting fold {k}")
        y_pred_flat = flat_clf.predict(X_test)

        # metrics
        accs = accs.append(calc_metrics(y_test, y_pred_flat, f"../results/flat_clf_f1-{k}.csv"))

        # break

        k += 1

    return accs.mean()

In [7]:
%%time
acc = flat(svm.LinearSVC(max_iter=10000), df, labels, folds=2)

print("\nAVERAGE ACCURACY AND F1 SCORES:")
print(acc)

Start training   fold 0




Start predicting fold 0
accuracy: 0.661954261954262
F1 micro-average: 0.661954261954262
F1 macro-average: 0.47913315198013023
F1 weighted-average: 0.6541929299806176

Start training   fold 1




Start predicting fold 1
accuracy: 0.655093555093555
F1 micro-average: 0.655093555093555
F1 macro-average: 0.4754821522417491
F1 weighted-average: 0.6489203160856086


AVERAGE ACCURACY AND F1 SCORES:
accuracy               0.658524
F1 micro-average       0.658524
F1 macro-average       0.477308
F1 weighted-average    0.651557
dtype: float64
CPU times: user 16min 53s, sys: 7.77 s, total: 17min 1s
Wall time: 16min 50s


## Hyr classification

In [8]:
# make tree structure
class Node:
    def __init__(self, parent, class_name):
        self.parent = parent
        self.class_name = class_name
        
        self.clf = None
        self.children = dict() # dict die resultaat van clf linkt aan een nieuwe node (met clf)
        
    def __str__(self):
        if self.parent is None:
            return "Root"
        return f"(class_name: {self.class_name}, parent: {self.parent})"
    
    def __repr__(self):
        return self.__str__()

In [9]:
## hyr training function
## this builds the hyr tree using the flat classification function train_flat many times
## the hyr tree consists of nodes which each have (except leaf nodes) a flat classifier which classifies into it's children

def filter_data_on_class_name(df, labels, class_name, class_column_name):
    keep_indices = labels[class_column_name] == class_name
    return (df[keep_indices], labels[keep_indices])

def train_hyr(clf, df, labels, node, parent_class=None, parent_class_column=None):
    # train neural net to classify input in the child classes
    
    # get the child_class_column
    if (parent_class is None or parent_class_column is None):
        child_class_column = labels.columns[0]
    else:
        # make data smaller: remove all entries that do not belong to the parent_class
        
        df, labels = filter_data_on_class_name(df, labels, parent_class, parent_class_column)
        
        # get child_class_column
        child_class_column_index = list(labels.columns).index(parent_class_column) + 1
        if child_class_column_index >= len(labels.columns):
            # we are at in a leaf of the hyr tree, there are no further child classes
            return None
        child_class_column = labels.columns[child_class_column_index]
    
    
    # neural net that further classifies entries
    unique_labels = pd.Series(labels[child_class_column]).unique()
    if len(unique_labels) == 1:
        # the subclass is the same as the parent class
        node.clf = None
    else:
        # print()
        # print(f"parent_class: {parent_class}")
        node.clf = train_flat(clf, df, labels, child_class_column)
    
    # recursive step
    for child_class in unique_labels: # todo: parallelize
        child_node = Node(node, child_class)
        train_hyr(clf, df, labels, child_node, child_class, child_class_column)
        node.children[child_class] = child_node


In [10]:
## hyr predicting function
## given the hyr tree and an input, predict the cluster

# recursive
def predict_hyr(node, X_test):
    #### Printing
    spaces = 1
    it_node = node
    while it_node.parent is not None:
        it_node = it_node.parent
        spaces += 2
    # print((spaces*"--") + f"{node.class_name if node.parent is not None else 'Root' }")
    ####
    
    # the tree goes further down, but there is only 1 subclass and thus no further classifier needs to be executed
    if node.clf is None:
        child_node = list(node.children.values())[0]
        if child_node.class_name != "":
            name = child_node.class_name
        elif node.class_name != "":
            name = node.class_name
        else:
            name = node.parent.class_name
        y_test = pd.DataFrame(index=X_test.index, columns=[0]).fillna(name)
        #print(child_node.class_name)
    else:
        y_test = pd.DataFrame(node.clf.predict(X_test))
        y_test.index = X_test.index # keep original indices
    
    # we are in a leaf when the children dont have any children themselves
    # (We dont need to call predict on a child if they wont be able to futher classify to their children
    if list(node.children.values())[0].children == {} or len(list(node.children.keys())) == 1:
        return y_test
    else:
        # the children do have a clf to further classify, so further classify
        predictions = []
        for label, child_node in node.children.items():
            new_X_test = X_test[y_test[0] == label]
            predictions.append(predict_hyr(child_node, new_X_test))
        return pd.concat(predictions)
    
    

In [11]:
def hyr(clf, X, y, on_label=None, folds=2, folds_random_state=1337):
# use this to split dataset in 2 parts, test and train
# folds = 5
# folds_random_state = 1337
# y = labels.fillna("")
# X = df
# clf = svm.LinearSVC(max_iter=2000)
# df.index = labels.index

    skf = StratifiedKFold(n_splits=folds, random_state=folds_random_state, shuffle=folds_random_state is not None)

    k = 0
    accs = pd.DataFrame()
    y_flat = flatify(y)
    for train_index, test_index in skf.split(X, y_flat):

        # get train and test set
        X_train, X_test = X.take(train_index), X.take(test_index)
        y_train, y_test = y.take(train_index), y.take(test_index)

        # train the flat classifier
        print("Start training")
        root = Node(None, "")
        train_hyr(clf, X_train, y_train, root)

        print(root)
        # predicting
        print("Start predicting")
        y_pred_hyr = predict_hyr(root, X_test).sort_index(ascending=True) # sort data to calculate metrics

        # metrics
        accs = accs.append(calc_metrics(flatify(y_test), y_pred_hyr[0], f"../results/hyr_clf_f1-{k}.csv"))
        k += 1
        # break

    return accs.mean()

In [12]:
%%time
df.index = labels.index
accs = hyr(svm.LinearSVC(max_iter=4000), df, labels.fillna(""))

print("\nAVERAGE ACCURACY AND F1 SCORES:")
print(accs)

Start training




Root
Start predicting
accuracy: 0.6693347193347193
F1 micro-average: 0.6693347193347193
F1 macro-average: 0.4811042365138301
F1 weighted-average: 0.6691812824523582

Start training




Root
Start predicting
accuracy: 0.6566528066528067
F1 micro-average: 0.6566528066528067
F1 macro-average: 0.46832613666563266
F1 weighted-average: 0.6588303452197658


AVERAGE ACCURACY AND F1 SCORES:
accuracy               0.662994
F1 micro-average       0.662994
F1 macro-average       0.474715
F1 weighted-average    0.664006
dtype: float64
CPU times: user 11min 7s, sys: 1min 18s, total: 12min 26s
Wall time: 11min 47s


In [10]:
flatify(y_test)

0                Neutrophils
3                    exhaust
7                   Alveolar
16                Unspec-CD8
19                 recruited
                ...         
19258              Undefined
19260                Cytotox
19268                 naive1
19279            Neutrophils
19280    recruited-activated
Length: 3848, dtype: object

In [11]:
calc_metrics(flatify(y_test), y_pred_hyr)

ValueError: Found input variables with inconsistent numbers of samples: [3848, 832]

In [27]:
y_pred_hyr = predict_hyr(root, X_test).sort_index(ascending=True) # sort data to calculate metrics

    # metrics
accs = accs.append(calc_metrics(flatify(y_test), y_pred_hyr[0], f"../results/hyr_clf_f1-{k}.csv"))
    

accuracy: 0.6673596673596673
F1 micro-average: 0.6673596673596673
F1 macro-average: 0.46389954870517325
F1 weighted-average: 0.669690010943891



In [28]:
y_pred_hyr

Unnamed: 0,0
0,Neutrophils
3,exhaust
7,Alveolar
16,Unspec-CD8
19,recruited
...,...
19258,Neutrophils
19260,exhaust
19268,naive1
19279,Neutrophils


In [14]:
flatify(y_test)[31]

'CD4'

In [13]:
y_pred_hyr

Unnamed: 0,0
31,
74,
117,Unspec-CD4-1
139,
172,naive1
...,...
19073,naive2
19115,naive1
19117,Unspec-CD4-1
19123,
