# Comparing hierarchical classification with flat classification

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import random
from sklearn import datasets, neighbors, metrics, tree, svm, preprocessing, model_selection, ensemble
from sklearn.model_selection import StratifiedKFold
from pprint import pprint


## Loading the data
This data has been preprocessed in `pickle.ipynb` and stored in `.pkl` files,
which makes loading much faster.

In [None]:
%%time
# Load the preprocessed inputs: `df` is the feature matrix (X) and `labels`
# holds the label columns, one of which ("cluster") is used as y further down.
# NOTE(review): read_pickle can execute arbitrary code while unpickling; only
# load .pkl files that were produced by a trusted source.
df = pd.read_pickle("../data/Lauren/df.pkl")
labels = pd.read_pickle("../data/Lauren/labels.pkl")

CPU times: user 0 ns, sys: 2.47 s, total: 2.47 s
Wall time: 2.49 s


## Metrics

In [7]:
def calc_metrics(y_test, y_pred, f1_file_name=None):
    """Print accuracy and F1 scores for a set of predictions.

    Parameters
    ----------
    y_test : array-like
        True class labels.
    y_pred : array-like
        Predicted class labels, in the same order as ``y_test``.
    f1_file_name : str or None, optional
        When given, the per-class F1 table is also written to this path as
        CSV (without the index column).

    Notes
    -----
    Prints the accuracy, the micro/macro/weighted averaged F1 scores and a
    table with one F1 score per class. The per-class table covers the classes
    that occur in ``y_test``, in order of first appearance. Nothing is
    returned.
    """
    unique_labels = pd.Series(y_test).unique()
    f1_per_class = metrics.f1_score(y_test, y_pred, average=None, labels=unique_labels)
    acc = metrics.accuracy_score(y_test, y_pred)

    print(f"accuracy: {acc}")
    # Score against the y_test argument. Relying on a notebook-level variable
    # here would silently evaluate against whatever test set happens to be
    # defined (or fail when none is).
    for average in ("micro", "macro", "weighted"):
        print(f"F1 {average}-average: {metrics.f1_score(y_test, y_pred, average=average)}")
    print()

    # separate f1 score for each label
    f1_labeled = pd.DataFrame({"class": unique_labels, "f1_per_class": f1_per_class})
    print(f1_labeled)

    # save separate f1 scores
    if f1_file_name is not None:
        f1_labeled.to_csv(f1_file_name, index=False)
    

## Flat classification

In [8]:
def train_linear_nn(df, labels, class_column_name, clf=None):
    """Fit a multiclass classifier on a single label column.

    Despite the name, the default model is a linear SVM (``svm.LinearSVC``),
    not a neural network.

    Parameters
    ----------
    df : DataFrame
        Feature matrix X, passed to ``fit`` unchanged.
    labels : DataFrame
        Label table with one or more columns and the same number of rows as
        ``df``; only ``class_column_name`` is used.
    class_column_name : str
        Name of the column in ``labels`` that is used as y.
    clf : estimator or None, optional
        Unfitted estimator exposing ``fit(X, y)``. Defaults to a new
        ``svm.LinearSVC()``. The estimator is fitted in place, so pass a
        fresh instance for every call.

    Returns
    -------
    The fitted classifier.
    """
    # only the requested column is the target; other label columns are ignored
    y_train = labels[class_column_name].to_numpy()
    X_train = df

    if clf is None:
        clf = svm.LinearSVC()

    print(f"Start training {class_column_name} entries with multiclass output: {pd.Series(y_train).unique()}")
    clf.fit(X_train, y_train)

    return clf
    

In [20]:
%%time
## train on half the dataset, test on the other and calculate the metrics

# use this to split dataset in 2 parts, test and train
skf = StratifiedKFold(n_splits=2, random_state=1337, shuffle=True)

for train_index, test_index in skf.split(df, labels["cluster"]):

    # get train and test set
    X_train, X_test = df.take(train_index), df.take(test_index)
    y_train, y_test = labels.take(train_index), labels.take(test_index)

    # train the flat classifier
    print("Start training")
    flat_clf = train_linear_nn(X_train, y_train, "cluster")

    # predicting
    print("Start predicting")
    y_pred_flat_clf = flat_clf.predict(X_test)

    # metrics
    calc_metrics(y_test["cluster"], y_pred_flat_clf, "../results/flat_clf_f1.csv")
    
    break 

Start training
Start training cluster entries with multiclass output: ['Vip Arhgap36 Hmcn1' 'Vip Crispld2 Htr2c' 'Lamp5 Plch2 Dock5'
 'Sncg Vip Itih5' 'Vip Crispld2 Kcne4' 'Vip Lect1 Oxtr' 'Lamp5 Lsp1'
 'Vip Chat Htr1f' 'Vip Pygm C1ql1' 'Lamp5 Krt73' 'Pvalb Tpbg'
 'Lamp5 Fam19a1 Tmem182' 'Lamp5 Fam19a1 Pax6' 'Vip Igfbp6 Car10'
 'Lamp5 Ntn1 Npy2r' 'Vip Igfbp6 Pltp' 'Pvalb Reln Tac1' 'Sst Chrna2 Ptgdr'
 'Sst Hpse Cbln4' 'Sst Hpse Sema3c' 'Vip Igfbp4 Mab21l1' 'Pvalb Vipr2'
 'Sst Rxfp1 Prdm8' 'Sst Nr2f2 Necab1' 'Pvalb Calb1 Sst' 'Sst Chrna2 Glra3'
 'Sncg Gpr50' 'Pvalb Gabrg1' 'L6 CT VISp Nxph2 Wls' 'L6 CT VISp Ctxn3 Sla'
 'L6 CT VISp Krt80 Sla' 'L6 CT VISp Gpr139' 'L6 CT VISp Ctxn3 Brinp3'
 'L6b VISp Col8a1 Rxfp1' 'L6 IT VISp Penk Col27a1' 'L6 IT VISp Penk Fst'
 'L6 IT VISp Col23a1 Adamts2' 'Sst Crhr2 Efemp1' 'L2/3 IT VISp Adamts2'
 'L2/3 IT VISp Rrad' 'Sst Tac1 Tacr3' 'L2/3 IT VISp Agmat'
 'Sst Calb2 Pdlim5' 'Sst Rxfp1 Eya1' 'Lamp5 Lhx6' 'Sst Chodl'
 'Sst Tac1 Htr1d' 'Vip Ptprt Pkp2' 'Snc



Start predicting


NameError: name 'calc_metrix' is not defined

In [None]:
# make tree structure
class Node:
    """One node of the class hierarchy used by the hierarchical classifier.

    Attributes are plain instance fields:
      parent     -- the parent Node, or None for the root
      class_name -- the label this node stands for
      clf        -- classifier attached to this node, None until one is set
      children   -- dict linking a result of ``clf`` to the next Node
                    (which may carry its own classifier)
    """

    def __init__(self, parent, class_name):
        self.parent, self.class_name = parent, class_name

        # filled in later by the training code
        self.clf = None
        self.children = {}

    def __str__(self):
        # the root has no parent and prints as a fixed marker; any other node
        # prints its label followed by the (recursive) text of its parent
        is_root = self.parent is None
        return "Root" if is_root else f"(class_name: {self.class_name}, parent: {self.parent})"

    def __repr__(self):
        return str(self)

In [None]:
def filter_data_on_class_name(df, labels, class_name, class_column_name):
    """Keep only the rows whose label in ``class_column_name`` equals ``class_name``.

    The same boolean row mask is applied to both ``df`` and ``labels``.
    Returns a ``(filtered_df, filtered_labels)`` tuple.
    """
    row_mask = labels[class_column_name] == class_name
    filtered_df = df[row_mask]
    filtered_labels = labels[row_mask]
    return (filtered_df, filtered_labels)

def train_hyr_nn(df, labels, node, parent_class=None, parent_class_column=None):
    """Recursively attach classifiers to a tree of ``Node`` objects.

    The column order of ``labels`` defines the hierarchy levels: the first
    column is classified at the root, and every following column refines the
    one before it. Classifiers are fitted with ``train_linear_nn``.

    Parameters
    ----------
    df : DataFrame
        Feature rows available at this node.
    labels : DataFrame
        Label columns for the same rows as ``df``.
    node : Node
        Node to populate; its ``clf`` and ``children`` are set in place.
    parent_class, parent_class_column : optional
        Label value and label column that ``node`` represents. When either is
        None the call is treated as the root call.

    Returns
    -------
    None. A call for the last label column returns before touching ``node``,
    which leaves that node without children.
    """
    is_root_call = parent_class is None or parent_class_column is None

    if is_root_call:
        # the root distinguishes the values of the first label column
        child_class_column = labels.columns[0]
    else:
        # narrow the data down to the rows that belong to the parent class
        df, labels = filter_data_on_class_name(df, labels, parent_class, parent_class_column)

        # the level below the parent is simply the next label column
        column_names = list(labels.columns)
        next_position = column_names.index(parent_class_column) + 1
        if next_position >= len(column_names):
            # the parent column is the last one: this node is a leaf
            return None
        child_class_column = column_names[next_position]

    child_classes = pd.Series(labels[child_class_column]).unique()
    if len(child_classes) != 1:
        print()
        print(f"parent_class: {parent_class}")
        node.clf = train_linear_nn(df, labels, child_class_column)
    else:
        # a single possible subclass needs no classifier
        node.clf = None

    # descend into every subclass that occurs in the data; todo: parallelize
    for child_class in child_classes:
        child_node = Node(node, child_class)
        train_hyr_nn(df, labels, child_node, child_class, child_class_column)
        node.children[child_class] = child_node


In [None]:
%%time
# get part of data where all cluster types are represented

drop_columns = list(filter(lambda col: col != "cluster" , labels.columns))
dclusters = labels.drop(columns = drop_columns).values.ravel()

# only use part (1/5) of data for training and 1/5th for testing
trained = False
skf = StratifiedKFold(n_splits=2, random_state=1337, shuffle=True)
for train_index, test_index in skf.split(df, dclusters):
    if not trained:
        print("TRAINING")
        root = Node(None, "")
        train_hyr_nn(df.take(test_index), labels.take(test_index), root)
        trained = True
    else: 
        print("SETTING TEST SET")
        X_test_set = df.take(test_index)
        y_test_set = labels.take(test_index)
        break

root

In [None]:
# given the hyr nn tree and an input, predict the cluster

# recursive
def predict(node, X_test):
    """Predict the deepest-level label for every row of ``X_test``.

    Starting at ``node``, each row is classified by the node's classifier
    and then handed to the child node for the predicted label, until the level
    just above the leaves is reached.

    Parameters
    ----------
    node : Node
        Subtree root; must have at least one child.
    X_test : DataFrame
        Rows to classify.

    Returns
    -------
    DataFrame
        Single column ``0`` holding the predicted label, indexed like the rows
        of ``X_test``. Rows come back grouped per branch, not in input order.
        For empty input an empty frame with column ``0`` is returned.
    """
    # Nothing to classify: return early instead of handing zero rows to an
    # estimator (scikit-learn rejects empty input).
    if len(X_test) == 0:
        return pd.DataFrame(index=X_test.index, columns=[0])

    #### Printing: trace the visited node, indented by its depth in the tree
    depth = 0
    ancestor = node
    while ancestor.parent is not None:
        ancestor = ancestor.parent
        depth += 1
    print(((1 + 2 * depth) * "--") + f"{node.class_name if node.parent is not None else 'Root'}")
    ####

    children = list(node.children.values())

    if node.clf is None:
        # there is only one subclass below this node, so every row gets that
        # label without running a classifier
        y_test = pd.DataFrame({0: children[0].class_name}, index=X_test.index)
    else:
        y_test = pd.DataFrame(node.clf.predict(X_test))
        y_test.index = X_test.index  # keep original indices

    # The children are leaves when they have no children themselves; they
    # cannot classify any further, so this node's labels are final.
    if children[0].children == {}:
        return y_test

    # Otherwise route every row to the child matching its predicted label.
    predictions = []
    for label, child_node in node.children.items():
        rows_for_child = X_test[y_test[0] == label]
        if rows_for_child.empty:
            # no row was assigned to this branch; recursing would make the
            # child's classifier predict on an empty frame
            continue
        predictions.append(predict(child_node, rows_for_child))
    return pd.concat(predictions)
    
    

In [None]:
%%time 
y_pred_hyr = predict(root, X_test_set).sort_index(ascending=True)

In [None]:
# Score the hierarchical predictions against the true cluster labels.

# The label list has to exist before f1_score uses it as `labels=`; building it
# afterwards raises a NameError on a fresh kernel.
unique_labels_df = pd.DataFrame(pd.Series(y_test_set["cluster"]).unique())

f1 = metrics.f1_score(y_test_set["cluster"], y_pred_hyr[0], average=None, labels=unique_labels_df[0])
acc = metrics.accuracy_score(y_test_set["cluster"], y_pred_hyr[0])

# one row per cluster: its name next to its F1 score
f1_df = pd.DataFrame(f1)
f1_df_labeled = pd.concat([unique_labels_df, f1_df], axis=1, keys=['dcluster', 'f1_per_dcluster'])
print(f1_df_labeled)
print(f"acc: {acc}")
print()
print(f"F1 micro-average: {metrics.f1_score(y_test_set['cluster'], y_pred_hyr[0], average='micro')}")
print(f"F1 macro-average: {metrics.f1_score(y_test_set['cluster'], y_pred_hyr[0], average='macro')}")
print(f"F1 weighted-average: {metrics.f1_score(y_test_set['cluster'], y_pred_hyr[0], average='weighted')}")