In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import random
from sklearn import datasets, neighbors, metrics, tree, svm, preprocessing, model_selection, ensemble
from sklearn.model_selection import StratifiedKFold
from pprint import pprint


In [2]:
#%%time
#
## load labels
#labels = pd.read_csv('../data/Lauren/Labels.csv')
##labels.head() # to display the first 5 lines of loaded data
#
## load data
#df = pd.read_csv('../data/Lauren/500_PBMC_3p_LT_Chromium_X_metrics_summary.csv') # takes about 5min


In [3]:
#%%time
## filter out certain data
def gereral_data_filter(df, labels, filter_on, amount_higher_than):
    """Drop every row whose class (in ``labels[filter_on]``) has too few members.

    The function name keeps its original spelling so existing callers still work.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature table; must share its index with ``labels`` because the row
        mask is computed on ``labels`` and applied to both frames.
    labels : pandas.DataFrame
        Label table containing the column ``filter_on``.
    filter_on : str
        Name of the label column whose class sizes are checked.
    amount_higher_than : int
        Minimum number of rows a class needs in order to be kept (inclusive).

    Returns
    -------
    tuple (filtered_df, filtered_labels)
        ``filtered_df`` additionally has the ``"Unnamed: 0"`` names column
        removed when it is present.
    """
    # Count rows per class. ``size()`` counts rows directly, whereas ``count()``
    # on an arbitrary other column would skip that column's NaNs and fails when
    # ``labels`` has no other column at all.
    class_sizes = labels.groupby(filter_on).size()
    classes_to_keep = class_sizes[class_sizes >= amount_higher_than].index
    keep_indices = labels[filter_on].isin(classes_to_keep)

    # Delete entries that belong to a class that is too small; remove the names
    # column only if the frame actually has one.
    filtered_df = df[keep_indices].drop(columns=["Unnamed: 0"], errors="ignore")
    return (filtered_df, labels[keep_indices])

#df, labels = gereral_data_filter(df, labels, "cluster", 10)

In [4]:
%%time
# Load the feature table (df) and the label table (labels) from pickle files.
# NOTE(review): presumably these pickles were produced from the CSV load +
# filter steps that are commented out in the cells above — confirm.
# NOTE: unpickling can execute arbitrary code; only load files you created yourself.
df = pd.read_pickle("../data/Lauren/df.pkl")
labels = pd.read_pickle("../data/Lauren/labels.pkl")

CPU times: user 16 ms, sys: 2.48 s, total: 2.5 s
Wall time: 2.52 s


In [5]:
def train_test_linear_nn(df, labels, class_column_name): # todo: give linear classifier as argument
    """Fit a linear SVM on one stratified half of the data and score it on the other.

    Despite the ``_nn`` suffix the model is ``svm.LinearSVC``, not a neural network.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature table; rows are selected positionally with ``DataFrame.take``.
    labels : pandas.DataFrame
        Label table with one row per row of ``df``.
    class_column_name : str
        Name of the column in ``labels`` that is used as the target.

    Returns
    -------
    tuple (lin_clf, f1)
        The fitted classifier and the per-class F1 scores. The scores are in
        sorted label order, the same order as the printed table.
        ``None`` is returned implicitly if the splitter yields no split.
    """
    # only keep the needed column, flattened to a 1-D target array
    other_columns = [col for col in labels.columns if col != class_column_name]
    labels = labels.drop(columns=other_columns).values.ravel()

    # use this to split dataset in 2 parts, test and train
    skf = StratifiedKFold(n_splits=2, random_state=1337, shuffle=True)

    for train_index, test_index in skf.split(df, labels):
        # get train and test set
        X_train, X_test = df.take(train_index), df.take(test_index)
        y_train, y_test = labels[train_index], labels[test_index]

        # 1vRest training
        print("TRAIN:", train_index, "TEST:", test_index)
        print("Start fitting")
        lin_clf = svm.LinearSVC()
        lin_clf.fit(X_train, y_train)

        # predicting
        print("Start predicting")
        y_pred_lin_clf = lin_clf.predict(X_test)

        # metrics
        print(f"F1 macro-average: {metrics.f1_score(y_test, y_pred_lin_clf, average='macro')}")
        print(f"F1 weighted-average: {metrics.f1_score(y_test, y_pred_lin_clf, average='weighted')}")

        # f1_score(average=None) reports one score per label in SORTED order over
        # the labels present in y_test or the predictions. Compute that label
        # list explicitly so the table below pairs each score with its own label
        # (Series.unique() would give order of appearance instead).
        scored_labels = np.unique(np.concatenate([y_test, y_pred_lin_clf]))
        f1 = metrics.f1_score(y_test, y_pred_lin_clf, labels=scored_labels, average=None)
        print(f"accuracy: {metrics.accuracy_score(y_test, y_pred_lin_clf)}")

        unique_labels_df = pd.DataFrame(scored_labels)
        f1_df = pd.DataFrame(f1)
        print(pd.concat([unique_labels_df, f1_df], axis=1, keys=['dcluster', 'f1_per_dcluster']))

        # just do 1 iteration
        return (lin_clf, f1)
    
    
#drop_columns = filter(lambda col: col != "Class" , labels.columns)
#list(drop_columns)

In [8]:
def train_linear_nn(df, labels, class_column_name): # todo: give linear classifier as argument
    """Fit a ``svm.LinearSVC`` on all rows of ``df`` for a single label column.

    Parameters
    ----------
    df : feature table passed to ``fit`` unchanged.
    labels : DataFrame of label columns, one row per row of ``df``.
    class_column_name : name of the label column to use as the target.

    Returns
    -------
    The fitted ``LinearSVC`` instance.
    """
    # Strip every label column except the requested one and flatten to 1-D
    unwanted = [name for name in labels.columns if name != class_column_name]
    targets = labels.drop(columns=unwanted).to_numpy().ravel()

    # 1vRest training
    print(f"Start training {class_column_name} entries with multiclass output: {pd.Series(targets).unique()}")
    model = svm.LinearSVC()
    model.fit(df, targets)
    return model
    

In [9]:
%%time
# As a smoke test, fit the first classifier (a LinearSVC, see train_test_linear_nn)
# to divide the data into its top-level "Class" values and show the per-class F1 scores.
# NOTE(review): the recorded output of this cell is a NameError, i.e. the cell that
# defines train_test_linear_nn had not been run in that kernel session; run it first.
(clf, all_f1) = train_test_linear_nn(df, labels, "Class")
all_f1

NameError: name 'train_test_linear_nn' is not defined

In [10]:
# make tree structure
class Node:
    """One node of the class-hierarchy tree.

    Attributes are set in ``__init__``: ``parent`` (the parent node, or None
    for the root), ``class_name`` (the label this node stands for), ``clf``
    (initially None) and ``children`` (initially an empty dict).
    """

    def __init__(self, parent, class_name):
        self.parent, self.class_name = parent, class_name

        self.clf = None
        # dict that links a result of clf to a new node (which has its own clf)
        self.children = {}

    def __str__(self):
        # A node without a parent is shown simply as the root
        is_root = self.parent is None
        return "Root" if is_root else f"(class_name: {self.class_name}, parent: {self.parent})"

    def __repr__(self):
        return str(self)

In [11]:
def filter_data_on_class_name(df, labels, class_name, class_column_name):
    """Keep only the rows whose ``labels[class_column_name]`` equals ``class_name``.

    The same boolean row mask is applied to both frames.
    Returns the tuple ``(filtered_df, filtered_labels)``.
    """
    row_mask = labels[class_column_name].eq(class_name)
    selected_df = df[row_mask]
    selected_labels = labels[row_mask]
    return (selected_df, selected_labels)

def train_hyr_nn(df, labels, node, parent_class=None, parent_class_column=None):
    """Recursively attach classifiers and child nodes to ``node``.

    The columns of ``labels`` are treated as hierarchy levels from left to
    right. Called without ``parent_class`` / ``parent_class_column`` the first
    column is used; otherwise the data is narrowed to rows of ``parent_class``
    and the column right after ``parent_class_column`` is used.

    For the chosen level, a classifier from ``train_linear_nn`` is stored in
    ``node.clf`` unless exactly one distinct label is present (then ``clf`` is
    None). One child ``Node`` per distinct label is created, trained
    recursively and stored in ``node.children``.

    Returns None; the tree is built by mutating ``node``.
    """
    at_root = parent_class is None or parent_class_column is None
    if at_root:
        child_class_column = labels.columns[0]
    else:
        # Shrink the data: drop all entries that do not belong to parent_class
        df, labels = filter_data_on_class_name(df, labels, parent_class, parent_class_column)

        # The child level is the column right after the parent's column
        column_names = list(labels.columns)
        next_level = column_names.index(parent_class_column) + 1
        if next_level >= len(column_names):
            # Leaf of the hierarchy tree: there are no further child classes
            return None
        child_class_column = column_names[next_level]

    # Classifier that further splits the entries of this node
    unique_labels = pd.Series(labels[child_class_column]).unique()
    if len(unique_labels) != 1:
        print()
        print(f"parent_class: {parent_class}")
        node.clf = train_linear_nn(df, labels, child_class_column)
    else:
        # The only subclass coincides with the parent class: nothing to decide
        node.clf = None

    # Recursive step: build and train one child per label  (todo: parallelize)
    for child_class in unique_labels:
        child_node = Node(node, child_class)
        train_hyr_nn(df, labels, child_node, child_class, child_class_column)
        node.children[child_class] = child_node


In [19]:
%%time
# get part of data where all cluster types are represented
# (the folds are stratified on the "cluster" column of labels)

# 1-D array holding only the "cluster" column; used as the stratification target
drop_columns = list(filter(lambda col: col != "cluster" , labels.columns))
dclusters = labels.drop(columns = drop_columns).values.ravel()

# only use part (1/5) of data for training and 1/5th for testing:
# the held-out fifth (test_index) of the first split is used to train the tree,
# the held-out fifth of the second split becomes the test set, then the loop stops.
trained = False
skf = StratifiedKFold(n_splits=5, random_state=1337, shuffle=True)
for train_index, test_index in skf.split(df, dclusters):
    if not trained:
        print("TRAINING")
        # build the hierarchy tree in place, starting from an empty root node
        root = Node(None, "")
        train_hyr_nn(df.take(test_index), labels.take(test_index), root)
        trained = True
    else: 
        print("SETTING TEST SET")
        X_test_set = df.take(test_index)
        y_test_set = labels.take(test_index)
        break

# display the root node as the cell output
root

TRAINING

parent_class: None
Start training Class entries with multiclass output: ['GABAergic' 'Glutamatergic' 'Non-Neuronal']

parent_class: GABAergic
Start training Subclass entries with multiclass output: ['Lamp5' 'Sst' 'Vip' 'Sncg' 'Serpinf1' 'Pvalb']

parent_class: Lamp5
Start training cluster entries with multiclass output: ['Lamp5 Lsp1' 'Lamp5 Plch2 Dock5' 'Lamp5 Fam19a1 Tmem182' 'Lamp5 Krt73'
 'Lamp5 Ntn1 Npy2r' 'Lamp5 Lhx6' 'Lamp5 Fam19a1 Pax6']

parent_class: Sst
Start training cluster entries with multiclass output: ['Sst Tac1 Tacr3' 'Sst Hpse Sema3c' 'Sst Mme Fam114a1' 'Sst Nr2f2 Necab1'
 'Sst Myh8 Etv1 ' 'Sst Crh 4930553C11Rik ' 'Sst Tac2 Tacstd2'
 'Sst Rxfp1 Prdm8' 'Sst Chodl' 'Sst Chrna2 Ptgdr' 'Sst Esm1'
 'Sst Calb2 Pdlim5' 'Sst Myh8 Fibin' 'Sst Tac1 Htr1d' 'Sst Chrna2 Glra3'
 'Sst Hpse Cbln4' 'Sst Calb2 Necab1' 'Sst Rxfp1 Eya1' 'Sst Tac2 Myh4'
 'Sst Crhr2 Efemp1' 'Sst Nts']

parent_class: Vip
Start training cluster entries with multiclass output: ['Vip Igfbp6 Car10' 'V




parent_class: L5 IT
Start training cluster entries with multiclass output: ['L5 IT VISp Hsd11b1 Endou' 'L5 IT VISp Batf3' 'L5 IT VISp Col27a1'
 'L5 IT VISp Col6a1 Fezf2' 'L5 IT VISp Whrn Tox2']

parent_class: NP
Start training cluster entries with multiclass output: ['L5 NP VISp Trhr Met' 'L5 NP VISp Trhr Cpne7']

parent_class: L6b
Start training cluster entries with multiclass output: ['L6b P2ry12' 'L6b VISp Col8a1 Rxfp1' 'L6b VISp Mup5'
 'L6b VISp Col8a1 Rprm' 'L6b VISp Crh']
SETTING TEST SET
CPU times: user 1min 26s, sys: 10.5 s, total: 1min 37s
Wall time: 1min 37s


Root

In [13]:
# Inspect the child nodes registered under Non-Neuronal -> Astro
root.children["Non-Neuronal"].children["Astro"].children

{'Astro Aqp4': (class_name: Astro Aqp4, parent: (class_name: Astro, parent: (class_name: Non-Neuronal, parent: Root)))}

In [59]:
# Inspect one branch of the tree: the child nodes of GABAergic -> Lamp5
# and the classifier object stored on that node.
# pprint(root.children)
# pprint(root.children["GABAergic"].children)
pprint(root.children["GABAergic"].children["Lamp5"].children)
pprint(root.children["GABAergic"].children["Lamp5"].clf)

{'Lamp5 Fam19a1 Pax6': (class_name: Lamp5 Fam19a1 Pax6, parent: (class_name: Lamp5, parent: (class_name: GABAergic, parent: Root))),
 'Lamp5 Fam19a1 Tmem182': (class_name: Lamp5 Fam19a1 Tmem182, parent: (class_name: Lamp5, parent: (class_name: GABAergic, parent: Root))),
 'Lamp5 Krt73': (class_name: Lamp5 Krt73, parent: (class_name: Lamp5, parent: (class_name: GABAergic, parent: Root))),
 'Lamp5 Lhx6': (class_name: Lamp5 Lhx6, parent: (class_name: Lamp5, parent: (class_name: GABAergic, parent: Root))),
 'Lamp5 Lsp1': (class_name: Lamp5 Lsp1, parent: (class_name: Lamp5, parent: (class_name: GABAergic, parent: Root))),
 'Lamp5 Ntn1 Npy2r': (class_name: Lamp5 Ntn1 Npy2r, parent: (class_name: Lamp5, parent: (class_name: GABAergic, parent: Root))),
 'Lamp5 Plch2 Dock5': (class_name: Lamp5 Plch2 Dock5, parent: (class_name: Lamp5, parent: (class_name: GABAergic, parent: Root)))}
LinearSVC()


In [16]:
# Show the top-level nodes registered directly under the root
pprint(root.children)

{'GABAergic': (class_name: GABAergic, parent: Root),
 'Glutamatergic': (class_name: Glutamatergic, parent: Root),
 'Non-Neuronal': (class_name: Non-Neuronal, parent: Root)}


In [None]:
%%time 
# given the hyr nn tree and an input, predict the cluster

def predict(node, X_test):
    """Predict the most specific label for every row of ``X_test`` by walking the tree.

    Starting at ``node``, each row is routed to the child whose key equals the
    label predicted by ``node.clf``; this repeats until a node without children
    is reached, whose ``class_name`` becomes the row's prediction.

    Parameters
    ----------
    node : object with ``class_name``, ``clf`` and ``children`` attributes
        (e.g. a ``Node`` built by ``train_hyr_nn``).
    X_test : pandas.DataFrame
        Feature rows to classify.

    Returns
    -------
    pandas.Series
        One predicted label per row, indexed like ``X_test``.

    Raises
    ------
    ValueError
        If a node has several children but no classifier to choose between them.
    """
    n_rows = len(X_test)

    # we are in a leaf (or have no rows): the node itself is the answer
    if n_rows == 0 or not node.children:
        return pd.Series([node.class_name] * n_rows, index=X_test.index, dtype=object)

    if node.clf is None:
        # No classifier is stored when there was only one possible child,
        # so every row goes to that single child.
        if len(node.children) != 1:
            raise ValueError(f"node {node} has {len(node.children)} children but no classifier")
        only_label = next(iter(node.children))
        y_test = np.full(n_rows, only_label, dtype=object)
    else:
        y_test = np.asarray(node.clf.predict(X_test), dtype=object)

    # Default to this level's prediction; refined below wherever a child exists.
    result = y_test.copy()
    for label, child_node in node.children.items():
        routed = y_test == label
        if routed.any():
            result[routed] = predict(child_node, X_test[routed]).to_numpy()

    return pd.Series(result, index=X_test.index, dtype=object)
    
    

In [56]:
# Scratch cell. The node lookup below is not the cell's last expression,
# so its value is not displayed.
root.children["GABAergic"]


# NOTE(review): in the visible notebook l1_res_df is only assigned in the later
# cell that calls root.clf.predict(X_test_set); on a fresh kernel this cell
# fails unless that cell has been run first.
l1_res_df.index = X_test_set.index
# Rows of the test set whose level-1 result is "GABAergic"
X_test_set[l1_res_df[0] == "GABAergic"]

Unnamed: 0,0610005C13Rik,0610006L08Rik,0610007P14Rik,0610009B22Rik,0610009E02Rik,0610009L18Rik,0610009O20Rik,0610010B08Rik,0610010F05Rik,0610010K14Rik,...,Zxdb,Zxdc,Zyg11a,Zyg11b,Zyx,Zzef1,Zzz3,a,l7Rn6,n-R5s136
7,0,0,166,60,0,24,3,0,16,16,...,0,55,0,105,44,15,14,0,128,0
10,0,0,181,62,0,0,0,0,13,50,...,0,24,0,180,90,88,1,0,59,0
12,0,0,244,0,0,0,0,0,477,40,...,56,31,0,219,186,76,237,0,57,0
16,0,0,178,163,0,0,5,0,50,79,...,0,0,0,42,0,3,0,0,265,0
18,0,0,315,100,0,0,7,0,31,50,...,71,0,0,45,24,122,9,0,172,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12799,0,0,145,66,0,0,0,0,14,32,...,0,0,0,21,0,3,81,0,99,0
12803,0,0,137,141,0,5,7,0,152,63,...,0,46,0,45,85,43,0,0,57,0
12810,0,0,154,199,0,35,0,0,150,94,...,0,0,0,85,51,0,0,0,84,0
12819,0,0,84,156,0,0,24,0,61,73,...,0,13,0,23,0,17,0,0,151,0


In [60]:
# This is how 1 branch of the hyr tree, all the way down, should be predicted
# (only .predict is called here, using the classifiers stored on the tree nodes).

# Level 1: predict Class for the whole test set with the root classifier
l1_res_df = pd.DataFrame(root.clf.predict(X_test_set))
l1_res_df.index = X_test_set.index # set the same indexes before further filtering
pprint(l1_res_df) # aha! this keeps the old indices

# Level 2: keep all entries that were predicted as Class "GABAergic" and predict their Subclass
l2_input = X_test_set[l1_res_df[0] == "GABAergic"]
l2_res_df = pd.DataFrame(root.children["GABAergic"].clf.predict(l2_input))
l2_res_df.index = l2_input.index
pprint(l2_res_df) # aha! this keeps the old indices

# Level 3: keep all entries that were predicted as Subclass "Lamp5" and finally predict their cluster
l3_input = l2_input[l2_res_df[0] == "Lamp5"]
l3_res_df = pd.DataFrame(root.children["GABAergic"].children["Lamp5"].clf.predict(l3_input))
l3_res_df.index = l3_input.index
pprint(l3_res_df) # aha! this keeps the old indices

                   0
7          GABAergic
10         GABAergic
12         GABAergic
16         GABAergic
18         GABAergic
...              ...
12818  Glutamatergic
12819      GABAergic
12820      GABAergic
12824  Glutamatergic
12825  Glutamatergic

[2556 rows x 1 columns]
           0
7      Lamp5
10       Vip
12       Vip
16     Lamp5
18     Lamp5
...      ...
12799    Sst
12803    Sst
12810  Pvalb
12819    Sst
12820    Sst

[1129 rows x 1 columns]
                       0
7            Lamp5 Krt73
16            Lamp5 Lsp1
18            Lamp5 Lsp1
38      Lamp5 Ntn1 Npy2r
56            Lamp5 Lsp1
...                  ...
12471   Lamp5 Ntn1 Npy2r
12472   Lamp5 Ntn1 Npy2r
12475         Lamp5 Lsp1
12491  Lamp5 Plch2 Dock5
12495  Lamp5 Plch2 Dock5

[218 rows x 1 columns]
