In [18]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import random
from sklearn import datasets, neighbors, metrics, tree, svm, preprocessing, model_selection, ensemble
from sklearn.model_selection import StratifiedKFold
from pprint import pprint


In [2]:
#%%time
#
## load labels
#labels = pd.read_csv('../data/Lauren/Labels.csv')
##labels.head() # to display the first 5 lines of loaded data
#
## load data
#df = pd.read_csv('../data/Lauren/500_PBMC_3p_LT_Chromium_X_metrics_summary.csv') # takes about 5min


In [3]:
#%%time
## filter out certain data
def gereral_data_filter(df, labels, filter_on, amount_higher_than):
    """Drop every sample whose class in ``labels[filter_on]`` is too rare.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature table. Rows are selected with a boolean mask built from
        ``labels``, so it has to share its index with ``labels``.
    labels : pandas.DataFrame
        Label table containing the column ``filter_on``.
    filter_on : str
        Name of the label column whose class sizes are checked.
    amount_higher_than : int
        Minimum number of rows a class needs in order to be kept. The
        comparison is inclusive (``>=``), despite the parameter name.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``df`` restricted to the kept rows, with the ``"Unnamed: 0"`` column
        removed when it is present, and ``labels`` restricted to the same rows.
    """
    # size() counts rows per class. Counting the non-null entries of the last
    # label column instead would under-count classes that have NaNs there and
    # would fail outright when filter_on is the only column.
    class_sizes = labels.groupby(filter_on).size()
    classes_to_keep = class_sizes[class_sizes >= amount_higher_than].index
    keep_indices = labels[filter_on].isin(classes_to_keep)

    # "Unnamed: 0" is presumably the sample-name column left over from
    # read_csv; errors="ignore" keeps the filter usable on frames without it.
    filtered_df = df[keep_indices].drop(columns=["Unnamed: 0"], errors="ignore")
    return (filtered_df, labels[keep_indices])

#df, labels = gereral_data_filter(df, labels, "cluster", 10)

In [4]:
%%time
# Load the feature table (df) and the label table (labels) from pickle files.
# NOTE(review): presumably these pickles hold the filtered output of the
# commented-out CSV loading/filtering cells above -- confirm.
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
df = pd.read_pickle("../data/Lauren/df.pkl")
labels = pd.read_pickle("../data/Lauren/labels.pkl")

CPU times: user 0 ns, sys: 1.71 s, total: 1.71 s
Wall time: 1.7 s


In [5]:
def train_test_linear_nn(df, labels, class_column_name): # todo: give linear classifier as argument
    """Fit a ``svm.LinearSVC`` on one half of the data and score it on the other half.

    Despite the ``_nn`` suffix, the model is a linear support vector
    classifier, not a neural network.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature table, one row per sample.
    labels : pandas.DataFrame
        Label table, positionally aligned with ``df``.
    class_column_name : str
        Column of ``labels`` to use as the classification target.

    Returns
    -------
    tuple of (svm.LinearSVC, numpy.ndarray)
        The fitted classifier and the per-class F1 scores on the held-out
        half, ordered by sorted class name.
    """
    # Target vector: only the requested label column, as a flat array.
    y = labels[class_column_name].to_numpy().ravel()

    # Stratified 50/50 split; only the first of the two folds is evaluated.
    skf = StratifiedKFold(n_splits=2, random_state=1337, shuffle=True)
    train_index, test_index = next(skf.split(df, y))

    X_train, X_test = df.take(train_index), df.take(test_index)
    y_train, y_test = y[train_index], y[test_index]

    # 1vRest training
    print("TRAIN:", train_index, "TEST:", test_index)
    print("Start fitting")
    lin_clf = svm.LinearSVC()
    lin_clf.fit(X_train, y_train)

    # predicting
    print("Start predicting")
    y_pred_lin_clf = lin_clf.predict(X_test)

    # metrics
    print(f"F1 macro-average: {metrics.f1_score(y_test, y_pred_lin_clf, average='macro')}")
    print(f"F1 weighted-average: {metrics.f1_score(y_test, y_pred_lin_clf, average='weighted')}")

    # Per-class F1 comes back in the order of `labels=`. Pin that order
    # explicitly (sorted union of true and predicted classes, which is also
    # scikit-learn's default) so the names printed next to the scores really
    # belong to them; order-of-appearance names would not line up.
    class_names = np.unique(np.concatenate([y_test, y_pred_lin_clf]))
    f1 = metrics.f1_score(y_test, y_pred_lin_clf, labels=class_names, average=None)
    print(f"accuracy: {metrics.accuracy_score(y_test, y_pred_lin_clf)}")

    print(pd.DataFrame({"dcluster": class_names, "f1_per_dcluster": f1}))

    return (lin_clf, f1)
    
    
#drop_columns = filter(lambda col: col != "Class" , labels.columns)
#list(drop_columns)

In [6]:
def train_linear_nn(df, labels, class_column_name): # todo: give linear classifier as argument
    """Fit a ``svm.LinearSVC`` on all rows of ``df`` for a single label column.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature table used in full as the training set.
    labels : pandas.DataFrame
        Label table; every column except ``class_column_name`` is discarded.
    class_column_name : str
        Name of the label column to train on.

    Returns
    -------
    svm.LinearSVC
        The fitted classifier.
    """
    # Strip every label column but the target, then flatten to a 1-D array.
    other_columns = [name for name in labels.columns if name != class_column_name]
    y_train = labels.drop(columns=other_columns).values.ravel()

    print("Start training")
    classifier = svm.LinearSVC()
    classifier.fit(df, y_train)
    return classifier
    

In [5]:
%%time
# As a sanity check, fit the linear classifier on the top-level "Class" labels
# and look at the per-class F1 scores on the held-out half of the data.
# train_test_linear_nn returns the (classifier, f1) pair unpacked here;
# train_linear_nn returns only the classifier and cannot be unpacked this way.
(clf, all_f1) = train_test_linear_nn(df, labels, "Class")
all_f1

TRAIN: [    0     2     3 ... 12774 12776 12780] TEST: [    1     4     5 ... 12777 12778 12779]
Start fitting
Start predicting
F1 macro-average: 1.0
F1 weighted-average: 1.0
accuracy: 1.0
        dcluster f1_per_dcluster
               0               0
0      GABAergic             1.0
1  Glutamatergic             1.0
2   Non-Neuronal             1.0
CPU times: user 19.5 s, sys: 22.8 s, total: 42.3 s
Wall time: 49.4 s


array([1., 1., 1.])

In [7]:
# make tree structure
class Node:
    """A node of the classifier tree: a classifier plus links up and down."""

    def __init__(self, parent, clf, children):
        # parent:   the node this one hangs under
        # clf:      the classifier stored at this node (may be None)
        # children: dict that links a result of clf to a new node (with its own clf)
        self.parent, self.clf, self.children = parent, clf, children

In [8]:
def filter_data_on_class_name(df, labels, class_name, class_column_name):
    """Keep only the rows whose ``labels[class_column_name]`` equals ``class_name``.

    The same boolean mask is applied to both tables, so ``df`` and ``labels``
    are expected to share their index.

    Returns
    -------
    tuple
        ``(filtered_df, filtered_labels)``.
    """
    row_mask = labels[class_column_name].eq(class_name)
    selected_df = df[row_mask]
    selected_labels = labels[row_mask]
    return (selected_df, selected_labels)

def train_hyr_nn(df, labels, node, parent_class=None, parent_class_column=None):
    """Recursively train one classifier per branching point of the label hierarchy.

    The columns of ``labels`` are treated as hierarchy levels in column
    order: the first column is the top level and every following column
    refines the one before it. The classifier trained here (via
    ``train_linear_nn``) is stored on ``node``; one child entry per class is
    stored in ``node.children``.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature table, sharing its index with ``labels``.
    labels : pandas.DataFrame
        Label table whose column order defines the hierarchy.
    node : Node
        Node to fill in. ``node.clf`` is set to the trained classifier, or to
        ``None`` when there is nothing to distinguish at this level.
    parent_class, parent_class_column : optional
        When both are given, only rows with
        ``labels[parent_class_column] == parent_class`` are used and the
        classifier is trained on the column after ``parent_class_column``.
        When either is ``None`` the whole dataset is used and the first label
        column is the target.

    Returns
    -------
    None
        Results are written to ``node`` in place.
    """
    if parent_class is None or parent_class_column is None:
        # Top-level call: classify on the coarsest (first) label column.
        child_class_column = labels.columns[0]
    else:
        # remove all entries that do not belong to the parent_class
        df, labels = filter_data_on_class_name(df, labels, parent_class, parent_class_column)

        child_class_column_index = list(labels.columns).index(parent_class_column) + 1
        if child_class_column_index >= len(labels.columns):
            # parent_class_column is the last level: this is a leaf of the
            # hierarchy, there are no further child classes to separate.
            return None
        child_class_column = labels.columns[child_class_column_index]

    unique_labels = pd.Series(labels[child_class_column]).unique()
    if len(unique_labels) <= 1:
        # Zero classes (empty subset, e.g. unknown or NaN parent class) or a
        # single class: nothing to train. An empty subset would otherwise
        # crash inside the classifier's fit().
        node.clf = None
        return None

    node.clf = train_linear_nn(df, labels, child_class_column)

    # recursive step
    for child_class in unique_labels: # todo: parallelize
        child_node = Node(node, None, dict())
        train_hyr_nn(df, labels, child_node, child_class, child_class_column)
        # Children that ended up without a classifier are recorded as None.
        node.children[child_class] = child_node if child_node.clf is not None else None

    return None
            
    
    
    
    
    
#filter_child_class(df, labels, "Vip", "Subclass")[1].shape
# root = Node(None, None, dict())
# train_hyr_nn(df, labels, root)
# root
#print("---")
#train_hyr_nn(df, labels, Node(None, None, dict()), "GABAergic", "Class")
#print("---")
#train_hyr_nn(df, labels, Node(None, None, dict()), "Lamp5", "Subclass")
#print("---")
#train_hyr_nn(df, labels, Node(None, None, dict()), "Lamp5 Lsp1", "cluster") # returns without result

In [9]:
%%time
# get part of data where all cluster types are represented

# Flat array with only the "cluster" label of every sample; it drives the
# stratification below.
drop_columns = (col for col in labels.columns if col != "cluster")
dclusters = labels.drop(columns=drop_columns).values.ravel()

# Train the hierarchy on a single stratified fifth of the data: the test part
# of the first of five folds.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=1337)
train_index, test_index = next(skf.split(df, dclusters))
root = Node(None, None, dict())
train_hyr_nn(df.take(test_index), labels.take(test_index), root)

root

Start training
Start training
Start training
Start training
Start training




Start training
Start training
Start training
Start training
Start training
Start training
Start training
Start training
Start training
Start training
Start training
CPU times: user 1min 19s, sys: 3.17 s, total: 1min 22s
Wall time: 1min 22s


<__main__.Node at 0x7f481c5a6dc0>

In [20]:
# Walk three levels down the trained tree and print each node's children dict.
# A value of None means no classifier was trained below that class.
pprint(root.children)
# Children of the "GABAergic" node.
pprint(root.children["GABAergic"].children)
# Children of the "Lamp5" node under "GABAergic".
pprint(root.children["GABAergic"].children["Lamp5"].children)

{'GABAergic': <__main__.Node object at 0x7f481c59e880>,
 'Glutamatergic': <__main__.Node object at 0x7f481c59e580>,
 'Non-Neuronal': None}
{'Lamp5': <__main__.Node object at 0x7f46afce28b0>,
 'Pvalb': <__main__.Node object at 0x7f46afce2910>,
 'Serpinf1': <__main__.Node object at 0x7f46afce22b0>,
 'Sncg': <__main__.Node object at 0x7f46afce2640>,
 'Sst': <__main__.Node object at 0x7f46afce2ee0>,
 'Vip': <__main__.Node object at 0x7f47b573dee0>}
{'Lamp5 Fam19a1 Pax6': None,
 'Lamp5 Fam19a1 Tmem182': None,
 'Lamp5 Krt73': None,
 'Lamp5 Lhx6': None,
 'Lamp5 Lsp1': None,
 'Lamp5 Ntn1 Npy2r': None,
 'Lamp5 Plch2 Dock5': None}
