In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import random
from sklearn import datasets, neighbors, metrics, tree, svm, preprocessing, model_selection, ensemble
from sklearn.model_selection import StratifiedKFold


In [None]:
#%%time
#
## load labels
#labels = pd.read_csv('../data/Lauren/Labels.csv')
##labels.head() # to display the first 5 lines of loaded data
#
## load data
#df = pd.read_csv('../data/Lauren/500_PBMC_3p_LT_Chromium_X_metrics_summary.csv') # takes about 5min


CPU times: user 4min 41s, sys: 6.47 s, total: 4min 47s
Wall time: 4min 50s


In [None]:
#%%time
## filter out certain data
def gereral_data_filter(df, labels, filter_on, amount_higher_than):
    """Drop the samples whose class has too few members.

    (The misspelled function name is kept so existing callers keep working.)

    Parameters
    ----------
    df : pandas.DataFrame
        Feature table. Rows are selected with a boolean mask built from
        ``labels``, so ``df`` must share its index with ``labels``.
    labels : pandas.DataFrame
        Label table containing the column ``filter_on``.
    filter_on : str
        Name of the label column whose class sizes are checked.
    amount_higher_than : int
        Minimum number of rows a class needs in order to be kept. The
        comparison is inclusive (``>=``).

    Returns
    -------
    tuple
        ``(filtered_df, filtered_labels)``. ``filtered_df`` additionally has
        the ``"Unnamed: 0"`` names column removed when it is present.
    """
    # size() counts the rows of each class directly. Counting the non-null
    # values of some other column instead breaks when `labels` only has the
    # grouping column and under-counts when that other column contains NaN.
    class_sizes = labels.groupby(filter_on).size()
    classes_to_keep = list(class_sizes[class_sizes >= amount_higher_than].index)
    keep_indices = labels[filter_on].isin(classes_to_keep)

    ## delete entries part of class that's too small, remove names column
    # errors="ignore": the names column only exists when the CSV was read
    # without an index column, so its absence is not an error.
    filtered_df = df[keep_indices].drop(columns=["Unnamed: 0"], errors="ignore")
    return (filtered_df, labels[keep_indices])

#df, labels = gereral_data_filter(df, labels, "cluster", 10)

CPU times: user 1.68 s, sys: 18 s, total: 19.7 s
Wall time: 20.6 s


In [3]:
%%time
# Load the feature table (df) and the label table (labels) from pickle files.
# Presumably these are cached results of the commented-out CSV load and
# filtering cells above -- TODO confirm how the .pkl files were produced.
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
df = pd.read_pickle("../data/Lauren/df.pkl")
labels = pd.read_pickle("../data/Lauren/labels.pkl")

CPU times: user 0 ns, sys: 1.64 s, total: 1.64 s
Wall time: 1.64 s


In [4]:
def train_linear_nn(df, labels, class_column_name, classifier=None):
    """Fit a linear classifier on one half of the data and score it on the other.

    Despite the ``_nn`` suffix no neural network is involved: unless another
    estimator is supplied, a ``svm.LinearSVC`` is trained.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature table, one row per sample. Rows are picked by position, so
        they must be in the same order as the rows of ``labels``.
    labels : pandas.DataFrame
        Label table; only the column ``class_column_name`` is used.
    class_column_name : str
        Name of the label column holding the target classes.
    classifier : estimator, optional
        Unfitted estimator exposing ``fit`` and ``predict``. Defaults to a
        fresh, seeded ``svm.LinearSVC``.

    Returns
    -------
    tuple
        ``(fitted_classifier, f1)`` where ``f1`` is the array of per-class F1
        scores, ordered like the sorted class labels.
    """
    seed = 1337  # shared seed so the split and the default classifier are reproducible

    # only keep the needed column, as a flat array that can be indexed by position
    y = labels[class_column_name].to_numpy()

    # use this to split dataset in 2 parts, test and train
    skf = StratifiedKFold(n_splits=2, random_state=seed, shuffle=True)

    # just do 1 iteration: the first fold gives a single stratified 50/50 split
    train_index, test_index = next(skf.split(df, y))
    X_train, X_test = df.take(train_index), df.take(test_index)
    y_train, y_test = y[train_index], y[test_index]

    # 1vRest training
    print("TRAIN:", train_index, "TEST:", test_index)
    print("Start fitting")
    lin_clf = svm.LinearSVC(random_state=seed) if classifier is None else classifier
    lin_clf.fit(X_train, y_train)

    # predicting
    print("Start predicting")
    y_pred_lin_clf = lin_clf.predict(X_test)

    # metrics
    # f1_score(average=None) reports scores in sorted-label order. Build that
    # exact label list and pass it explicitly so every score is printed next to
    # the class it belongs to (order of first appearance in y_test would not match).
    class_names = np.unique(np.concatenate([y_test, y_pred_lin_clf]))

    print(f"F1 macro-average: {metrics.f1_score(y_test, y_pred_lin_clf, average='macro')}")
    print(f"F1 weighted-average: {metrics.f1_score(y_test, y_pred_lin_clf, average='weighted')}")
    f1 = metrics.f1_score(y_test, y_pred_lin_clf, labels=class_names, average=None)
    print(f"accuracy: {metrics.accuracy_score(y_test, y_pred_lin_clf)}")

    print(pd.DataFrame({"dcluster": class_names, "f1_per_dcluster": f1}))

    return (lin_clf, f1)
    
    
#drop_columns = filter(lambda col: col != "Class" , labels.columns)
#list(drop_columns)


In [5]:
%%time
# as a test, train the first classifier to divide the data into Classes
# (train_linear_nn fits a svm.LinearSVC, not a neural network)
# NOTE(review): the recorded run scored 1.0 on every metric -- worth confirming
# that no label information leaks into the feature table.
(clf, all_f1) = train_linear_nn(df, labels, "Class")
all_f1

TRAIN: [    0     2     3 ... 12774 12776 12780] TEST: [    1     4     5 ... 12777 12778 12779]
Start fitting
Start predicting
F1 macro-average: 1.0
F1 weighted-average: 1.0
accuracy: 1.0
        dcluster f1_per_dcluster
               0               0
0      GABAergic             1.0
1  Glutamatergic             1.0
2   Non-Neuronal             1.0
CPU times: user 19.5 s, sys: 22.8 s, total: 42.3 s
Wall time: 49.4 s


array([1., 1., 1.])

In [12]:
def filter_data_on_class_name(df, labels, class_name, class_column_name):
    """Keep only the samples labelled ``class_name`` in ``class_column_name``.

    A boolean row mask is built from ``labels`` and applied to both tables.
    Returns the tuple ``(filtered_df, filtered_labels)``.
    """
    row_mask = labels[class_column_name] == class_name
    selected_df = df[row_mask]
    selected_labels = labels[row_mask]
    return (selected_df, selected_labels)

def train_hyr_nn(df, labels, parent_class, parent_class_column):
    """Train a classifier that splits one parent class into its child classes.

    The samples belonging to ``parent_class`` are selected, and a classifier
    is trained on them to predict the label column that directly follows
    ``parent_class_column`` in ``labels``.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature table, row-aligned with ``labels``.
    labels : pandas.DataFrame
        Label table whose columns are ordered from parent level to child level.
    parent_class : object
        Value in ``parent_class_column`` selecting the samples to train on.
    parent_class_column : str
        Name of the label column that contains ``parent_class``.

    Returns
    -------
    Whatever ``train_linear_nn`` returns for the child level.

    Raises
    ------
    ValueError
        If ``parent_class_column`` is the last label column, i.e. there is no
        child level to train on.
    """
    # remove all entries that do not belong to the parent_class
    # (the helper takes exactly four arguments; parent_class is passed once)
    df, labels = filter_data_on_class_name(df, labels, parent_class, parent_class_column)

    # get the child_class_column: the label column right after the parent one
    column_names = list(labels.columns)
    child_position = column_names.index(parent_class_column) + 1
    if child_position >= len(column_names):
        raise ValueError(
            f"'{parent_class_column}' is the last label column; there is no child level"
        )
    child_class_column = column_names[child_position]

    # classifier that further classifies entries; hand it back to the caller
    return train_linear_nn(df, labels, child_class_column)
    
    
#filter_child_class(df, labels, "Vip", "Subclass")[1].shape

(1554, 3)

In [7]:
#def train_hyr_nn(df, labels):
# Number of label columns -- presumably the depth of the label hierarchy
# (TODO confirm).
depth = len(labels.columns)
for column in labels.columns:
    # TODO: per-level training is not implemented yet. The placeholder keeps
    # the cell syntactically valid so the notebook still runs top to bottom.
    pass
    


3