In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import random
from sklearn import datasets, neighbors, metrics, tree, svm, preprocessing, model_selection, ensemble
from sklearn.base import clone as sklearn_clone
from sklearn.model_selection import StratifiedKFold
from pprint import pprint

In [2]:
%%time
df = pd.read_pickle("../data/Lauren/bam_df.pkl")
labels = pd.read_pickle("../data/Lauren/bam_labels.pkl")

CPU times: user 0 ns, sys: 2.12 s, total: 2.12 s
Wall time: 2.16 s


In [3]:
def calc_metrics(y_test, y_pred, f1_file_name=None):
    unique_labels_df = pd.DataFrame(pd.Series(y_test).unique())
    f1 = pd.DataFrame(metrics.f1_score(y_test, y_pred, average=None, labels = unique_labels_df[0]))
    acc = metrics.accuracy_score(y_test, y_pred)

    print(f"accuracy: {acc}")
    f1_micro = metrics.f1_score(y_test, y_pred, average='micro')
    f1_macro = metrics.f1_score(y_test, y_pred, average='macro')
    f1_weighted = metrics.f1_score(y_test, y_pred, average='weighted')
    print(f"F1 micro-average: {f1_micro}")
    print(f"F1 macro-average: {f1_macro}")
    print(f"F1 weighted-average: {f1_weighted}")
    print()
    
    # seperate f1 score for each label
    f1_labeled = pd.concat([unique_labels_df[0], f1[0]], axis=1, keys=['class', 'f1_per_class'])
    # print(f1_labeled)
    
    # save seperate f1 scores
    if f1_file_name is not None:
        f1_labeled.to_csv(f1_file_name, index=False)
    
    return pd.DataFrame([[acc, f1_micro, f1_macro, f1_weighted]], columns=["accuracy", "F1 micro-average", "F1 macro-average", "F1 weighted-average"])
    

In [4]:
def flatify(multi_y):
    try:
        if multi_y.shape[1] == 1:
            return multi_y
    except:
        return multi_y

    def most_specific_class(row):
        for i in range(len(row)):
            if row.iloc[i] is None or row.iloc[i] == "":
                break
        return row.iloc[i-1]
    return multi_y.apply(lambda row: most_specific_class(row), axis=1)


In [5]:
# train a flat multi label classifier
# df: X
# labels: multiple columns, one of which will be y
# class_column_name: the name of the column in labels that will be y
def train_flat(clf, df, labels, class_column_name): 
    # only keep the needed column
    drop_columns = filter(lambda col: col != class_column_name , labels.columns)
    labels = labels.drop(columns = drop_columns).values.ravel()
     
    X_train, y_train = df, labels
    # copy the model, important for hierarchical
    clf = sklearn_clone(clf)
    clf.fit(X_train, y_train)
    
    return clf

In [6]:
def floatify(y):
    y = pd.Series(y)
    unique_labels = pd.Series(y).unique()
    y_float = y.apply(lambda row: pd.Index(unique_labels).get_loc(row) + 1) # no zeroes as values
    return y_float
    

In [7]:
pprint(floatify(flatify(labels)))
flatify(labels)


0         1
1         2
2         2
3         1
4         2
         ..
12827    11
12828     4
12829     6
12830    11
12831     3
Length: 12781, dtype: int64


0          Vip
1        Lamp5
2        Lamp5
3          Vip
4        Lamp5
         ...  
12827    L5 PT
12828     Sncg
12829    Pvalb
12830    L5 PT
12831      Sst
Length: 12781, dtype: object

In [8]:
def add_prev_layer_info(X_old, prev_y, i):
    prev_y_float = floatify(prev_y)
    prev_y_float.index = level_X_train.index
    X_old[f"Level_{i-1}_prediction"] = prev_y_float
    

In [10]:
depth = len(labels.columns)
# train a clf for each level in the tree
folds = 5
folds_random_state = 1337
y = labels
X = df
clf = svm.LinearSVC(max_iter=2000)

skf = StratifiedKFold(n_splits=folds, random_state=folds_random_state, shuffle=folds_random_state is not None)

k = 0
accs = pd.DataFrame()
y_flat = flatify(y)
for train_index, test_index in skf.split(X, y_flat):

    # get train and test set
    X_train, X_test = X.take(train_index), X.take(test_index)
    y_train, y_test = y.take(train_index), y.take(test_index)

    # divide train set up in 'depth' parts
    skf_train = StratifiedKFold(n_splits=depth, random_state=folds_random_state, shuffle=folds_random_state is not None)
    # get the indeces of the training data for the clf of each layer 
    level_indices = []
    for level_train_index, level_test_index in skf_train.split(X_train, flatify(y_train)):
        level_indices.append(level_train_index)
    
    level_clfs = []
    y_pred_of_prev_clf = None
    for i in range(depth):
        print(f"Start training of level {i}")
        # train the clf of the ith layer
        indices = level_indices[i]
        level_X_train, level_y_train = X_train.take(indices), y_train.take(indices)
        
        for j in range(i):
            clf_j = level_clfs[j]
            y_j = pd.Series(clf_j.predict(level_X_train))
            y_j.index = level_X_train.index
            level_X_train[f"Level_{j}_prediction"] = floatify(y_j)
        
        level_clf = train_flat(clf, level_X_train, level_y_train, level_y_train.columns[i])
        level_clfs.append(level_clf)
        
        
            
        # if not the first round, add column(s) to level_X_train that have the result of the previous layer
        # if i != 0:
        #     X_next = X_train.take(level_indices[i+1])
        #     y_j = None
        #     for j in range(i):
        #         y_j = level_clfs[j].predict(X_next)
        #         add_prev_layer_info(X_next, y_j, j)
        #         add_prev_layer_info(level_X_train, y_pred_of_prev_clf, i)
            # pprint(level_X_train.shape)
            # y_pred_of_prev_clf_float = floatify(y_pred_of_prev_clf)
            # y_pred_of_prev_clf_float.index = level_X_train.index
            # level_X_train[f"Level_{i-1}_prediction"] = y_pred_of_prev_clf_float
            # pprint(level_X_train.shape)
        
        # level_clf = train_flat(clf, level_X_train, level_y_train, level_y_train.columns[i])
        # level_clfs.append(level_clf)
        
        # TODO: the X values given here dont have the prediction of the previous layer
        # predict
#         X_next = X_train.take(level_indices[i+1])
#         y_j
#         for j in range(i):
#             y_j = level_clfs[j].predict(X_next)
#             add_prev_layer_info(X_next, y_j, j)
        
#         y_pred_of_prev_clf = level_clf.predict(X_next)
    
    # metrics
    break
    # accs = accs.append(calc_metrics(y_test[on_label], y_pred_hyr[0], f"../results/hyr_clf_f1-{k}.csv"))
    # k += 1

accs.mean()


Start training of level 0
Start training of level 1
Start training of level 2


Series([], dtype: float64)

In [12]:
y_j = None
clf_j = None
for j in range(3):
    clf_j = level_clfs[j]
    y_j = pd.Series(clf_j.predict(X_test))
    y_j.index = X_test.index
    X_test[f"Level_{j}_prediction"] = floatify(y_j)

y_pred = y_j
        

ValueError: The truth value of a DataFrame is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

In [18]:
pprint(flatify(y_test))
y_pred

2        Lamp5
4        Lamp5
6         Sncg
7        Lamp5
8          Vip
         ...  
12811    L6 IT
12813      Sst
12822     Sncg
12826      Sst
12829    Pvalb
Length: 2557, dtype: object


2                        Lamp5 Lsp1
4                        Lamp5 Lsp1
6                    Sncg Vip Itih5
7                       Lamp5 Krt73
8                  Vip Igfbp6 Car10
                    ...            
12811    L6 IT VISp Col23a1 Adamts2
12813              Sst Crhr2 Efemp1
12822                    Sncg Gpr50
12826               Sst Hpse Sema3c
12829             Pvalb Gpr149 Islr
Length: 2557, dtype: object

In [34]:
pprint(level_X_train.shape)
floatify(y_pred_of_prev_clf)
floatified = floatify(y_pred_of_prev_clf)
floatified.index = level_X_train.index
level_X_train["TEST"] = floatified
level_X_train

(6816, 42627)


Unnamed: 0,0610005C13Rik,0610006L08Rik,0610007P14Rik,0610009B22Rik,0610009E02Rik,0610009L18Rik,0610009O20Rik,0610010B08Rik,0610010F05Rik,0610010K14Rik,...,Zyg11a,Zyg11b,Zyx,Zzef1,Zzz3,a,l7Rn6,n-R5s136,Level_0_prediction,TEST
0,0,0,79,145,1,46,123,0,0,33,...,0,78,0,36,1,0,190,0,1.0,1
1,0,0,123,178,26,23,2,0,337,318,...,0,69,0,78,29,0,179,0,1.0,1
5,0,0,266,111,0,0,0,0,0,25,...,4,88,202,2,140,0,239,0,1.0,1
9,0,0,121,114,88,0,81,0,332,35,...,0,168,0,0,113,0,261,0,1.0,1
10,0,0,181,62,0,0,0,0,13,50,...,0,180,90,88,1,0,59,0,1.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12824,0,0,85,47,0,0,5,0,136,43,...,0,79,104,0,39,0,96,0,,2
12827,0,0,45,82,0,10,0,0,23,64,...,0,115,0,25,0,0,143,0,,2
12828,0,0,88,79,0,0,1,0,95,52,...,0,92,0,1,26,0,94,0,,1
12830,0,0,187,52,0,8,4,0,8,67,...,0,54,67,0,70,0,133,0,,2


In [None]:
pred_y = level_clfs[0].predict(X_test)
calc_metrics(y_test[y_test.columns[0]], pred_y)

pred_y = level_clfs[1].predict(X_test)
calc_metrics(y_test[y_test.columns[1]], pred_y)

pred_y = level_clfs[2].predict(X_test)
calc_metrics(y_test[y_test.columns[2]], pred_y)

accuracy: 1.0
F1 micro-average: 1.0
F1 macro-average: 1.0
F1 weighted-average: 1.0

accuracy: 0.9945248337895972
F1 micro-average: 0.9945248337895972
F1 macro-average: 0.9629827412174613
F1 weighted-average: 0.9942474398624058

accuracy: 0.9018380915134924
F1 micro-average: 0.9018380915134924
F1 macro-average: 0.8341886967204816
F1 weighted-average: 0.8996741027632947



Unnamed: 0,accuracy,F1 micro-average,F1 macro-average,F1 weighted-average
0,0.901838,0.901838,0.834189,0.899674
