In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import random
from sklearn import datasets, neighbors, metrics, tree, svm, preprocessing, model_selection, ensemble
from sklearn.base import clone as sklearn_clone
from sklearn.model_selection import StratifiedKFold
from pprint import pprint

In [2]:
%%time
df = pd.read_pickle("../data/Lauren/bam_df.pkl")
labels = pd.read_pickle("../data/Lauren/bam_labels.pkl")

CPU times: user 17.2 ms, sys: 2.43 s, total: 2.45 s
Wall time: 2.57 s


In [3]:
def calc_metrics(y_test, y_pred, f1_file_name=None):
    unique_labels_df = pd.DataFrame(pd.Series(y_test).unique())
    f1 = pd.DataFrame(metrics.f1_score(y_test, y_pred, average=None, labels = unique_labels_df[0]))
    acc = metrics.accuracy_score(y_test, y_pred)

    print(f"accuracy: {acc}")
    f1_micro = metrics.f1_score(y_test, y_pred, average='micro')
    f1_macro = metrics.f1_score(y_test, y_pred, average='macro')
    f1_weighted = metrics.f1_score(y_test, y_pred, average='weighted')
    print(f"F1 micro-average: {f1_micro}")
    print(f"F1 macro-average: {f1_macro}")
    print(f"F1 weighted-average: {f1_weighted}")
    print()
    
    # seperate f1 score for each label
    f1_labeled = pd.concat([unique_labels_df[0], f1[0]], axis=1, keys=['class', 'f1_per_class'])
    # print(f1_labeled)
    
    # save seperate f1 scores
    if f1_file_name is not None:
        f1_labeled.to_csv(f1_file_name, index=False)
    
    return pd.DataFrame([[acc, f1_micro, f1_macro, f1_weighted]], columns=["accuracy", "F1 micro-average", "F1 macro-average", "F1 weighted-average"])
    

In [4]:
def flatify(multi_y):
    try:
        if multi_y.shape[1] == 1:
            return multi_y
    except:
        return multi_y

    def most_specific_class(row):
        for i in range(len(row)):
            if row.iloc[i] is None or row.iloc[i] == "":
                break
        return row.iloc[i-1]
    return multi_y.apply(lambda row: most_specific_class(row), axis=1)


In [5]:
# train a flat multi label classifier
# df: X
# labels: multiple columns, one of which will be y
# class_column_name: the name of the column in labels that will be y
def train_flat(clf, df, labels, class_column_name): 
    # only keep the needed column
    drop_columns = filter(lambda col: col != class_column_name , labels.columns)
    labels = labels.drop(columns = drop_columns).values.ravel()
     
    X_train, y_train = df, labels
    # copy the model, important for hierarchical
    clf = sklearn_clone(clf)
    clf.fit(X_train, y_train)
    
    return clf

In [None]:
depth = len(labels.columns)
# train a clf for each depth in the tree
folds = 5
folds_random_state = 1337
y = labels
X = df
clf = svm.LinearSVC(max_iter=2000)

skf = StratifiedKFold(n_splits=folds, random_state=folds_random_state, shuffle=folds_random_state is not None)

k = 0
accs = pd.DataFrame()
y_flat = flatify(y)
for train_index, test_index in skf.split(X, y_flat):

    # get train and test set
    X_train, X_test = X.take(train_index), X.take(test_index)
    y_train, y_test = y.take(train_index), y.take(test_index)

    # divide train set up in 'depth' parts
    skf_train = StratifiedKFold(n_splits=depth, random_state=folds_random_state, shuffle=folds_random_state is not None)
    # get the indeces of the training data for the clf of each layer 
    level_indices = []
    for level_train_index, level_test_index in skf_train.split(X_train, flatify(y_train)):
        level_indices.append(level_train_index)
    
    level_clfs = []
    for i in range(depth):
        print(f"Start training of level {i}")
        # train the clf of the ith layer
        indices = level_indices[i]
        level_X_train, level_y_train = X_train.take(indices), y_train.take(indices)
        
        level_clfs.append(train_flat(clf, level_X_train, level_y_train, level_y_train.columns[i]))
        

    
    # metrics
    break
    # accs = accs.append(calc_metrics(y_test[on_label], y_pred_hyr[0], f"../results/hyr_clf_f1-{k}.csv"))
    # k += 1

accs.mean()


Start training of level 0
Start training of level 1
Start training of level 2


NameError: name 'on_label' is not defined

In [None]:
level_clfs


[LinearSVC(max_iter=2000), LinearSVC(max_iter=2000), LinearSVC(max_iter=2000)]

In [15]:
pred_y = level_clfs[0].predict(X_test)
calc_metrics(y_test[y_test.columns[0]], pred_y)

pred_y = level_clfs[1].predict(X_test)
calc_metrics(y_test[y_test.columns[1]], pred_y)

pred_y = level_clfs[2].predict(X_test)
calc_metrics(y_test[y_test.columns[2]], pred_y)

accuracy: 1.0
F1 micro-average: 1.0
F1 macro-average: 1.0
F1 weighted-average: 1.0

accuracy: 0.9945248337895972
F1 micro-average: 0.9945248337895972
F1 macro-average: 0.9629827412174613
F1 weighted-average: 0.9942474398624058

accuracy: 0.9018380915134924
F1 micro-average: 0.9018380915134924
F1 macro-average: 0.8341886967204816
F1 weighted-average: 0.8996741027632947



Unnamed: 0,accuracy,F1 micro-average,F1 macro-average,F1 weighted-average
0,0.901838,0.901838,0.834189,0.899674
