In [66]:
import pandas as pd

import os
from glob import glob
import pickle
from sklearn.metrics import f1_score, balanced_accuracy_score, precision_score, recall_score, confusion_matrix

In [67]:
def get_results(path, dataset, seed):

    dataset_path = f'../data/{dataset}.csv' if dataset=='expert' else f'../data/{dataset}_{seed}.csv'
    df = pd.read_csv(dataset_path)
    
    with open(path, 'rb') as f:
        res_vals = pickle.load(f)
        
    df.loc[res_vals['idx'], 'labels'] = res_vals['labels']
    df.loc[res_vals['idx'], 'predictions'] = res_vals['predictions']
    df.loc[res_vals['idx'], 'neg_prob'] = res_vals['probabilities'][:,0]
    df.loc[res_vals['idx'], 'pos_prob'] = res_vals['probabilities'][:,1]
    
    if dataset=='expert':
        df['labels'] = (df['labels'].astype(bool) & ~df['Konsensus Target 1'].isna()).astype(float)

    return df

In [68]:
def calculate_metrics(predictions, labels):
    
    res = []

    res.append(balanced_accuracy_score(predictions, labels))
    res.append(f1_score(predictions, labels, average='binary'))
    res.append(precision_score(predictions, labels, average='binary'))
    res.append(recall_score(predictions, labels, average='binary'))
    res.append(f1_score(1 - predictions, 1 - labels, average='binary'))

    return res

In [69]:
directory = '../data/outputs'

In [70]:
df_results = pd.DataFrame(columns=['model_name', 'optimizer', 'class_weights', 'batch_size', 'seed', 'dataset', 'balanced_accuracy', 'f1', 'precision', 'recall', 'f1_neg'])
for subdir, dirs, files in os.walk(directory):
    for file in files:
        try:
            info = subdir.split('/')
            model_name = info[3]
            optimizer = info[4]
            class_weights = info[5][-1] == '1'
            batch_size = int(info[6][-2:])
            seed = int(info[7][-2:])
            dataset = file[:-12]
            df = get_results(os.path.join(subdir, file), dataset, seed)
            params = [model_name, optimizer, class_weights, batch_size, seed, dataset]
            metrics = calculate_metrics(df['predictions'], df['labels'])
            df_results.loc[len(df_results)] = params + metrics
        except:
            pass



In [71]:
df_results.sort_values(by='f1', ascending=False)

Unnamed: 0,model_name,optimizer,class_weights,batch_size,seed,dataset,balanced_accuracy,f1,precision,recall,f1_neg
3,e5,adamw,True,16,42,expert,0.760465,0.677419,0.709459,0.648148,0.855072
1,e5,adamw,True,16,43,expert,0.760000,0.664430,0.668919,0.660000,0.857550
51,e5_lora_nli_all,adamw,True,16,43,expert,0.752870,0.655518,0.662162,0.649007,0.853067
53,e5_lora_nli_all,adamw,True,16,44,expert,0.750476,0.651007,0.655405,0.646667,0.851852
5,e5,adamw,True,16,44,expert,0.750484,0.646259,0.641892,0.650685,0.852691
...,...,...,...,...,...,...,...,...,...,...,...
64,e5no_train,adamw,True,16,44,test,0.178134,0.302400,1.000000,0.178134,0.000000
63,e5no_train,adamw,True,16,43,expert,0.517904,0.273438,0.236486,0.324074,0.750000
61,e5no_train,adamw,True,16,42,expert,0.552068,0.207921,0.141892,0.388889,0.799499
60,e5no_train,adamw,True,16,42,test,0.527567,0.197960,0.176360,0.225590,0.848709


In [72]:
df = df_results.groupby(['model_name', 'optimizer', 'class_weights', 'batch_size', 'dataset']).agg(['mean', 'std']).drop('seed', axis=1).sort_values(by=('f1', 'mean'), ascending=False)
df.to_csv('../data/experiment_results.csv')
df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,balanced_accuracy,balanced_accuracy,f1,f1,precision,precision,recall,recall,f1_neg,f1_neg
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,mean,std,mean,std,mean,std,mean,std,mean,std
model_name,optimizer,class_weights,batch_size,dataset,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2
e5,adamw,True,16,expert,0.756983,0.005633,0.662702,0.015652,0.673423,0.034008,0.652944,0.006241,0.855105,0.002429
e5_lora_nli_all,adamw,True,16,expert,0.749664,0.00368,0.64866,0.008284,0.650901,0.014065,0.646503,0.002589,0.851592,0.00162
e5_lora_nli,adamw,True,16,expert,0.749213,0.007365,0.643199,0.002336,0.637387,0.015604,0.649695,0.017795,0.851845,0.005916
e5_nli_all,adamw,True,16,test,0.732539,0.000528,0.629306,0.000954,0.823416,0.003071,0.509258,0.001068,0.88734,0.000469
e5_nli,adamw,True,16,expert,0.733405,0.013359,0.628909,0.016868,0.637387,0.020642,0.620996,0.021944,0.840921,0.008852
e5_nli,adamw,True,16,test,0.732655,0.003049,0.628682,0.005161,0.817133,0.005402,0.510866,0.004707,0.888123,0.001786
e5_nli_all,adamw,True,16,expert,0.728035,0.004125,0.623191,0.003573,0.635135,0.013514,0.611957,0.010366,0.83721,0.004208
e5,adamw,True,16,test,0.726756,0.003395,0.622947,0.005132,0.845898,0.004862,0.493044,0.007,0.879669,0.003309
roberta,adamw,True,16,expert,0.734335,0.00779,0.622677,0.012752,0.617117,0.017004,0.628421,0.010305,0.843399,0.004369
roberta,adamw,True,16,test,0.723662,0.002738,0.618079,0.004483,0.844642,0.003333,0.48736,0.004686,0.877106,0.002119


In [74]:
df_results.groupby(['model_name', 'optimizer', 'class_weights', 'batch_size', 'dataset']).agg(['count'])

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,seed,balanced_accuracy,f1,precision,recall,f1_neg
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,count,count,count,count,count,count
model_name,optimizer,class_weights,batch_size,dataset,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
bert,adamw,True,16,expert,3,3,3,3,3,3
bert,adamw,True,16,test,3,3,3,3,3,3
bert_lora,adamw,True,16,expert,3,3,3,3,3,3
bert_lora,adamw,True,16,test,3,3,3,3,3,3
e5,adamw,True,16,expert,3,3,3,3,3,3
e5,adamw,True,16,test,3,3,3,3,3,3
e5_lora,adamw,True,16,expert,3,3,3,3,3,3
e5_lora,adamw,True,16,test,3,3,3,3,3,3
e5_lora_nli,adamw,True,16,expert,3,3,3,3,3,3
e5_lora_nli,adamw,True,16,test,3,3,3,3,3,3
