In [1]:
import pandas as pd
from copy import deepcopy
import re
from ast import literal_eval
import numpy as np
from collections import defaultdict

In [2]:
df = pd.read_json('overview_of_results.jsonl',lines=True)
df = df.drop_duplicates(['name_or_path','predict/_accuracy_normalized','seed','finetuning_task'])
df_copy = deepcopy(df)

In [3]:
df[(df.finetuning_task=='all_languages_covid19_emergency_event') & (df.name_or_path=='microsoft/mdeberta-v3-base') & (df.seed==4)]['en_predict/_macro-f1']

523    0.294781
Name: en_predict/_macro-f1, dtype: float64

In [4]:
df_final = list()
for r in df.to_dict(orient='records'):
    r_copy = deepcopy(r)
    for k in r.keys():
        if k.endswith('_score'):
            r_copy[re.sub('_score','',k)]=r[k]
    df_final.append(r_copy)

In [5]:
df_final = pd.DataFrame(df_final)

In [6]:
df_final[['eval_weighted-recall_score','eval_weighted-recall']].head()

Unnamed: 0,eval_weighted-recall_score,eval_weighted-recall
0,0.725265,0.725265
1,0.725265,0.725265
2,0.730943,0.730943
3,0.730943,0.730943
4,0.703396,0.703396


In [7]:
df_final = df_final[[c for c in df_final.columns.tolist() if c.endswith('_score')==False]]
df_final = df_final[df_final.finetuning_task.isnull()==False]


In [8]:
#Add language column
df_final['language']=df_final.finetuning_task.apply(lambda x: x.split('_')[0])

In [9]:
all_languages = set()
for c in df_final.columns:
    if bool(re.search(r'\w+_predict/','{}'.format(c))):
        language = c.split('_')[0]
        all_languages.add(language)
print(all_languages)

{'ro', 'de', 'lv', 'da', 'mt', 'pt', 'et', 'cs', 'sk', 'nl', 'bg', 'lt', 'fr', 'fi', 'nb', 'it', 'es', 'el', 'hu', 'sv', 'en', 'ga', 'pl'}


In [10]:
# rename some columns
df_final = df_final.rename(columns={'name_or_path': 'pretraining_results','epoch':'number_of_epochs'})

def rename_name_of_task(finetuning_task:str):
    if 'all_languages_' in finetuning_task:
        return re.sub('all_languages_','',finetuning_task)
    else:
        return '_'.join(finetuning_task.split('_')[1:])
    
df_final['finetuning_task']=df_final.finetuning_task.apply(lambda x: rename_name_of_task(x))
    

In [11]:
#Keep only predict columns

def is_important_column(column_name:str):
    if column_name in ['finetuning_task','language','seed','pretraining_results','number_of_epochs']:
        return True
    elif bool(re.search(r'predict.*(f1|precision|recall|accuracy)',column_name)):
        return True
    else:
        return False
    
df_final = df_final[[c for c in df_final.columns if is_important_column(c)==True]]

In [12]:
mapping_columns = ['language', 'finetuning_task', 'task_category', 'literature', 'code', 'processing steps', 'pretraining_type', 'pretraining_subtype', 'pretraining_results', 'pretraining_procedure', 'language_for_pretraining', 'pretraining_on_legal_domain_data', 'NLP_technique_global', 'NLP_technique_specific', 'input', 'labels_considered', 'number_of_labels', 'hyperparameter_tuning', 'hyperparameter_tuning_method', 'hyperparamters', 'metric_for_best_model', 'number_of_runs','number_of_epochs','seed','dataset_for_testing', 'n-fold cross-validation', 'final_score_calculation', 'evaluation_guidelines/method', 'code_to_obtain_scores', 'NER: partial_overlapping_allowed', 'macro-precision', 'macro-recall', 'macro-f1', 'micro-precision', 'micro-recall', 'micro-f1', 'precision', 'recall', 'f1', 'accuracy_normalized']

In [13]:
our_results = list()
for ft in df.finetuning_task.unique():
    multi_lingual_df = df_final #[df_final.language=='all']
    multi_lingual_df.dropna(how='all', axis=1, inplace=True)
    multi_lingual_df = multi_lingual_df.rename(columns=lambda x: re.sub('predict\/_','',x))
    for r in multi_lingual_df.to_dict(orient='records'):
        entry = r
        for col in mapping_columns:
            if col not in entry.keys():
                entry[col]=''
        
        our_results.append(r)
our_results_df = pd.DataFrame(our_results)


#Removing all the results that are specific for one language
for l in all_languages:
    our_results_df = our_results_df[[col for col in our_results_df.columns if col.startswith(l+'_')==False]]
our_results_df = our_results_df.drop_duplicates(['finetuning_task','pretraining_results','seed','macro-precision', 'macro-recall', 'macro-f1'])
remaining_columns = [col for col in our_results_df.columns.tolist() if col not in mapping_columns]
remaining_columns = [col for col in remaining_columns if bool(re.search('(accuracy|precision|recall|f1|seed|epoch)',col))]
#Remove all results from validation and train
remaining_columns = [col for col in remaining_columns if bool(re.search('(eval_|train_)',col))==False]

our_results_df = our_results_df[mapping_columns+remaining_columns]
our_results_df['dataset_for_testing']='test dataset'


In [14]:
sorted(our_results_df.columns.tolist())

['ADDRESS_f1',
 'ADDRESS_precision',
 'ADDRESS_recall',
 'AMOUNT_f1',
 'AMOUNT_precision',
 'AMOUNT_recall',
 'B-JURISPRUDENCIA_f1',
 'B-JURISPRUDENCIA_precision',
 'B-JURISPRUDENCIA_recall',
 'B-LEGISLACAO_f1',
 'B-LEGISLACAO_precision',
 'B-LEGISLACAO_recall',
 'B-LOCAL_f1',
 'B-LOCAL_precision',
 'B-LOCAL_recall',
 'B-ORGANIZACAO_f1',
 'B-ORGANIZACAO_precision',
 'B-ORGANIZACAO_recall',
 'B-PESSOA_f1',
 'B-PESSOA_precision',
 'B-PESSOA_recall',
 'B-TEMPO_f1',
 'B-TEMPO_precision',
 'B-TEMPO_recall',
 'DATE_f1',
 'DATE_precision',
 'DATE_recall',
 'FACILITY_f1',
 'FACILITY_precision',
 'FACILITY_recall',
 'GPE_f1',
 'GPE_precision',
 'GPE_recall',
 'I-JURISPRUDENCIA_f1',
 'I-JURISPRUDENCIA_precision',
 'I-JURISPRUDENCIA_recall',
 'I-LEGISLACAO_f1',
 'I-LEGISLACAO_precision',
 'I-LEGISLACAO_recall',
 'I-LOCAL_f1',
 'I-LOCAL_precision',
 'I-LOCAL_recall',
 'I-ORGANIZACAO_f1',
 'I-ORGANIZACAO_precision',
 'I-ORGANIZACAO_recall',
 'I-PESSOA_f1',
 'I-PESSOA_precision',
 'I-PESSOA_recall',

In [15]:
results_with_mean_values = list()
for ft in our_results_df.finetuning_task.unique():
    for pr in our_results_df.pretraining_results.unique():
        df_sub = our_results_df[(our_results_df.finetuning_task==ft) & (our_results_df.pretraining_results==pr)]
        df_sub.fillna('',inplace=True)
        df_sub_as_dict_first_row = deepcopy(df_sub.to_dict(orient='records')[0])
        relevant_scores = [col for col in df_sub.columns if bool(re.search('(accuracy|precision|recall|f1)',col))]
        for rs in relevant_scores:
            if len([x for x in df_sub[rs].tolist() if type(x)==float])==4:
                mean_value = df_sub[rs].mean()
                df_sub_as_dict_first_row[rs]=mean_value
                df_sub_as_dict_first_row['seed']='mean'
                df_sub_as_dict_first_row['number_of_epochs']=''
                if ft=='german_argument_mining' and pr=='distilbert-base-multilingual-cased':
                    print(rs,mean_value,df_sub[rs].tolist())
                    print(df_sub_as_dict_first_row[rs])

        results_with_mean_values.append(df_sub_as_dict_first_row)
        
results_with_mean_values_df = pd.DataFrame(results_with_mean_values)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().fillna(


macro-precision 0.7484524340750001 [0.7421947678, 0.7408932278, 0.759109231, 0.7516125097]
0.7484524340750001
macro-recall 0.746087793925 [0.7485784847, 0.7468784962, 0.7462221446, 0.7426720502]
0.746087793925
macro-f1 0.746601546175 [0.745332038, 0.7420129283, 0.752082074, 0.7469791444]
0.746601546175
micro-precision 0.82179987005 [0.8200129955000001, 0.8151397011, 0.8284600390000001, 0.8235867446]
0.82179987005
micro-recall 0.82179987005 [0.8200129955000001, 0.8151397011, 0.8284600390000001, 0.8235867446]
0.82179987005
micro-f1 0.82179987005 [0.8200129955000001, 0.8151397011, 0.8284600390000001, 0.8235867446]
0.82179987005
accuracy_normalized 0.82179987005 [0.8200129955000001, 0.8151397011, 0.8284600390000001, 0.8235867446]
0.82179987005
accuracy_not_normalized 2529.5 [2524.0, 2509.0, 2550.0, 2535.0]
2529.5
weighted-f1 0.82191631995 [0.8203592969, 0.8162518093000001, 0.8278511403000001, 0.8232030333]
0.82191631995
weighted-precision 0.8228853164250001 [0.8207471194, 0.819476370100000

In [16]:
our_results_df_final = pd.concat([results_with_mean_values_df,our_results_df])

In [17]:
our_results_df_final.to_excel('our_results.xlsx', index=False)