## Language

In [21]:
import pandas as pd
import os
import sys
import numpy as np
import pandas as pd
from scipy.stats import kruskal, mannwhitneyu
from sklearn.metrics import f1_score
from sklearn.utils import resample
from itertools import combinations

import random
import scikit_posthocs as sp
import scipy.stats as stats
import numpy as np

utils = os.path.abspath('../scripts/utils/')
sys.path.append(utils)

from preprocessing import loadDataset
from evaluation import extractAspects, convertLabels, createResults
from types import SimpleNamespace
from pingouin import kruskal
import pingouin as pg
import chardet
import codecs


# Never truncate the column list when a results frame is displayed.
pd.set_option('display.max_columns', None)
# Seed Python's `random` module.
random.seed(42)

# Shared selection of dataset / model / language; later cells attach the
# task, the low-resource setting and the loaded result tables to it.
args = SimpleNamespace(
    dataset='rest-16',
    model="meta-llama-Meta-Llama-3-8B",
    lang='en',
)

# One independent container per task; the ACD cells key theirs by
# low-resource setting ('0', '1000', '500').
stats_acd, stats_acsa, stats_e2e, stats_acsd = {}, {}, {}, {}

def computePromptStatistics(args):
    """Test the best fine-tuned LLM prompt against the task's baselines.

    For the dataset, task, model, language and low-resource setting selected
    in ``args``, the micro-F1 of splits 1-5 is collected for every prompt
    style and every baseline method. Only the prompt with the highest mean
    micro-F1 is kept. The remaining columns are checked for normality; if
    all are normal a repeated-measures ANOVA and paired t-tests are run,
    otherwise a Friedman test and Wilcoxon tests. Pairwise p-values are
    Holm-corrected. Intermediate tables are printed / displayed.

    Args:
        args: Namespace providing ``results`` (LLM runs), ``results_baseline``
            (baseline runs), ``dataset``, ``task``, ``model``, ``lang`` and
            ``lr_setting`` (``0`` selects the LLM runs stored as ``'full'``).

    Returns:
        pandas.DataFrame with one row per pairwise comparison: test name,
        compared columns, mean/std of both columns, test statistic,
        uncorrected and Holm-corrected p-value, and the significance flag.

    Raises:
        IndexError: if a baseline has no run for one of the splits 1-5.
        ValueError: if no prompt has a run for every split.
    """
    lr_setting = 'full' if args.lr_setting == 0 else str(args.lr_setting)

    # Split '0' is excluded; only the splits 1-5 enter the statistics.
    results_sub = args.results[np.logical_and.reduce([args.results['dataset'] == args.dataset,
                                                      args.results['task'] == args.task,
                                                      args.results['model_name'] == args.model,
                                                      args.results['split'] != str(0),
                                                      args.results['lr_setting'] == lr_setting,
                                                      args.results['model_lang'] == args.lang])].sort_values(by=['f1-micro'], ascending=False)

    # The baseline table stores the setting as the plain number string
    # (e.g. '0'), not as 'full'.
    results_sub_baseline = args.results_baseline[np.logical_and.reduce([args.results_baseline['lr-setting'] == str(args.lr_setting),
                                                                        args.results_baseline['dataset'] == args.dataset,
                                                                        args.results_baseline['task'] == args.task,
                                                                        args.results_baseline['split'] != str(0)])].sort_values(by=['f1-micro'], ascending=False)

    print(results_sub_baseline)
    results_sub = results_sub[['lang', 'dataset', 'task', 'prompt', 'lr', 'lora_r', 'lora_alpha', 'lora_dropout', 'split', 'lr_setting', 'model_name', 'epoch', 'model_config', 'path', 'f1-micro', 'f1-macro', 'accuracy']]
    results_sub_baseline = results_sub_baseline[['task', 'method', 'dataset', 'learning-rate', 'batch_size', 'lr-setting', 'split', 'epochs', 'f1-micro', 'f1-macro', 'accuracy']]

    # Keep, per configuration and split, the row with the highest micro-F1.
    idx_max = results_sub.groupby(['model_config', 'split'])['f1-micro'].idxmax()
    results_per_epoch = results_sub.loc[idx_max]

    if args.task == 'acd':
        prompts = ['short', 'long']
        baselines = ['hier-gcn', 'mlcf']
    elif args.task == 'acsa':
        prompts = ['short', 'long', 'cot']
        baselines = ['hier-gcn', 'mlcf']
    elif args.task == 'e2e':
        prompts = ['short', 'long', 'cot']
        baselines = ['instructAbsa', 'tas-bert']
    else:
        prompts = ['short', 'long', 'cot']
        baselines = ['para', 'e2tp']

    f1_prompts = {}

    for prompt in prompts:
        f1 = {}
        try:
            for i in range(1, 6):
                is_match = (results_per_epoch['split'] == str(i)) & (results_per_epoch['prompt'] == prompt)
                # NOTE(review): if several configurations match, the first
                # one in group-key order is used, which is not necessarily
                # the best - confirm only one configuration exists.
                f1[i] = results_per_epoch.loc[is_match, 'f1-micro'].iloc[0]
        except IndexError:
            # A prompt without a run for every split is left out.
            continue
        f1_prompts[prompt] = f1

    for method in baselines:
        f1 = {}
        for i in range(1, 6):
            rows = results_sub_baseline[(results_sub_baseline['split'] == str(i)) & (results_sub_baseline['method'] == method)]
            if rows.empty:
                raise IndexError(f"no '{method}' baseline run found for split {i}")
            f1[i] = rows['f1-micro'].iloc[0]
        f1_prompts[method] = f1

    df_prompts = pd.DataFrame(f1_prompts)

    display(df_prompts)

    # Only use the best performing FT-LLM prompt
    available_prompts = [prompt for prompt in prompts if prompt in df_prompts.columns]
    if not available_prompts:
        raise ValueError('no prompt has a run for every split 1-5')

    # Calculate the average F1 scores and pick the best prompt
    avg_f1 = df_prompts[available_prompts].mean()
    best_prompt = avg_f1.idxmax()

    # Drop only prompts that are actually present: a prompt skipped above has
    # no column, and asking drop() to remove it would raise a KeyError.
    prompts_to_drop = [prompt for prompt in available_prompts if prompt != best_prompt]
    df_prompts = df_prompts.drop(columns=prompts_to_drop)

    normality_results = {col: pg.normality(df_prompts[col]) for col in df_prompts.columns}

    for item in normality_results.values():
        display(item)

    all_normal = all([result['normal'].iloc[0] for result in normality_results.values()])

    # Long format (split, prompt, f1) as required by the repeated-measures ANOVA.
    long_format = df_prompts.melt(var_name='prompt', value_name='f1', ignore_index=False).reset_index().rename(columns={'index': 'split'})
    print(long_format)

    if all_normal:
        # All columns are normally distributed: repeated-measures ANOVA
        rm_anova = pg.rm_anova(dv='f1', within='prompt', subject='split', data=long_format)
        print("Repeated Measures ANOVA Result:")
        print(rm_anova)
    else:
        # At least one column is not normally distributed: Friedman test
        friedman = pg.friedman(df_prompts)
        print("Friedman Test Result:")
        print(friedman)

    # Pairwise comparisons
    results = []

    for col1, col2 in combinations(df_prompts.columns, 2):
        if all_normal:
            # All columns normal: paired t-test
            test = 't-test'
            test_result = pg.ttest(df_prompts[col1], df_prompts[col2], paired=True, alternative='two-sided')
            statistic = test_result['T']['T-test']
        else:
            # Otherwise: Wilcoxon signed-rank test
            test = 'wilcoxon'
            test_result = pg.wilcoxon(df_prompts[col1], df_prompts[col2], alternative='two-sided')
            statistic = test_result['W-val']['Wilcoxon']

        results.append({
            'test': test,
            'comparison': f'{col1} vs {col2}',
            'mean 1': np.mean(df_prompts[col1]),
            'std 1': np.std(df_prompts[col1]),
            'mean 2': np.mean(df_prompts[col2]),
            'std 2': np.std(df_prompts[col2]),
            'statistic': statistic,
            'p_value': test_result['p-val'].iloc[0]
        })

    # Collect the test results in a DataFrame
    results_df = pd.DataFrame(results)

    # Holm(-Bonferroni) correction: multicomp returns (reject flags, corrected p-values)
    corrected_p = pg.multicomp(results_df['p_value'], method='holm', alpha=0.05)
    results_df['corrected_p_value'] = corrected_p[1]
    results_df['significant'] = corrected_p[0]

    return results_df

def _collectSplitScores(results_sub, lr_setting, prompt):
    """Return ``{split: micro-F1}`` for splits 1-5 of one prompt and setting.

    The first matching row of ``results_sub`` is used for each split.

    Raises:
        IndexError: if one of the splits has no matching run.
    """
    f1 = {}
    for i in range(1, 6):
        is_match = np.logical_and.reduce([results_sub['lr_setting'] == lr_setting,
                                          results_sub['split'] == str(i),
                                          results_sub['prompt'] == prompt])
        f1[i] = results_sub.loc[is_match, 'f1-micro'].iloc[0]
    return f1


def _reportLowResourceComparison(df_splits, title):
    """Run and print the omnibus and pairwise tests for one score table.

    ``df_splits`` holds one column per low-resource setting and one row per
    split. If every column passes the normality test a repeated-measures
    ANOVA and paired t-tests are used, otherwise a Friedman test and
    Wilcoxon tests. Pairwise p-values are Holm-corrected and the resulting
    table is displayed under ``title``.
    """
    display(df_splits)

    if df_splits.shape[1] < 2:
        # Nothing can be compared with fewer than two settings.
        print('Skipping comparison: fewer than two low-resource settings with complete runs')
        return

    normality_results = {col: pg.normality(df_splits[col]) for col in df_splits.columns}

    for item in normality_results.values():
        display(item)

    all_normal = all([result['normal'].iloc[0] for result in normality_results.values()])

    # Long format (split, prompt, f1); the 'prompt' column holds the setting name here.
    long_format = df_splits.melt(var_name='prompt', value_name='f1', ignore_index=False).reset_index().rename(columns={'index': 'split'})
    print(long_format)

    if all_normal:
        # All columns are normally distributed: repeated-measures ANOVA
        rm_anova = pg.rm_anova(dv='f1', within='prompt', subject='split', data=long_format)
        print("Repeated Measures ANOVA Result:")
        print(rm_anova)
    else:
        # At least one column is not normally distributed: Friedman test
        friedman = pg.friedman(df_splits)
        print("Friedman Test Result:")
        print(friedman)

    # Pairwise comparisons
    results = []

    for col1, col2 in combinations(df_splits.columns, 2):
        if all_normal:
            # All columns normal: paired t-test
            test = 't-test'
            test_result = pg.ttest(df_splits[col1], df_splits[col2], paired=True, alternative='two-sided')
            statistic = test_result['T']['T-test']
        else:
            # Otherwise: Wilcoxon signed-rank test
            test = 'wilcoxon'
            test_result = pg.wilcoxon(df_splits[col1], df_splits[col2], alternative='two-sided')
            statistic = test_result['W-val']['Wilcoxon']

        results.append({
            'test': test,
            'comparison': f'{col1} vs {col2}',
            'mean 1': np.mean(df_splits[col1]),
            'std 1': np.std(df_splits[col1]),
            'mean 2': np.mean(df_splits[col2]),
            'std 2': np.std(df_splits[col2]),
            'statistic': statistic,
            'p_value': test_result['p-val'].iloc[0]
        })

    # Collect the test results in a DataFrame
    results_df = pd.DataFrame(results)

    # Holm(-Bonferroni) correction: multicomp returns (reject flags, corrected p-values)
    corrected_p = pg.multicomp(results_df['p_value'], method='holm', alpha=0.05)
    results_df['corrected_p_value'] = corrected_p[1]
    results_df['significant'] = corrected_p[0]

    print(title)
    display(results_df)


def computeLowResourceStatistics(args):
    """Compare micro-F1 across the low-resource settings '1000', '500', 'full'.

    Two analyses are printed / displayed, nothing is returned:

    1. For every prompt style of the task, the three settings are compared
       with each other.
    2. For every setting the prompt with the highest mean micro-F1 is
       chosen, and the three resulting columns are compared.

    Args:
        args: Namespace providing ``results`` (LLM runs), ``dataset``,
            ``task``, ``model`` and ``lang``.
    """
    # Split '0' is excluded; only the splits 1-5 enter the statistics.
    results_sub = args.results[np.logical_and.reduce([args.results['dataset'] == args.dataset,
                                                      args.results['task'] == args.task,
                                                      args.results['model_name'] == args.model,
                                                      args.results['split'] != str(0),
                                                      args.results['model_lang'] == args.lang])].sort_values(by=['f1-micro'], ascending=False)

    results_sub = results_sub[['lang', 'dataset', 'task', 'prompt', 'lr', 'lora_r', 'lora_alpha', 'lora_dropout', 'split', 'lr_setting', 'model_name', 'epoch', 'model_config', 'path', 'f1-micro', 'f1-macro', 'accuracy']]

    if args.task == 'acd':
        prompts = ['short', 'long']
    else:
        prompts = ['short', 'long', 'cot']

    lr_settings = ('1000', '500', 'full')

    # Analysis 1: settings compared within each prompt.
    for prompt in prompts:
        f1_splits = {}

        for lr_setting in lr_settings:
            try:
                f1_splits[lr_setting] = _collectSplitScores(results_sub, lr_setting, prompt)
            except IndexError:
                # A setting without a run for every split is left out.
                continue

        _reportLowResourceComparison(pd.DataFrame(f1_splits), f'Results for LR-Comparison of :  {prompt}')

    # Analysis 2: best performing prompt per low-resource setting.
    f1_splits = {}

    for prompt in prompts:
        for lr_setting in lr_settings:
            try:
                f1 = _collectSplitScores(results_sub, lr_setting, prompt)
            except IndexError:
                continue

            # Keep this prompt if the setting has no scores yet or if its
            # mean micro-F1 beats the prompt stored so far.
            if lr_setting not in f1_splits or np.mean(list(f1.values())) > np.mean(list(f1_splits[lr_setting].values())):
                f1_splits[lr_setting] = f1

    _reportLowResourceComparison(pd.DataFrame(f1_splits), 'Results for LR-Comparison of best Prompt per LR-Setting')

# rest-16

## ACD

In [22]:
# LLM-based Method
# Every sub-folder of RESULTS_PATH is one run; its name is the
# '_'-separated list of run parameters named by the first 19 entries of
# col_names.

runs = []
RESULTS_PATH = '../results_final/'
col_names = ['model_lang', 'dataset', 'model_shots', 'model_prompt', 'model_task', 'lr', 'lora_r', 'lora_alpha', 'lora_dropout', 'model_quant', 'split', 'lr_setting', 'model_name', 'lang', 'shots', 'prompt', 'task', 'quant', 'epoch', 'model_config', 'path', 'f1-micro', 'f1-macro', 'accuracy']
folder_names = [folder for folder in os.listdir(RESULTS_PATH) if os.path.isdir(os.path.join(RESULTS_PATH, folder)) and folder != '.ipynb_checkpoints']

# Metrics file to read for each task (task = 5th field of the folder name).
METRICS_FILES = {
    'acd': 'metrics_asp.tsv',
    'acsa': 'metrics_asp_pol.tsv',
    'e2e': 'metrics_pol.tsv',
    'acsd': 'metrics_phrases.tsv',
}
skipped_folders = []

for folder_name in folder_names:
    cond_parameters = folder_name.split('_')
    try:
        # 'path' keeps the full parameter string; 'model_config' drops the
        # split (field 10) and the epoch (last field) so that runs of the
        # same configuration can be grouped.
        model_config_full = cond_parameters.copy()
        model_config = cond_parameters.copy()
        model_config.pop(10)
        model_config.pop(-1)

        # An unknown task raises KeyError and the folder is skipped, instead
        # of reusing the file name chosen for a previous folder.
        filename = METRICS_FILES[cond_parameters[4]]

        df = pd.read_csv(os.path.join(RESULTS_PATH, folder_name, filename), sep='\t')
        df = df.set_index(df.columns[0])

        run = cond_parameters + [
            '_'.join(model_config),
            '_'.join(model_config_full),
            df.loc['Micro-AVG', 'f1'],
            df.loc['Macro-AVG', 'f1'],
            df.loc['Micro-AVG', 'accuracy'],
        ]
        if len(run) != len(col_names):
            raise ValueError(f'unexpected number of fields in folder name: {folder_name}')
    except (OSError, KeyError, IndexError, ValueError):
        # Missing / unreadable metrics file or a folder name that does not
        # follow the naming scheme: leave the run out.
        skipped_folders.append(folder_name)
        continue
    runs.append(run)

if skipped_folders:
    print(f'Skipped {len(skipped_folders)} LLM result folder(s) without usable metrics')

results_all = pd.DataFrame(runs, columns=col_names)


###
# Baselines
##

# Multi-label Classification
METHOD = 'mlcf'
RESULTS_PATH = ''

col_names = ['task', 'method', 'dataset', 'lr-setting', 'split', 'learning-rate', 'batch_size', 'epochs', 'f1-micro', 'f1-macro', 'accuracy']
folder_names = [folder for folder in os.listdir(os.path.join(RESULTS_PATH, METHOD)) if os.path.isdir(os.path.join(RESULTS_PATH, METHOD, folder)) and folder != '.ipynb_checkpoints']
runs = []
skipped_folders = []

for folder_name in folder_names:
    try:
        df = pd.read_csv(os.path.join(RESULTS_PATH, METHOD, folder_name, 'metrics_asp.tsv'), sep='\t')
        df = df.set_index(df.columns[0])
        scores = [df.loc['Micro-AVG', 'f1'], df.loc['Macro-AVG', 'f1'], df.loc['Micro-AVG', 'accuracy']]
    except (OSError, KeyError, ValueError):
        skipped_folders.append(folder_name)
        continue

    cond_parameters = folder_name.split('_') + scores
    # The folder name carries no method field; insert it as second column.
    cond_parameters[1:1] = [METHOD]

    runs.append(cond_parameters)

if skipped_folders:
    print(f'Skipped {len(skipped_folders)} {METHOD} result folder(s) without usable metrics')

# Hier-GCN
METHOD = 'hier-gcn'
RESULTS_PATH = '../../../ABSA-Baselines/ACSA-HGCN-custom/output_ref'

col_names = ['task', 'method', 'dataset', 'lr-setting', 'split', 'learning-rate', 'batch_size', 'epochs', 'f1-micro', 'f1-macro', 'accuracy']

# Test each entry itself (not the parent directory) so that plain files in
# RESULTS_PATH are not mistaken for run folders.
folder_names = [folder for folder in os.listdir(RESULTS_PATH) if os.path.isdir(os.path.join(RESULTS_PATH, folder)) and folder != '.ipynb_checkpoints']

for folder_name in folder_names:
    metrics_dict = {}

    with open(os.path.join(RESULTS_PATH, folder_name, 'cate_eval_results.txt'), 'r') as file:
        for line in file:
            line = line.strip()
            if not line:
                # Tolerate blank lines (e.g. a trailing newline).
                continue
            # Each line has the form '<metric> = <value>'.
            key, value = line.split('=')
            metrics_dict[key.strip()] = float(value.strip())

    cond_parameters = folder_name.split('_')
    # Only micro-F1 is read for this baseline; macro-F1 and accuracy stay empty.
    cond_parameters.append(metrics_dict['micro-f1'])
    cond_parameters.extend([None, None])
    # The first field of the folder name is replaced by the task name.
    cond_parameters[0] = 'acd'
    cond_parameters[1:1] = [METHOD]

    runs.append(cond_parameters)

results_baseline = pd.DataFrame(runs, columns=col_names)

args.results = results_all
args.results_baseline = results_baseline

In [23]:
# Show the LLM runs whose lr_setting field is 'orig'.
results_all.loc[results_all['lr_setting'].eq('orig')]

Unnamed: 0,model_lang,dataset,model_shots,model_prompt,model_task,lr,lora_r,lora_alpha,lora_dropout,model_quant,split,lr_setting,model_name,lang,shots,prompt,task,quant,epoch,model_config,path,f1-micro,f1-macro,accuracy
257,en,rest-16,,long,acsa,3e-05,32,32,0.05,4,0,orig,meta-llama-Meta-Llama-3-8B,en,,long,acsa,16,7,en_rest-16__long_acsa_3e-05_32_32_0.05_4_orig_...,en_rest-16__long_acsa_3e-05_32_32_0.05_4_0_ori...,0.8161,0.7548,0.6893
259,en,rest-16,,short,acsd,0.0003,8,8,0.05,4,0,orig,meta-llama-Meta-Llama-3-8B,en,,short,acsd,16,9,en_rest-16__short_acsd_0.0003_8_8_0.05_4_orig_...,en_rest-16__short_acsd_0.0003_8_8_0.05_4_0_ori...,0.7248,0.603,0.5684
308,en,GERestaurant,,short,acd,3e-05,32,64,0.05,4,0,orig,meta-llama-Meta-Llama-3-8B,en,,short,acd,16,10,en_GERestaurant__short_acd_3e-05_32_64_0.05_4_...,en_GERestaurant__short_acd_3e-05_32_64_0.05_4_...,0.8843,0.8787,0.7927
1017,en,rest-16,,short,acd,0.0003,8,16,0.05,4,0,orig,meta-llama-Meta-Llama-3-8B,en,,short,acd,16,8,en_rest-16__short_acd_0.0003_8_16_0.05_4_orig_...,en_rest-16__short_acd_0.0003_8_16_0.05_4_0_ori...,0.8333,0.7461,0.7143
1025,en,GERestaurant,,short,acsa,3e-05,8,16,0.05,4,0,orig,meta-llama-Meta-Llama-3-8B,en,,short,acsa,16,8,en_GERestaurant__short_acsa_3e-05_8_16_0.05_4_...,en_GERestaurant__short_acsa_3e-05_8_16_0.05_4_...,0.8443,0.8459,0.7305
1482,en,rest-16,,cot,e2e,0.0003,8,8,0.05,4,0,orig,meta-llama-Meta-Llama-3-8B,en,,cot,e2e,16,8,en_rest-16__cot_e2e_0.0003_8_8_0.05_4_orig_met...,en_rest-16__cot_e2e_0.0003_8_8_0.05_4_0_orig_m...,0.81,0.7462,0.6807
1571,en,GERestaurant,,long,acd,0.0003,8,16,0.05,4,0,orig,meta-llama-Meta-Llama-3-8B,en,,long,acd,16,10,en_GERestaurant__long_acd_0.0003_8_16_0.05_4_o...,en_GERestaurant__long_acd_0.0003_8_16_0.05_4_0...,0.8767,0.8744,0.7804
2270,en,rest-16,,cot,acsa,3e-05,32,64,0.05,4,0,orig,meta-llama-Meta-Llama-3-8B,en,,cot,acsa,16,5,en_rest-16__cot_acsa_3e-05_32_64_0.05_4_orig_m...,en_rest-16__cot_acsa_3e-05_32_64_0.05_4_0_orig...,0.8255,0.7663,0.7028
2508,en,GERestaurant,,long,acsa,3e-05,32,32,0.05,4,0,orig,meta-llama-Meta-Llama-3-8B,en,,long,acsa,16,9,en_GERestaurant__long_acsa_3e-05_32_32_0.05_4_...,en_GERestaurant__long_acsa_3e-05_32_32_0.05_4_...,0.847,0.8438,0.7346
2703,en,GERestaurant,,cot,e2e,0.0003,8,16,0.05,4,0,orig,meta-llama-Meta-Llama-3-8B,en,,cot,e2e,16,7,en_GERestaurant__cot_e2e_0.0003_8_16_0.05_4_or...,en_GERestaurant__cot_e2e_0.0003_8_16_0.05_4_0_...,0.6748,0.6153,0.5092


### Full Dataset

In [24]:
# lr_setting 0 selects the LLM runs stored as 'full' and the baseline runs
# stored as '0'.
args.task = 'acd'
args.lr_setting = 0

stats_acd['0'] = computePromptStatistics(args)
stats_acd['0']

    task    method  dataset lr-setting split learning-rate batch_size epochs  \
331  acd  hier-gcn  rest-16          0     4         5e-05          8   20.0   
305  acd  hier-gcn  rest-16          0     1         5e-05          8   20.0   
320  acd  hier-gcn  rest-16          0     2         5e-05          8   20.0   
288  acd  hier-gcn  rest-16          0     5         5e-05          8   20.0   
321  acd  hier-gcn  rest-16          0     3         5e-05          8   20.0   
286  acd      mlcf  rest-16          0     1         6e-05         16      3   
162  acd      mlcf  rest-16          0     3         6e-05         16      3   
47   acd      mlcf  rest-16          0     2         6e-05         16      3   
130  acd      mlcf  rest-16          0     4         6e-05         16      3   
150  acd      mlcf  rest-16          0     5         6e-05         16      3   

     f1-micro  f1-macro  accuracy  
331  0.846051       NaN       NaN  
305  0.831492       NaN       NaN  
320  0.8159

Unnamed: 0,short,long,hier-gcn,mlcf
1,0.8299,0.8497,0.831492,0.7776
2,0.8694,0.8407,0.815934,0.7615
3,0.8243,0.8493,0.807471,0.775
4,0.8509,0.8606,0.846051,0.7605
5,0.8459,0.8603,0.814305,0.7278


Unnamed: 0,W,pval,normal
long,0.891068,0.362504,True


Unnamed: 0,W,pval,normal
hier-gcn,0.914769,0.49677,True


Unnamed: 0,W,pval,normal
mlcf,0.851566,0.199519,True


    split    prompt        f1
0       1      long  0.849700
1       2      long  0.840700
2       3      long  0.849300
3       4      long  0.860600
4       5      long  0.860300
5       1  hier-gcn  0.831492
6       2  hier-gcn  0.815934
7       3  hier-gcn  0.807471
8       4  hier-gcn  0.846051
9       5  hier-gcn  0.814305
10      1      mlcf  0.777600
11      2      mlcf  0.761500
12      3      mlcf  0.775000
13      4      mlcf  0.760500
14      5      mlcf  0.727800
Repeated Measures ANOVA Result:
   Source  ddof1  ddof2          F     p-unc       ng2       eps
0  prompt      2      8  47.644578  0.000036  0.885842  0.746243


  W = np.prod(eig) / (eig.sum() / d) ** d


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,t-test,long vs hier-gcn,0.85212,0.007524,0.823051,0.013929,4.606105,0.009985,0.009985,True
1,t-test,long vs mlcf,0.85212,0.007524,0.76048,0.017737,8.07103,0.00128,0.00384,True
2,t-test,hier-gcn vs mlcf,0.823051,0.013929,0.76048,0.017737,6.03701,0.003796,0.007592,True


### 1000

In [25]:
# lr_setting 1000 is matched as the string '1000' in both result tables.
args.task = 'acd'
args.lr_setting = 1000

stats_acd['1000'] = computePromptStatistics(args)
stats_acd['1000']

    task    method  dataset lr-setting split learning-rate batch_size epochs  \
324  acd  hier-gcn  rest-16       1000     1         5e-05          8   20.0   
295  acd  hier-gcn  rest-16       1000     4         5e-05          8   20.0   
330  acd  hier-gcn  rest-16       1000     3         5e-05          8   20.0   
329  acd  hier-gcn  rest-16       1000     5         5e-05          8   20.0   
314  acd  hier-gcn  rest-16       1000     2         5e-05          8   20.0   
44   acd      mlcf  rest-16       1000     5         5e-05         16      5   
10   acd      mlcf  rest-16       1000     2         5e-05         16      5   
56   acd      mlcf  rest-16       1000     1         5e-05         16      5   
21   acd      mlcf  rest-16       1000     3         5e-05         16      5   
5    acd      mlcf  rest-16       1000     4         5e-05         16      5   

     f1-micro  f1-macro  accuracy  
324  0.832418       NaN       NaN  
295  0.811908       NaN       NaN  
330  0.7935

Unnamed: 0,short,long,hier-gcn,mlcf
1,0.8578,0.6897,0.832418,0.7633
2,0.8453,0.8212,0.788966,0.7715
3,0.8031,0.8113,0.79351,0.753
4,0.7668,0.8498,0.811908,0.3673
5,0.8,0.8152,0.78925,0.7801


Unnamed: 0,W,pval,normal
short,0.937715,0.649833,True


Unnamed: 0,W,pval,normal
hier-gcn,0.830513,0.140358,True


Unnamed: 0,W,pval,normal
mlcf,0.606467,0.000763,False


    split    prompt        f1
0       1     short  0.857800
1       2     short  0.845300
2       3     short  0.803100
3       4     short  0.766800
4       5     short  0.800000
5       1  hier-gcn  0.832418
6       2  hier-gcn  0.788966
7       3  hier-gcn  0.793510
8       4  hier-gcn  0.811908
9       5  hier-gcn  0.789250
10      1      mlcf  0.763300
11      2      mlcf  0.771500
12      3      mlcf  0.753000
13      4      mlcf  0.367300
14      5      mlcf  0.780100
Friedman Test Result:
          Source     W  ddof1    Q     p-unc
Friedman  Within  0.84      2  8.4  0.014996


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,wilcoxon,short vs hier-gcn,0.8146,0.032982,0.80321,0.016856,4.0,0.4375,0.4375,False
1,wilcoxon,short vs mlcf,0.8146,0.032982,0.68704,0.160121,0.0,0.0625,0.1875,False
2,wilcoxon,hier-gcn vs mlcf,0.80321,0.016856,0.68704,0.160121,0.0,0.0625,0.1875,False


### 500

In [26]:
# lr_setting 500 is matched as the string '500' in both result tables.
args.task = 'acd'
args.lr_setting = 500

stats_acd['500'] = computePromptStatistics(args)
stats_acd['500']

    task    method  dataset lr-setting split learning-rate batch_size epochs  \
325  acd  hier-gcn  rest-16        500     1         5e-05          8   20.0   
318  acd  hier-gcn  rest-16        500     2         5e-05          8   20.0   
287  acd      mlcf  rest-16        500     1         8e-05         16     10   
298  acd  hier-gcn  rest-16        500     4         5e-05          8   20.0   
308  acd  hier-gcn  rest-16        500     3         5e-05          8   20.0   
323  acd  hier-gcn  rest-16        500     5         5e-05          8   20.0   
11   acd      mlcf  rest-16        500     3         8e-05         16     10   
39   acd      mlcf  rest-16        500     5         8e-05         16     10   
36   acd      mlcf  rest-16        500     4         8e-05         16     10   
261  acd      mlcf  rest-16        500     2         8e-05         16     10   

     f1-micro  f1-macro  accuracy  
325  0.769448       NaN       NaN  
318  0.759712       NaN       NaN  
287  0.7567

Unnamed: 0,short,long,hier-gcn,mlcf
1,0.7657,0.7965,0.769448,0.7567
2,0.8287,0.784,0.759712,0.0
3,0.8143,0.7809,0.739264,0.7359
4,0.8493,0.761,0.746099,0.6897
5,0.7995,0.8134,0.73716,0.7336


Unnamed: 0,W,pval,normal
short,0.986424,0.965739,True


Unnamed: 0,W,pval,normal
hier-gcn,0.907965,0.455458,True


Unnamed: 0,W,pval,normal
mlcf,0.618197,0.001083,False


    split    prompt        f1
0       1     short  0.765700
1       2     short  0.828700
2       3     short  0.814300
3       4     short  0.849300
4       5     short  0.799500
5       1  hier-gcn  0.769448
6       2  hier-gcn  0.759712
7       3  hier-gcn  0.739264
8       4  hier-gcn  0.746099
9       5  hier-gcn  0.737160
10      1      mlcf  0.756700
11      2      mlcf  0.000000
12      3      mlcf  0.735900
13      4      mlcf  0.689700
14      5      mlcf  0.733600
Friedman Test Result:
          Source     W  ddof1    Q     p-unc
Friedman  Within  0.84      2  8.4  0.014996


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,wilcoxon,short vs hier-gcn,0.8115,0.028193,0.750337,0.012388,1.0,0.125,0.1875,False
1,wilcoxon,short vs mlcf,0.8115,0.028193,0.58318,0.292405,0.0,0.0625,0.1875,False
2,wilcoxon,hier-gcn vs mlcf,0.750337,0.012388,0.58318,0.292405,0.0,0.0625,0.1875,False


In [27]:
# Compare the low-resource settings ('1000', '500', 'full') for the ACD
# task; the function prints / displays its tables and returns nothing.
args.task = 'acd'

computeLowResourceStatistics(args)

Unnamed: 0,1000,500,full
1,0.8578,0.7657,0.8299
2,0.8453,0.8287,0.8694
3,0.8031,0.8143,0.8243
4,0.7668,0.8493,0.8509
5,0.8,0.7995,0.8459


Unnamed: 0,W,pval,normal
1000,0.937715,0.649833,True


Unnamed: 0,W,pval,normal
500,0.986424,0.965739,True


Unnamed: 0,W,pval,normal
full,0.956999,0.786948,True


    split prompt      f1
0       1   1000  0.8578
1       2   1000  0.8453
2       3   1000  0.8031
3       4   1000  0.7668
4       5   1000  0.8000
5       1    500  0.7657
6       2    500  0.8287
7       3    500  0.8143
8       4    500  0.8493
9       5    500  0.7995
10      1   full  0.8299
11      2   full  0.8694
12      3   full  0.8243
13      4   full  0.8509
14      5   full  0.8459
Repeated Measures ANOVA Result:
   Source  ddof1  ddof2         F     p-unc      ng2       eps
0  prompt      2      8  1.551417  0.269541  0.23213  0.543105
Results for LR-Comparison of :  short


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,t-test,1000 vs 500,0.8146,0.032982,0.8115,0.028193,0.110849,0.917075,0.917075,False
1,t-test,1000 vs full,0.8146,0.032982,0.84408,0.01602,-1.617569,0.181064,0.362129,False
2,t-test,500 vs full,0.8115,0.028193,0.84408,0.01602,-2.790447,0.04929,0.147869,False


Unnamed: 0,1000,500,full
1,0.6897,0.7965,0.8497
2,0.8212,0.784,0.8407
3,0.8113,0.7809,0.8493
4,0.8498,0.761,0.8606
5,0.8152,0.8134,0.8603


Unnamed: 0,W,pval,normal
1000,0.767566,0.042937,False


Unnamed: 0,W,pval,normal
500,0.986798,0.967326,True


Unnamed: 0,W,pval,normal
full,0.891068,0.362504,True


    split prompt      f1
0       1   1000  0.6897
1       2   1000  0.8212
2       3   1000  0.8113
3       4   1000  0.8498
4       5   1000  0.8152
5       1    500  0.7965
6       2    500  0.7840
7       3    500  0.7809
8       4    500  0.7610
9       5    500  0.8134
10      1   full  0.8497
11      2   full  0.8407
12      3   full  0.8493
13      4   full  0.8606
14      5   full  0.8603
Friedman Test Result:
          Source     W  ddof1    Q     p-unc
Friedman  Within  0.84      2  8.4  0.014996
Results for LR-Comparison of :  long


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,wilcoxon,1000 vs 500,0.79744,0.055537,0.78716,0.017374,5.0,0.625,0.625,False
1,wilcoxon,1000 vs full,0.79744,0.055537,0.85212,0.007524,0.0,0.0625,0.1875,False
2,wilcoxon,500 vs full,0.78716,0.017374,0.85212,0.007524,0.0,0.0625,0.1875,False


Unnamed: 0,1000,500,full
1,0.8578,0.7657,0.8497
2,0.8453,0.8287,0.8407
3,0.8031,0.8143,0.8493
4,0.7668,0.8493,0.8606
5,0.8,0.7995,0.8603


Unnamed: 0,W,pval,normal
1000,0.937715,0.649833,True


Unnamed: 0,W,pval,normal
500,0.986424,0.965739,True


Unnamed: 0,W,pval,normal
full,0.891068,0.362504,True


    split prompt      f1
0       1   1000  0.8578
1       2   1000  0.8453
2       3   1000  0.8031
3       4   1000  0.7668
4       5   1000  0.8000
5       1    500  0.7657
6       2    500  0.8287
7       3    500  0.8143
8       4    500  0.8493
9       5    500  0.7995
10      1   full  0.8497
11      2   full  0.8407
12      3   full  0.8493
13      4   full  0.8606
14      5   full  0.8603
Repeated Measures ANOVA Result:
   Source  ddof1  ddof2        F    p-unc       ng2       eps
0  prompt      2      8  2.25058  0.16771  0.345225  0.635555
Results for LR-Comparison of best Prompt per LR-Setting


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,t-test,1000 vs 500,0.8146,0.032982,0.8115,0.028193,0.110849,0.917075,0.917075,False
1,t-test,1000 vs full,0.8146,0.032982,0.85212,0.007524,-1.922588,0.126893,0.253786,False
2,t-test,500 vs full,0.8115,0.028193,0.85212,0.007524,-2.87248,0.045355,0.136065,False


## ACSA

In [28]:
# Collect the LLM runs for the ACSA task. Every sub-folder of RESULTS_PATH is
# one run; its underscore-separated name encodes the run configuration (the
# first 19 entries of col_names) and it contains the metrics file read below.
runs = []
RESULTS_PATH = '../results_final/'
col_names = ['model_lang', 'dataset', 'model_shots', 'model_prompt', 'model_task', 'lr', 'lora_r', 'lora_alpha', 'lora_dropout', 'model_quant', 'split', 'lr_setting', 'model_name', 'lang', 'shots', 'prompt', 'task', 'quant', 'epoch', 'model_config', 'path', 'f1-micro', 'f1-macro', 'accuracy']
folder_names = [folder for folder in os.listdir(RESULTS_PATH) if os.path.isdir(os.path.join(RESULTS_PATH, folder)) and folder != '.ipynb_checkpoints']

for folder_name in folder_names:
    try:
        df = pd.read_csv(os.path.join(RESULTS_PATH, folder_name, 'metrics_asp_pol.tsv'), sep = '\t')
        df = df.set_index(df.columns[0])
        cond_parameters = folder_name.split('_')

        # Configuration string shared by all splits/epochs of one condition:
        # drop the split field (position 10) first, then the trailing epoch field.
        model_config = cond_parameters.copy()
        del model_config[10]
        del model_config[-1]

        cond_parameters.extend([
            '_'.join(model_config),
            folder_name,  # the full folder name identifies the individual run ('path' column)
            df.loc['Micro-AVG', 'f1'],
            df.loc['Macro-AVG', 'f1'],
            df.loc['Micro-AVG', 'accuracy'],
        ])
        runs.append(cond_parameters)
    except (OSError, KeyError, IndexError, ValueError):
        # Best effort: skip folders without a readable ACSA metrics file or with
        # an unexpected name/table layout (pandas parser errors are ValueErrors).
        # Unlike a bare ``except`` this lets KeyboardInterrupt/SystemExit through.
        continue

results_all = pd.DataFrame(runs, columns = col_names)


###
# Baselines
##

# Multi-label classification
METHOD = 'mlcf'
RESULTS_PATH = ''

# 'method' is part of the row layout because METHOD is inserted into every row below.
col_names = ['task', 'method', 'dataset', 'lr-setting', 'split', 'learning-rate', 'batch_size', 'epochs', 'f1-micro', 'f1-macro', 'accuracy']
folder_names = [folder for folder in os.listdir(os.path.join(RESULTS_PATH, METHOD)) if os.path.isdir(os.path.join(RESULTS_PATH, METHOD, folder)) and folder != '.ipynb_checkpoints']
runs = []

for folder_name in folder_names:
    try:
        df = pd.read_csv(os.path.join(RESULTS_PATH, METHOD, folder_name, 'metrics_asp_pol.tsv'), sep = '\t')
        df = df.set_index(df.columns[0])

        # The underscore-separated folder name carries the run parameters.
        cond_parameters = folder_name.split('_')

        cond_parameters.append(df.loc['Micro-AVG', 'f1'])
        cond_parameters.append(df.loc['Macro-AVG', 'f1'])
        cond_parameters.append(df.loc['Micro-AVG', 'accuracy'])
        # Put the method name directly after the first field of the folder name.
        cond_parameters.insert(1, METHOD)

        runs.append(cond_parameters)
    except (OSError, KeyError, IndexError, ValueError):
        # Best effort: skip folders without a readable metrics file or with an
        # unexpected table layout (pandas parser errors are ValueErrors).
        # Unlike a bare ``except`` this lets KeyboardInterrupt/SystemExit through.
        continue

# Hier-GCN
METHOD = 'hier-gcn'
RESULTS_PATH = '../../../ABSA-Baselines/ACSA-HGCN-custom/output_ref'

col_names = ['task', 'method', 'dataset', 'lr-setting', 'split', 'learning-rate', 'batch_size', 'epochs', 'f1-micro', 'f1-macro', 'accuracy']

# Keep only real run directories: the check has to be made on the entry itself,
# not on RESULTS_PATH, otherwise stray files slip through and break open() below.
folder_names = [folder for folder in os.listdir(RESULTS_PATH) if os.path.isdir(os.path.join(RESULTS_PATH, folder)) and folder != '.ipynb_checkpoints']

for folder_name in folder_names:
    metrics_dict = {}

    # eval_results.txt holds one "<metric> = <value>" pair per line.
    with open(os.path.join(RESULTS_PATH, folder_name, 'eval_results.txt'), 'r') as file:
        for line in file:
            line = line.strip()
            if not line:
                # Tolerate blank lines (e.g. a trailing newline at the end of the file).
                continue
            key, value = line.split('=')
            metrics_dict[key.strip()] = float(value.strip())

    # The underscore-separated folder name carries the run parameters.
    cond_parameters = folder_name.split('_')
    cond_parameters.append(metrics_dict['micro-f1'])
    # Only micro-F1 is read here; f1-macro and accuracy stay empty.
    cond_parameters.extend([None, None])
    # The first field of the folder name is replaced by the task label.
    cond_parameters[0] = 'acsa'
    cond_parameters.insert(1, METHOD)

    runs.append(cond_parameters)

# `runs` also contains the rows collected for the other baseline above.
results_baseline = pd.DataFrame(runs, columns = col_names)

args.results = results_all
args.results_baseline = results_baseline

### Full Dataset

In [29]:
# ACSA on the full training set: computePromptStatistics maps lr_setting 0 to 'full'.
args.task, args.lr_setting = 'acsa', 0

stats_acsa['0'] = computePromptStatistics(args)
stats_acsa['0']

     task    method  dataset lr-setting split learning-rate batch_size epochs  \
187  acsa  hier-gcn  rest-16          0     4         5e-05          8   20.0   
144  acsa  hier-gcn  rest-16          0     5         5e-05          8   20.0   
161  acsa  hier-gcn  rest-16          0     1         5e-05          8   20.0   
176  acsa  hier-gcn  rest-16          0     2         5e-05          8   20.0   
177  acsa  hier-gcn  rest-16          0     3         5e-05          8   20.0   
18   acsa      mlcf  rest-16          0     3         8e-05         16      3   
24   acsa      mlcf  rest-16          0     4         8e-05         16      3   
3    acsa      mlcf  rest-16          0     5         8e-05         16      3   
80   acsa      mlcf  rest-16          0     1         8e-05         16      3   
8    acsa      mlcf  rest-16          0     2         8e-05         16      3   

     f1-micro  f1-macro  accuracy  
187  0.757697       NaN       NaN  
144  0.731774       NaN       NaN  


Unnamed: 0,short,long,cot,hier-gcn,mlcf
1,0.8038,0.8407,0.7755,0.718232,0.5276
2,0.8266,0.8242,0.8108,0.708791,0.4063
3,0.7688,0.7827,0.7809,0.704023,0.5627
4,0.8351,0.8237,0.8065,0.757697,0.5331
5,0.8015,0.7638,0.7965,0.731774,0.5324


Unnamed: 0,W,pval,normal
short,0.939782,0.664398,True


Unnamed: 0,W,pval,normal
hier-gcn,0.913764,0.490537,True


Unnamed: 0,W,pval,normal
mlcf,0.746507,0.027618,False


    split    prompt        f1
0       1     short  0.803800
1       2     short  0.826600
2       3     short  0.768800
3       4     short  0.835100
4       5     short  0.801500
5       1  hier-gcn  0.718232
6       2  hier-gcn  0.708791
7       3  hier-gcn  0.704023
8       4  hier-gcn  0.757697
9       5  hier-gcn  0.731774
10      1      mlcf  0.527600
11      2      mlcf  0.406300
12      3      mlcf  0.562700
13      4      mlcf  0.533100
14      5      mlcf  0.532400
Friedman Test Result:
          Source    W  ddof1     Q     p-unc
Friedman  Within  1.0      2  10.0  0.006738


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,wilcoxon,short vs hier-gcn,0.80716,0.023123,0.724104,0.019285,0.0,0.0625,0.1875,False
1,wilcoxon,short vs mlcf,0.80716,0.023123,0.51242,0.054492,0.0,0.0625,0.1875,False
2,wilcoxon,hier-gcn vs mlcf,0.724104,0.019285,0.51242,0.054492,0.0,0.0625,0.1875,False


### 1000

In [30]:
# ACSA, low-resource setting '1000'.
args.task, args.lr_setting = 'acsa', 1000

stats_acsa['1000'] = computePromptStatistics(args)
stats_acsa['1000']

     task    method  dataset lr-setting split learning-rate batch_size epochs  \
151  acsa  hier-gcn  rest-16       1000     4         5e-05          8   20.0   
180  acsa  hier-gcn  rest-16       1000     1         5e-05          8   20.0   
185  acsa  hier-gcn  rest-16       1000     5         5e-05          8   20.0   
170  acsa  hier-gcn  rest-16       1000     2         5e-05          8   20.0   
186  acsa  hier-gcn  rest-16       1000     3         5e-05          8   20.0   
13   acsa      mlcf  rest-16       1000     4         6e-05         16      5   
81   acsa      mlcf  rest-16       1000     2         6e-05         16      5   
131  acsa      mlcf  rest-16       1000     1         6e-05         16      5   
106  acsa      mlcf  rest-16       1000     5         6e-05         16      5   
66   acsa      mlcf  rest-16       1000     3         6e-05         16      5   

     f1-micro  f1-macro  accuracy  
151  0.730717       NaN       NaN  
180  0.728022       NaN       NaN  


Unnamed: 0,short,long,cot,hier-gcn,mlcf
1,0.8005,0.7934,0.799,0.728022,0.4313
2,0.8162,0.8026,0.8162,0.70069,0.4487
3,0.8015,0.799,0.7729,0.690265,0.3361
4,0.8009,0.8029,0.8126,0.730717,0.5071
5,0.747,0.8111,0.7955,0.704385,0.3825


Unnamed: 0,W,pval,normal
long,0.96814,0.863187,True


Unnamed: 0,W,pval,normal
hier-gcn,0.890936,0.361833,True


Unnamed: 0,W,pval,normal
mlcf,0.989841,0.979135,True


    split    prompt        f1
0       1      long  0.793400
1       2      long  0.802600
2       3      long  0.799000
3       4      long  0.802900
4       5      long  0.811100
5       1  hier-gcn  0.728022
6       2  hier-gcn  0.700690
7       3  hier-gcn  0.690265
8       4  hier-gcn  0.730717
9       5  hier-gcn  0.704385
10      1      mlcf  0.431300
11      2      mlcf  0.448700
12      3      mlcf  0.336100
13      4      mlcf  0.507100
14      5      mlcf  0.382500
Repeated Measures ANOVA Result:
   Source  ddof1  ddof2           F         p-unc       ng2       eps
0  prompt      2      8  157.827321  3.732782e-07  0.955503  0.548717


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,t-test,long vs hier-gcn,0.8018,0.005775,0.710816,0.015864,9.894587,0.000586,0.000618,True
1,t-test,long vs mlcf,0.8018,0.005775,0.42114,0.05827,12.934065,0.000206,0.000618,True
2,t-test,hier-gcn vs mlcf,0.710816,0.015864,0.42114,0.05827,12.335026,0.000248,0.000618,True


### 500

In [31]:
# ACSA, low-resource setting '500'.
args.task, args.lr_setting = 'acsa', 500

stats_acsa['500'] = computePromptStatistics(args)
stats_acsa['500']

     task    method  dataset lr-setting split learning-rate batch_size epochs  \
181  acsa  hier-gcn  rest-16        500     1         5e-05          8   20.0   
174  acsa  hier-gcn  rest-16        500     2         5e-05          8   20.0   
179  acsa  hier-gcn  rest-16        500     5         5e-05          8   20.0   
154  acsa  hier-gcn  rest-16        500     4         5e-05          8   20.0   
164  acsa  hier-gcn  rest-16        500     3         5e-05          8   20.0   
113  acsa      mlcf  rest-16        500     1         5e-05         16     10   
107  acsa      mlcf  rest-16        500     2         5e-05         16     10   
82   acsa      mlcf  rest-16        500     3         5e-05         16     10   
115  acsa      mlcf  rest-16        500     5         5e-05         16     10   
141  acsa      mlcf  rest-16        500     4         5e-05         16     10   

     f1-micro  f1-macro  accuracy  
181  0.650636       NaN       NaN  
174  0.644604       NaN       NaN  


Unnamed: 0,short,long,cot,hier-gcn,mlcf
1,0.7713,0.7619,0.731,0.650636,0.5028
2,0.7537,0.7784,0.747,0.644604,0.4739
3,0.7572,0.7551,0.7782,0.622699,0.4597
4,0.8155,0.759,0.7588,0.624113,0.3142
5,0.7837,0.7807,0.7474,0.637462,0.4093


Unnamed: 0,W,pval,normal
short,0.904294,0.434072,True


Unnamed: 0,W,pval,normal
hier-gcn,0.908551,0.458933,True


Unnamed: 0,W,pval,normal
mlcf,0.905928,0.443509,True


    split    prompt        f1
0       1     short  0.771300
1       2     short  0.753700
2       3     short  0.757200
3       4     short  0.815500
4       5     short  0.783700
5       1  hier-gcn  0.650636
6       2  hier-gcn  0.644604
7       3  hier-gcn  0.622699
8       4  hier-gcn  0.624113
9       5  hier-gcn  0.637462
10      1      mlcf  0.502800
11      2      mlcf  0.473900
12      3      mlcf  0.459700
13      4      mlcf  0.314200
14      5      mlcf  0.409300
Repeated Measures ANOVA Result:
   Source  ddof1  ddof2          F     p-unc       ng2       eps
0  prompt      2      8  60.514542  0.000015  0.922931  0.507508


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,t-test,short vs hier-gcn,0.77628,0.022325,0.635903,0.011032,9.879331,0.000589,0.001767,True
1,t-test,short vs mlcf,0.77628,0.022325,0.43198,0.066223,7.935293,0.001365,0.002731,True
2,t-test,hier-gcn vs mlcf,0.635903,0.011032,0.43198,0.066223,6.846021,0.002382,0.002731,True


In [32]:
# Run the comparison across low-resource settings for the ACSA task
# (computeLowResourceStatistics is defined outside this cell; it reads its inputs from `args`).
args.task = 'acsa'

computeLowResourceStatistics(args)

Unnamed: 0,1000,500,full
1,0.8005,0.7713,0.8038
2,0.8162,0.7537,0.8266
3,0.8015,0.7572,0.7688
4,0.8009,0.8155,0.8351
5,0.747,0.7837,0.8015


Unnamed: 0,W,pval,normal
1000,0.751384,0.030653,False


Unnamed: 0,W,pval,normal
500,0.904294,0.434072,True


Unnamed: 0,W,pval,normal
full,0.939782,0.664398,True


    split prompt      f1
0       1   1000  0.8005
1       2   1000  0.8162
2       3   1000  0.8015
3       4   1000  0.8009
4       5   1000  0.7470
5       1    500  0.7713
6       2    500  0.7537
7       3    500  0.7572
8       4    500  0.8155
9       5    500  0.7837
10      1   full  0.8038
11      2   full  0.8266
12      3   full  0.7688
13      4   full  0.8351
14      5   full  0.8015
Friedman Test Result:
          Source     W  ddof1    Q     p-unc
Friedman  Within  0.52      2  5.2  0.074274
Results for LR-Comparison of :  short


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,wilcoxon,1000 vs 500,0.79322,0.023853,0.77628,0.022325,4.0,0.4375,0.625,False
1,wilcoxon,1000 vs full,0.79322,0.023853,0.80716,0.023123,3.0,0.3125,0.625,False
2,wilcoxon,500 vs full,0.77628,0.022325,0.80716,0.023123,0.0,0.0625,0.1875,False


Unnamed: 0,1000,500,full
1,0.7934,0.7619,0.8407
2,0.8026,0.7784,0.8242
3,0.799,0.7551,0.7827
4,0.8029,0.759,0.8237
5,0.8111,0.7807,0.7638


Unnamed: 0,W,pval,normal
1000,0.96814,0.863187,True


Unnamed: 0,W,pval,normal
500,0.857079,0.217951,True


Unnamed: 0,W,pval,normal
full,0.896665,0.391727,True


    split prompt      f1
0       1   1000  0.7934
1       2   1000  0.8026
2       3   1000  0.7990
3       4   1000  0.8029
4       5   1000  0.8111
5       1    500  0.7619
6       2    500  0.7784
7       3    500  0.7551
8       4    500  0.7590
9       5    500  0.7807
10      1   full  0.8407
11      2   full  0.8242
12      3   full  0.7827
13      4   full  0.8237
14      5   full  0.7638
Repeated Measures ANOVA Result:
   Source  ddof1  ddof2         F     p-unc       ng2       eps
0  prompt      2      8  4.998058  0.039052  0.491937  0.542371
Results for LR-Comparison of :  long


  W = np.prod(eig) / (eig.sum() / d) ** d


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,t-test,1000 vs 500,0.8018,0.005775,0.76702,0.010481,8.859486,0.000896,0.002689,True
1,t-test,1000 vs full,0.8018,0.005775,0.80702,0.028869,-0.314686,0.768732,0.768732,False
2,t-test,500 vs full,0.76702,0.010481,0.80702,0.028869,-2.40325,0.074095,0.14819,False


Unnamed: 0,1000,500,full
1,0.799,0.731,0.7755
2,0.8162,0.747,0.8108
3,0.7729,0.7782,0.7809
4,0.8126,0.7588,0.8065
5,0.7955,0.7474,0.7965


Unnamed: 0,W,pval,normal
1000,0.922351,0.545216,True


Unnamed: 0,W,pval,normal
500,0.960821,0.813691,True


Unnamed: 0,W,pval,normal
full,0.916515,0.507713,True


    split prompt      f1
0       1   1000  0.7990
1       2   1000  0.8162
2       3   1000  0.7729
3       4   1000  0.8126
4       5   1000  0.7955
5       1    500  0.7310
6       2    500  0.7470
7       3    500  0.7782
8       4    500  0.7588
9       5    500  0.7474
10      1   full  0.7755
11      2   full  0.8108
12      3   full  0.7809
13      4   full  0.8065
14      5   full  0.7965
Repeated Measures ANOVA Result:
   Source  ddof1  ddof2          F     p-unc       ng2      eps
0  prompt      2      8  12.367813  0.003567  0.662145  0.57425
Results for LR-Comparison of :  cot


  W = np.prod(eig) / (eig.sum() / d) ** d


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,t-test,1000 vs 500,0.79924,0.015323,0.75248,0.015612,3.43032,0.026528,0.053056,False
1,t-test,1000 vs full,0.79924,0.015323,0.79404,0.013846,0.993028,0.376904,0.376904,False
2,t-test,500 vs full,0.75248,0.015612,0.79404,0.013846,-4.048222,0.015498,0.046495,True


Unnamed: 0,1000,500,full
1,0.7934,0.7713,0.8038
2,0.8026,0.7537,0.8266
3,0.799,0.7572,0.7688
4,0.8029,0.8155,0.8351
5,0.8111,0.7837,0.8015


Unnamed: 0,W,pval,normal
1000,0.96814,0.863187,True


Unnamed: 0,W,pval,normal
500,0.904294,0.434072,True


Unnamed: 0,W,pval,normal
full,0.939782,0.664398,True


    split prompt      f1
0       1   1000  0.7934
1       2   1000  0.8026
2       3   1000  0.7990
3       4   1000  0.8029
4       5   1000  0.8111
5       1    500  0.7713
6       2    500  0.7537
7       3    500  0.7572
8       4    500  0.8155
9       5    500  0.7837
10      1   full  0.8038
11      2   full  0.8266
12      3   full  0.7688
13      4   full  0.8351
14      5   full  0.8015
Repeated Measures ANOVA Result:
   Source  ddof1  ddof2        F     p-unc       ng2       eps
0  prompt      2      8  4.47559  0.049609  0.338017  0.994903
Results for LR-Comparison of best Prompt per LR-Setting


  W = np.prod(eig) / (eig.sum() / d) ** d


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,t-test,1000 vs 500,0.8018,0.005775,0.77628,0.022325,2.390152,0.075155,0.150311,False
1,t-test,1000 vs full,0.8018,0.005775,0.80716,0.023123,-0.47178,0.661668,0.661668,False
2,t-test,500 vs full,0.77628,0.022325,0.80716,0.023123,-2.796691,0.048977,0.14693,False


## E2E

In [33]:
# Collect the LLM runs for the E2E task. Every sub-folder of RESULTS_PATH is
# one run; its underscore-separated name encodes the run configuration (the
# first 19 entries of col_names) and it contains the metrics file read below.
runs = []
RESULTS_PATH = '../results_final/'
col_names = ['model_lang', 'dataset', 'model_shots', 'model_prompt', 'model_task', 'lr', 'lora_r', 'lora_alpha', 'lora_dropout', 'model_quant', 'split', 'lr_setting', 'model_name', 'lang', 'shots', 'prompt', 'task', 'quant', 'epoch', 'model_config', 'path', 'f1-micro', 'f1-macro', 'accuracy']
folder_names = [folder for folder in os.listdir(RESULTS_PATH) if os.path.isdir(os.path.join(RESULTS_PATH, folder)) and folder != '.ipynb_checkpoints']

for folder_name in folder_names:
    try:
        df = pd.read_csv(os.path.join(RESULTS_PATH, folder_name, 'metrics_pol.tsv'), sep = '\t')
        df = df.set_index(df.columns[0])
        cond_parameters = folder_name.split('_')

        # Configuration string shared by all splits/epochs of one condition:
        # drop the split field (position 10) first, then the trailing epoch field.
        model_config = cond_parameters.copy()
        del model_config[10]
        del model_config[-1]

        cond_parameters.extend([
            '_'.join(model_config),
            folder_name,  # the full folder name identifies the individual run ('path' column)
            df.loc['Micro-AVG', 'f1'],
            df.loc['Macro-AVG', 'f1'],
            df.loc['Micro-AVG', 'accuracy'],
        ])
        runs.append(cond_parameters)
    except (OSError, KeyError, IndexError, ValueError):
        # Best effort: skip folders without a readable E2E metrics file or with
        # an unexpected name/table layout (pandas parser errors are ValueErrors).
        # Unlike a bare ``except`` this lets KeyboardInterrupt/SystemExit through.
        continue

results_all = pd.DataFrame(runs, columns = col_names)


###
# Baselines
##

# InstructABSA
METHOD = 'instructAbsa'
RESULTS_PATH = '../../../ABSA-Baselines/InstructABSA-Custom/Output_ref'
runs = []
col_names = ['task', 'method', 'dataset', 'lr-setting', 'split', 'learning-rate', 'batch_size', 'epochs', 'f1-micro', 'f1-macro', 'accuracy']

# One '<run parameters>.tsv' metrics file per run. Matching on the suffix keeps
# out names that merely contain '.tsv' (e.g. backup files such as 'x.tsv.bak').
file_names = [file for file in os.listdir(RESULTS_PATH) if file.endswith('.tsv')]

for file_name in file_names:
    metrics_dict = {}

    # Each line of the metrics file is a tab-separated "<metric>\t<value>" pair.
    with open(os.path.join(RESULTS_PATH, file_name), 'r') as file:
        for line in file:
            line = line.strip()
            if not line:
                # Tolerate blank lines (e.g. a trailing newline at the end of the file).
                continue
            key, value = line.split('\t')
            metrics_dict[key.strip()] = float(value.strip())

    # The file name without its '.tsv' suffix carries the run parameters.
    cond_name = file_name[:-len('.tsv')]
    cond_parameters = cond_name.split('_')

    cond_parameters.append(metrics_dict['F1-Score'])
    # Only the F1 score is read here; f1-macro and accuracy stay empty.
    cond_parameters.extend([None, None])
    cond_parameters.insert(0, 'e2e')   # Task
    cond_parameters.insert(1, METHOD)  # Method
    cond_parameters.insert(6, 8)       # Batch size (not part of the file name)

    # Use '0' for the full-data setting, as in the other result tables.
    if cond_parameters[3] == 'full':
        cond_parameters[3] = '0'

    runs.append(cond_parameters)


# TAS-BERT

METHOD = 'tas-bert'
RESULTS_PATH = '../../../ABSA-Baselines/TAS-BERT-Custom/results/rest-16/three_joint/BIO'

col_names = ['task', 'method', 'dataset', 'lr-setting', 'split', 'learning-rate', 'batch_size', 'epochs', 'f1-micro', 'f1-macro', 'accuracy']

# Keep only run directories; a stray file would otherwise break open() below.
folder_names = [folder for folder in os.listdir(RESULTS_PATH) if os.path.isdir(os.path.join(RESULTS_PATH, folder)) and folder != '.ipynb_checkpoints']

for folder_name in folder_names:
    with open(os.path.join(RESULTS_PATH, folder_name, 'results.txt'), 'r') as file:
        # Drop blank lines so a trailing newline cannot hide the final record.
        records = [line.strip() for line in file if line.strip()]

    # The last record has four tab-separated fields (epoch, p, r, f1); only f1 is used.
    _, _, _, f1 = records[-1].split('\t')

    # The underscore-separated folder name carries the run parameters.
    cond_parameters = folder_name.split('_')

    cond_parameters.append(float(f1))
    # f1-macro and accuracy are not read for this baseline.
    cond_parameters.extend([None, None])
    cond_parameters.insert(0, 'e2e')   # Task
    cond_parameters.insert(1, METHOD)  # Method

    # Use '0' for the full-data setting, as in the other result tables.
    if cond_parameters[3] == 'full':
        cond_parameters[3] = '0'

    runs.append(cond_parameters)


# `runs` also contains the rows collected for the other baseline above.
results_baseline = pd.DataFrame(runs, columns = col_names)

args.results = results_all
args.results_baseline = results_baseline

### Full Dataset

In [34]:
# E2E on the full training set: computePromptStatistics maps lr_setting 0 to 'full'.
args.task, args.lr_setting = 'e2e', 0

stats_e2e['0'] = computePromptStatistics(args)
stats_e2e['0']

   task        method  dataset lr-setting split learning-rate batch_size  \
4   e2e  instructAbsa  rest-16          0     2         5e-05          8   
39  e2e  instructAbsa  rest-16          0     4         5e-05          8   
7   e2e  instructAbsa  rest-16          0     5         5e-05          8   
14  e2e  instructAbsa  rest-16          0     3         5e-05          8   
1   e2e  instructAbsa  rest-16          0     1         5e-05          8   
51  e2e      tas-bert  rest-16          0     4         2e-05         24   
42  e2e      tas-bert  rest-16          0     3         2e-05         24   
53  e2e      tas-bert  rest-16          0     1         2e-05         24   
47  e2e      tas-bert  rest-16          0     5         2e-05         24   
46  e2e      tas-bert  rest-16          0     2         2e-05         24   

   epochs  f1-micro f1-macro accuracy  
4     4.0  0.767943     None     None  
39    4.0  0.764777     None     None  
7     4.0  0.760049     None     None  
14 

Unnamed: 0,short,long,cot,instructAbsa,tas-bert
1,0.7434,0.7594,0.7633,0.741259,0.7132
2,0.7986,0.8103,0.7761,0.767943,0.6883
3,0.8067,0.785,0.7807,0.747549,0.7219
4,0.837,0.8235,0.8103,0.764777,0.7223
5,0.774,0.8227,0.7779,0.760049,0.7025


Unnamed: 0,W,pval,normal
long,0.87736,0.297517,True


Unnamed: 0,W,pval,normal
instructAbsa,0.914237,0.493465,True


Unnamed: 0,W,pval,normal
tas-bert,0.89703,0.393689,True


    split        prompt        f1
0       1          long  0.759400
1       2          long  0.810300
2       3          long  0.785000
3       4          long  0.823500
4       5          long  0.822700
5       1  instructAbsa  0.741259
6       2  instructAbsa  0.767943
7       3  instructAbsa  0.747549
8       4  instructAbsa  0.764777
9       5  instructAbsa  0.760049
10      1      tas-bert  0.713200
11      2      tas-bert  0.688300
12      3      tas-bert  0.721900
13      4      tas-bert  0.722300
14      5      tas-bert  0.702500
Repeated Measures ANOVA Result:
   Source  ddof1  ddof2          F     p-unc       ng2       eps
0  prompt      2      8  30.780225  0.000175  0.823271  0.620962


  W = np.prod(eig) / (eig.sum() / d) ** d


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,t-test,long vs instructAbsa,0.80018,0.024685,0.756315,0.01024,5.485648,0.005379,0.012349,True
1,t-test,long vs tas-bert,0.80018,0.024685,0.70964,0.012881,5.904877,0.004116,0.012349,True
2,t-test,instructAbsa vs tas-bert,0.756315,0.01024,0.70964,0.012881,4.654294,0.009631,0.012349,True


### 1000

In [35]:
# E2E, low-resource setting '1000'.
args.task, args.lr_setting = 'e2e', 1000

stats_e2e['1000'] = computePromptStatistics(args)
stats_e2e['1000']

   task        method  dataset lr-setting split learning-rate batch_size  \
33  e2e  instructAbsa  rest-16       1000     3         5e-05          8   
32  e2e  instructAbsa  rest-16       1000     1         5e-05          8   
2   e2e  instructAbsa  rest-16       1000     2         5e-05          8   
24  e2e  instructAbsa  rest-16       1000     4         5e-05          8   
22  e2e  instructAbsa  rest-16       1000     5         5e-05          8   
55  e2e      tas-bert  rest-16       1000     4         2e-05         24   
48  e2e      tas-bert  rest-16       1000     1         2e-05         24   
52  e2e      tas-bert  rest-16       1000     5         2e-05         24   
50  e2e      tas-bert  rest-16       1000     2         2e-05         24   
54  e2e      tas-bert  rest-16       1000     3         2e-05         24   

   epochs  f1-micro f1-macro accuracy  
33      7  0.763682     None     None  
32      7  0.760664     None     None  
2       7  0.745238     None     None  
24 

Unnamed: 0,short,long,cot,instructAbsa,tas-bert
1,0.7372,0.7485,0.7055,0.760664,0.6834
2,0.7985,0.7527,0.7305,0.745238,0.674
3,0.7769,0.7442,0.7175,0.763682,0.6349
4,0.7531,0.7875,0.7612,0.74313,0.705
5,0.7919,0.7951,0.7126,0.73494,0.6822


Unnamed: 0,W,pval,normal
short,0.933575,0.620945,True


Unnamed: 0,W,pval,normal
instructAbsa,0.912883,0.485108,True


Unnamed: 0,W,pval,normal
tas-bert,0.90863,0.459404,True


    split        prompt        f1
0       1         short  0.737200
1       2         short  0.798500
2       3         short  0.776900
3       4         short  0.753100
4       5         short  0.791900
5       1  instructAbsa  0.760664
6       2  instructAbsa  0.745238
7       3  instructAbsa  0.763682
8       4  instructAbsa  0.743130
9       5  instructAbsa  0.734940
10      1      tas-bert  0.683400
11      2      tas-bert  0.674000
12      3      tas-bert  0.634900
13      4      tas-bert  0.705000
14      5      tas-bert  0.682200
Repeated Measures ANOVA Result:
   Source  ddof1  ddof2          F     p-unc       ng2       eps
0  prompt      2      8  18.311226  0.001033  0.809212  0.910393


  W = np.prod(eig) / (eig.sum() / d) ** d


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,t-test,short vs instructAbsa,0.77152,0.023192,0.749531,0.010922,1.467867,0.216051,0.216051,False
1,t-test,short vs tas-bert,0.77152,0.023192,0.6759,0.022923,5.042956,0.007267,0.021801,True
2,t-test,instructAbsa vs tas-bert,0.749531,0.010922,0.6759,0.022923,4.774141,0.008813,0.021801,True


### 500

In [36]:
# E2E, low-resource setting '500'.
args.task, args.lr_setting = 'e2e', 500

stats_e2e['500'] = computePromptStatistics(args)
stats_e2e['500']

   task        method  dataset lr-setting split learning-rate batch_size  \
16  e2e  instructAbsa  rest-16        500     1         5e-05          8   
36  e2e  instructAbsa  rest-16        500     4         5e-05          8   
19  e2e  instructAbsa  rest-16        500     2         5e-05          8   
5   e2e  instructAbsa  rest-16        500     3         5e-05          8   
20  e2e  instructAbsa  rest-16        500     5         5e-05          8   
49  e2e      tas-bert  rest-16        500     4         2e-05         24   
44  e2e      tas-bert  rest-16        500     3         2e-05         24   
57  e2e      tas-bert  rest-16        500     5         2e-05         24   
41  e2e      tas-bert  rest-16        500     2         2e-05         24   
40  e2e      tas-bert  rest-16        500     1         2e-05         24   

   epochs  f1-micro f1-macro accuracy  
16     14  0.772563     None     None  
36     14  0.752969     None     None  
19     14  0.752116     None     None  
5  

Unnamed: 0,short,long,cot,instructAbsa,tas-bert
1,0.6486,0.6512,0.6322,0.772563,0.5622
2,0.7085,0.7546,0.7139,0.752116,0.6037
3,0.7415,0.7419,0.5722,0.739454,0.6074
4,0.6145,0.5988,0.6933,0.752969,0.6102
5,0.7631,0.7359,0.691,0.726176,0.6069


Unnamed: 0,W,pval,normal
long,0.846621,0.184073,True


Unnamed: 0,W,pval,normal
instructAbsa,0.97497,0.906084,True


Unnamed: 0,W,pval,normal
tas-bert,0.659647,0.00345,False


    split        prompt        f1
0       1          long  0.651200
1       2          long  0.754600
2       3          long  0.741900
3       4          long  0.598800
4       5          long  0.735900
5       1  instructAbsa  0.772563
6       2  instructAbsa  0.752116
7       3  instructAbsa  0.739454
8       4  instructAbsa  0.752969
9       5  instructAbsa  0.726176
10      1      tas-bert  0.562200
11      2      tas-bert  0.603700
12      3      tas-bert  0.607400
13      4      tas-bert  0.610200
14      5      tas-bert  0.606900
Friedman Test Result:
          Source     W  ddof1    Q     p-unc
Friedman  Within  0.48      2  4.8  0.090718


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,wilcoxon,long vs instructAbsa,0.69648,0.06097,0.748656,0.015441,6.0,0.8125,0.8125,False
1,wilcoxon,long vs tas-bert,0.69648,0.06097,0.59808,0.018058,1.0,0.125,0.25,False
2,wilcoxon,instructAbsa vs tas-bert,0.748656,0.015441,0.59808,0.018058,0.0,0.0625,0.1875,False


In [37]:
# Run the comparison across low-resource settings for the E2E task
# (computeLowResourceStatistics is defined outside this cell; it reads its inputs from `args`).
args.task = 'e2e'

computeLowResourceStatistics(args)

Unnamed: 0,1000,500,full
1,0.7372,0.6486,0.7434
2,0.7985,0.7085,0.7986
3,0.7769,0.7415,0.8067
4,0.7531,0.6145,0.837
5,0.7919,0.7631,0.774


Unnamed: 0,W,pval,normal
1000,0.933575,0.620945,True


Unnamed: 0,W,pval,normal
500,0.940534,0.669716,True


Unnamed: 0,W,pval,normal
full,0.988693,0.974917,True


    split prompt      f1
0       1   1000  0.7372
1       2   1000  0.7985
2       3   1000  0.7769
3       4   1000  0.7531
4       5   1000  0.7919
5       1    500  0.6486
6       2    500  0.7085
7       3    500  0.7415
8       4    500  0.6145
9       5    500  0.7631
10      1   full  0.7434
11      2   full  0.7986
12      3   full  0.8067
13      4   full  0.8370
14      5   full  0.7740
Repeated Measures ANOVA Result:
   Source  ddof1  ddof2         F     p-unc       ng2      eps
0  prompt      2      8  8.080671  0.012019  0.527435  0.55893
Results for LR-Comparison of :  short


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,t-test,1000 vs 500,0.77152,0.023192,0.69524,0.055884,3.778895,0.019454,0.058363,False
1,t-test,1000 vs full,0.77152,0.023192,0.79194,0.03153,-1.159663,0.310692,0.310692,False
2,t-test,500 vs full,0.69524,0.055884,0.79194,0.03153,-2.778397,0.0499,0.0998,False


Unnamed: 0,1000,500,full
1,0.7485,0.6512,0.7594
2,0.7527,0.7546,0.8103
3,0.7442,0.7419,0.785
4,0.7875,0.5988,0.8235
5,0.7951,0.7359,0.8227


Unnamed: 0,W,pval,normal
1000,0.824998,0.127529,True


Unnamed: 0,W,pval,normal
500,0.846621,0.184073,True


Unnamed: 0,W,pval,normal
full,0.87736,0.297517,True


    split prompt      f1
0       1   1000  0.7485
1       2   1000  0.7527
2       3   1000  0.7442
3       4   1000  0.7875
4       5   1000  0.7951
5       1    500  0.6512
6       2    500  0.7546
7       3    500  0.7419
8       4    500  0.5988
9       5    500  0.7359
10      1   full  0.7594
11      2   full  0.8103
12      3   full  0.7850
13      4   full  0.8235
14      5   full  0.8227
Repeated Measures ANOVA Result:
   Source  ddof1  ddof2         F     p-unc       ng2       eps
0  prompt      2      8  7.147704  0.016577  0.538418  0.533623
Results for LR-Comparison of :  long


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,t-test,1000 vs 500,0.7656,0.021292,0.69648,0.06097,1.96682,0.120608,0.120608,False
1,t-test,1000 vs full,0.7656,0.021292,0.80018,0.024685,-4.501765,0.010808,0.032424,True
2,t-test,500 vs full,0.69648,0.06097,0.80018,0.024685,-3.206118,0.03271,0.065419,False


Unnamed: 0,1000,500,full
1,0.7055,0.6322,0.7633
2,0.7305,0.7139,0.7761
3,0.7175,0.5722,0.7807
4,0.7612,0.6933,0.8103
5,0.7126,0.691,0.7779


Unnamed: 0,W,pval,normal
1000,0.885572,0.335326,True


Unnamed: 0,W,pval,normal
500,0.881799,0.317541,True


Unnamed: 0,W,pval,normal
full,0.86961,0.26485,True


    split prompt      f1
0       1   1000  0.7055
1       2   1000  0.7305
2       3   1000  0.7175
3       4   1000  0.7612
4       5   1000  0.7126
5       1    500  0.6322
6       2    500  0.7139
7       3    500  0.5722
8       4    500  0.6933
9       5    500  0.6910
10      1   full  0.7633
11      2   full  0.7761
12      3   full  0.7807
13      4   full  0.8103
14      5   full  0.7779
Repeated Measures ANOVA Result:
   Source  ddof1  ddof2          F     p-unc       ng2       eps
0  prompt      2      8  18.812014  0.000945  0.688999  0.515429
Results for LR-Comparison of :  cot


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,t-test,1000 vs 500,0.72546,0.019646,0.66052,0.051876,2.80087,0.048768,0.048768,True
1,t-test,1000 vs full,0.72546,0.019646,0.78166,0.015513,-14.580444,0.000129,0.000386,True
2,t-test,500 vs full,0.66052,0.051876,0.78166,0.015513,-4.866196,0.008242,0.016485,True


Unnamed: 0,1000,500,full
1,0.7372,0.6512,0.7594
2,0.7985,0.7546,0.8103
3,0.7769,0.7419,0.785
4,0.7531,0.5988,0.8235
5,0.7919,0.7359,0.8227


Unnamed: 0,W,pval,normal
1000,0.933575,0.620945,True


Unnamed: 0,W,pval,normal
500,0.846621,0.184073,True


Unnamed: 0,W,pval,normal
full,0.87736,0.297517,True


    split prompt      f1
0       1   1000  0.7372
1       2   1000  0.7985
2       3   1000  0.7769
3       4   1000  0.7531
4       5   1000  0.7919
5       1    500  0.6512
6       2    500  0.7546
7       3    500  0.7419
8       4    500  0.5988
9       5    500  0.7359
10      1   full  0.7594
11      2   full  0.8103
12      3   full  0.7850
13      4   full  0.8235
14      5   full  0.8227
Repeated Measures ANOVA Result:
   Source  ddof1  ddof2          F     p-unc       ng2       eps
0  prompt      2      8  10.505473  0.005782  0.541077  0.507507
Results for LR-Comparison of best Prompt per LR-Setting


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,t-test,1000 vs 500,0.77152,0.023192,0.69648,0.06097,3.472866,0.025517,0.076551,False
1,t-test,1000 vs full,0.77152,0.023192,0.80018,0.024685,-2.565732,0.062259,0.076551,False
2,t-test,500 vs full,0.69648,0.06097,0.80018,0.024685,-3.206118,0.03271,0.076551,False


## E2E - without Implicit

In [38]:
# Collect the polarity metrics ('metrics_pol.tsv') of every run folder below
# RESULTS_PATH. A run folder name encodes the run configuration as
# '_'-separated fields that line up with the first entries of col_names.
runs = []
skipped_runs = []  # (folder name, error) for every folder that could not be parsed
RESULTS_PATH = '../results_final/filtered/'
col_names = ['model_lang', 'dataset', 'model_shots', 'model_prompt', 'model_task', 'lr', 'lora_r', 'lora_alpha', 'lora_dropout', 'model_quant', 'split', 'lr_setting', 'model_name', 'lang', 'shots', 'prompt', 'task', 'quant', 'epoch', 'model_config', 'path', 'f1-micro', 'f1-macro', 'accuracy']
folder_names = [folder for folder in os.listdir(RESULTS_PATH) if os.path.isdir(os.path.join(RESULTS_PATH, folder)) and folder != '.ipynb_checkpoints']

for folder_name in folder_names:
    try:
        df = pd.read_csv(os.path.join(RESULTS_PATH, folder_name, 'metrics_pol.tsv'), sep = '\t')
        df = df.set_index(df.columns[0])
        cond_name = folder_name.split('/')[-1]
        cond_parameters = cond_name.split('_')

        # 'path' column: the complete configuration string of this run
        model_config_full = cond_parameters.copy()

        # 'model_config' column: configuration without the split field
        # (position 10) and without the trailing epoch field, so that all
        # splits of the same configuration share one identifier
        model_config = cond_parameters.copy()
        model_config.pop(10)
        model_config.pop(-1)

        cond_parameters.append('_'.join(model_config))
        cond_parameters.append('_'.join(model_config_full))
        cond_parameters.append(df.loc['Micro-AVG', 'f1'])
        cond_parameters.append(df.loc['Macro-AVG', 'f1'])
        cond_parameters.append(df.loc['Micro-AVG', 'accuracy'])
        runs.append(cond_parameters)
    except (OSError, KeyError, IndexError, ValueError) as err:
        # Best effort: folders without a readable metrics file (OSError), with an
        # empty/malformed file (pandas parser errors derive from ValueError), with
        # missing metric rows (KeyError) or with too few name fields (IndexError)
        # are skipped, but remembered so missing runs can be inspected afterwards.
        skipped_runs.append((folder_name, err))

results_all = pd.DataFrame(runs, columns = col_names)


###
# Baselines
##

# InstructABSA
# Reads one metrics file per run and appends a row in col_names order to `runs`.
METHOD = 'instructAbsa'
RESULTS_PATH = '../../../ABSA-Baselines/InstructABSA-Custom/Output_filtered'
runs = []

col_names = ['task', 'method', 'dataset', 'lr-setting', 'split', 'learning-rate', 'batch_size', 'epochs', 'f1-micro', 'f1-macro', 'accuracy']

# Every directory entry whose name contains '.tsv' is treated as a metrics file
file_names = [file for file in os.listdir(RESULTS_PATH) if len(file.split('.tsv')) > 1 and file != '.ipynb_checkpoints']

for file_name in file_names:
    metrics_dict = {}

    with open(os.path.join(RESULTS_PATH, file_name), 'r') as file:
        for line in file:
            line = line.strip()
            # Skip blank lines (e.g. a trailing empty line) instead of failing on the unpack below
            if not line:
                continue
            # Each remaining line is a tab-separated "<metric name>\t<value>" pair
            key, value = line.split('\t')
            # Convert the value to a float and store it in the dictionary
            metrics_dict[key.strip()] = float(value.strip())

    # The file name (without '.tsv') carries the '_'-separated run parameters
    cond_name = file_name.split('.tsv')[0]
    cond_parameters = cond_name.split('_')

    cond_parameters.append(metrics_dict['F1-Score'])
    # No macro-F1 / accuracy is read for this baseline
    cond_parameters.extend([None,None])
    cond_parameters.insert(0, 'e2e')   # Task
    cond_parameters.insert(1, METHOD)  # Method
    cond_parameters.insert(6, 8)       # Batch Size (constant, not taken from the file name)

    # Position 3 is 'lr-setting'; the full-data setting is stored as '0'
    if cond_parameters[3] == 'full':
        cond_parameters[3] = '0'

    runs.append(cond_parameters)


# TAS-BERT
# Appends one row per run folder to the `runs` list created for the baselines above.

METHOD = 'tas-bert'
RESULTS_PATH = '../../../ABSA-Baselines/TAS-BERT-Custom/results_filtered/rest-16/three_joint/BIO'

col_names = ['task', 'method', 'dataset', 'lr-setting', 'split', 'learning-rate', 'batch_size', 'epochs', 'f1-micro', 'f1-macro', 'accuracy']

folder_names = [file for file in os.listdir(RESULTS_PATH) if file != '.ipynb_checkpoints']

for folder_name in folder_names:
    with open(os.path.join(RESULTS_PATH, folder_name, 'results.txt'), 'r') as file:
        # Drop blank lines so that a trailing empty line cannot hide the last result row
        lines = [line for line in file if line.strip()]

    # Only the F1 value of the last row is used; the row must have exactly four
    # tab-separated fields (p / r presumably precision and recall - TODO confirm)
    epoch, p, r, f1 = lines[-1].strip().split('\t')

    # The folder name carries the '_'-separated run parameters
    cond_parameters = folder_name.split('_')

    cond_parameters.append(float(f1))
    # No macro-F1 / accuracy is read for this baseline
    cond_parameters.extend([None,None])
    cond_parameters.insert(0, 'e2e')   # Task
    cond_parameters.insert(1, METHOD)  # Method

    # Position 3 is 'lr-setting'; the full-data setting is stored as '0'
    if cond_parameters[3] == 'full':
        cond_parameters[3] = '0'

    runs.append(cond_parameters)


# Combine all baseline rows into one frame. 'lr-setting' is forced to str
# because computePromptStatistics filters it against str(args.lr_setting);
# this mirrors the normalisation done for the ACSD baselines.
results_baseline = pd.DataFrame(runs, columns = col_names)
results_baseline['lr-setting'] = results_baseline['lr-setting'].astype(str)

# Hand both result tables to the statistics helpers via the shared args namespace
args.results = results_all
args.results_baseline = results_baseline

### Full Dataset

In [39]:
# Full-data setting: computePromptStatistics maps lr_setting == 0 to 'full'
# for the LLM results and to '0' for the baseline results.
args.lr_setting = 0
args.task = 'e2e'

computePromptStatistics(args)

   task        method  dataset lr-setting split learning-rate batch_size  \
52  e2e      tas-bert  rest-16          0     3         2e-05         24   
49  e2e      tas-bert  rest-16          0     1         2e-05         24   
45  e2e      tas-bert  rest-16          0     4         2e-05         24   
54  e2e      tas-bert  rest-16          0     2         2e-05         24   
50  e2e      tas-bert  rest-16          0     5         2e-05         24   
1   e2e  instructAbsa  rest-16          0     1         5e-05          8   
4   e2e  instructAbsa  rest-16          0     2         5e-05          8   
14  e2e  instructAbsa  rest-16          0     3         5e-05          8   
7   e2e  instructAbsa  rest-16          0     5         5e-05          8   
39  e2e  instructAbsa  rest-16          0     4         5e-05          8   

   epochs  f1-micro f1-macro accuracy  
52   22.0  0.742300     None     None  
49   22.0  0.741900     None     None  
45   22.0  0.707100     None     None  
54 

Unnamed: 0,short,long,cot,instructAbsa,tas-bert
1,0.8093,0.7757,0.8147,0.677804,0.7419
2,0.7654,0.7583,0.7953,0.647413,0.6933
3,0.7813,0.7819,0.7974,0.636132,0.7423
4,0.8328,0.7728,0.8099,0.602203,0.7071
5,0.8046,0.755,0.7763,0.60261,0.6876


Unnamed: 0,W,pval,normal
cot,0.941126,0.673908,True


Unnamed: 0,W,pval,normal
instructAbsa,0.910112,0.468263,True


Unnamed: 0,W,pval,normal
tas-bert,0.840393,0.166011,True


    split        prompt        f1
0       1           cot  0.814700
1       2           cot  0.795300
2       3           cot  0.797400
3       4           cot  0.809900
4       5           cot  0.776300
5       1  instructAbsa  0.677804
6       2  instructAbsa  0.647413
7       3  instructAbsa  0.636132
8       4  instructAbsa  0.602203
9       5  instructAbsa  0.602610
10      1      tas-bert  0.741900
11      2      tas-bert  0.693300
12      3      tas-bert  0.742300
13      4      tas-bert  0.707100
14      5      tas-bert  0.687600
Repeated Measures ANOVA Result:
   Source  ddof1  ddof2           F     p-unc       ng2      eps
0  prompt      2      8  111.148677  0.000001  0.898389  0.90079


  W = np.prod(eig) / (eig.sum() / d) ** d


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,t-test,cot vs instructAbsa,0.79872,0.01339,0.633232,0.028624,13.525582,0.000173,0.000519,True
1,t-test,cot vs tas-bert,0.79872,0.01339,0.71444,0.023458,9.251523,0.000759,0.001518,True
2,t-test,instructAbsa vs tas-bert,0.633232,0.028624,0.71444,0.023458,-6.9395,0.002265,0.002265,True


### 1000

In [40]:
# Low-resource setting: selects the runs whose lr_setting is '1000'
# (presumably 1000 training examples - TODO confirm).
args.lr_setting = 1000
args.task = 'e2e'

computePromptStatistics(args)

   task        method  dataset lr-setting split learning-rate batch_size  \
48  e2e      tas-bert  rest-16       1000     4         2e-05         24   
56  e2e      tas-bert  rest-16       1000     3         2e-05         24   
57  e2e      tas-bert  rest-16       1000     1         2e-05         24   
40  e2e      tas-bert  rest-16       1000     2         2e-05         24   
32  e2e  instructAbsa  rest-16       1000     1         5e-05          8   
41  e2e      tas-bert  rest-16       1000     5         2e-05         24   
2   e2e  instructAbsa  rest-16       1000     2         5e-05          8   
22  e2e  instructAbsa  rest-16       1000     5         5e-05          8   
24  e2e  instructAbsa  rest-16       1000     4         5e-05          8   
33  e2e  instructAbsa  rest-16       1000     3         5e-05          8   

   epochs  f1-micro f1-macro accuracy  
48   12.0  0.724400     None     None  
56   12.0  0.723900     None     None  
57   12.0  0.702100     None     None  
40 

Unnamed: 0,short,long,cot,instructAbsa,tas-bert
1,0.8099,0.8463,0.7792,0.686916,0.7021
2,0.8013,0.8105,0.6756,0.677262,0.7002
3,0.8094,0.8052,0.7559,0.62406,0.7239
4,0.8129,0.7842,0.7581,0.624849,0.7244
5,0.8111,0.8283,0.7387,0.643373,0.6809


Unnamed: 0,W,pval,normal
long,0.988034,0.972361,True


Unnamed: 0,W,pval,normal
instructAbsa,0.859378,0.226025,True


Unnamed: 0,W,pval,normal
tas-bert,0.896812,0.392516,True


    split        prompt        f1
0       1          long  0.846300
1       2          long  0.810500
2       3          long  0.805200
3       4          long  0.784200
4       5          long  0.828300
5       1  instructAbsa  0.686916
6       2  instructAbsa  0.677262
7       3  instructAbsa  0.624060
8       4  instructAbsa  0.624849
9       5  instructAbsa  0.643373
10      1      tas-bert  0.702100
11      2      tas-bert  0.700200
12      3      tas-bert  0.723900
13      4      tas-bert  0.724400
14      5      tas-bert  0.680900
Repeated Measures ANOVA Result:
   Source  ddof1  ddof2          F     p-unc       ng2       eps
0  prompt      2      8  57.217872  0.000018  0.908202  0.699027


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,t-test,long vs instructAbsa,0.8149,0.021078,0.651292,0.026256,17.641666,6.1e-05,0.000182,True
1,t-test,long vs tas-bert,0.8149,0.021078,0.7063,0.016355,6.32164,0.003204,0.006407,True
2,t-test,instructAbsa vs tas-bert,0.651292,0.026256,0.7063,0.016355,-2.958536,0.041615,0.041615,True


### 500

In [41]:
# Low-resource setting: selects the runs whose lr_setting is '500'
# (presumably 500 training examples - TODO confirm).
args.lr_setting = 500
args.task = 'e2e'

computePromptStatistics(args)

   task        method  dataset lr-setting split learning-rate batch_size  \
47  e2e      tas-bert  rest-16        500     3         2e-05         24   
43  e2e      tas-bert  rest-16        500     1         2e-05         24   
16  e2e  instructAbsa  rest-16        500     1         5e-05          8   
53  e2e      tas-bert  rest-16        500     4         2e-05         24   
19  e2e  instructAbsa  rest-16        500     2         5e-05          8   
36  e2e  instructAbsa  rest-16        500     4         5e-05          8   
20  e2e  instructAbsa  rest-16        500     5         5e-05          8   
51  e2e      tas-bert  rest-16        500     2         2e-05         24   
5   e2e  instructAbsa  rest-16        500     3         5e-05          8   
42  e2e      tas-bert  rest-16        500     5         2e-05         24   

   epochs  f1-micro f1-macro accuracy  
47   28.0  0.726000     None     None  
43   28.0  0.677700     None     None  
16     14  0.670561     None     None  
53 

Unnamed: 0,short,long,cot,instructAbsa,tas-bert
1,0.786,0.7871,0.7781,0.670561,0.6777
2,0.7839,0.787,0.7203,0.661836,0.6255
3,0.7014,0.8106,0.7926,0.623116,0.726
4,0.7631,0.7596,0.7727,0.638554,0.6655
5,0.7742,0.7873,0.7529,0.625616,0.6168


Unnamed: 0,W,pval,normal
long,0.883866,0.327197,True


Unnamed: 0,W,pval,normal
instructAbsa,0.887702,0.345682,True


Unnamed: 0,W,pval,normal
tas-bert,0.938631,0.656276,True


    split        prompt        f1
0       1          long  0.787100
1       2          long  0.787000
2       3          long  0.810600
3       4          long  0.759600
4       5          long  0.787300
5       1  instructAbsa  0.670561
6       2  instructAbsa  0.661836
7       3  instructAbsa  0.623116
8       4  instructAbsa  0.638554
9       5  instructAbsa  0.625616
10      1      tas-bert  0.677700
11      2      tas-bert  0.625500
12      3      tas-bert  0.726000
13      4      tas-bert  0.665500
14      5      tas-bert  0.616800
Repeated Measures ANOVA Result:
   Source  ddof1  ddof2          F     p-unc       ng2       eps
0  prompt      2      8  34.068911  0.000122  0.846679  0.731849


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,t-test,long vs instructAbsa,0.78632,0.016159,0.643936,0.019117,10.29398,0.000502,0.001507,True
1,t-test,long vs tas-bert,0.78632,0.016159,0.6623,0.039318,7.027995,0.00216,0.004319,True
2,t-test,instructAbsa vs tas-bert,0.643936,0.019117,0.6623,0.039318,-0.7804,0.478756,0.478756,False


In [42]:
# Compare the resource settings against each other for the E2E task.
# NOTE(review): computeLowResourceStatistics is defined outside this section;
# it presumably reads args.results as set above - confirm.
args.task = 'e2e'

computeLowResourceStatistics(args)

Unnamed: 0,1000,500,full
1,0.8099,0.786,0.8093
2,0.8013,0.7839,0.7654
3,0.8094,0.7014,0.7813
4,0.8129,0.7631,0.8328
5,0.8111,0.7742,0.8046


Unnamed: 0,W,pval,normal
1000,0.825775,0.129276,True


Unnamed: 0,W,pval,normal
500,0.769106,0.044305,False


Unnamed: 0,W,pval,normal
full,0.976582,0.915563,True


    split prompt      f1
0       1   1000  0.8099
1       2   1000  0.8013
2       3   1000  0.8094
3       4   1000  0.8129
4       5   1000  0.8111
5       1    500  0.7860
6       2    500  0.7839
7       3    500  0.7014
8       4    500  0.7631
9       5    500  0.7742
10      1   full  0.8093
11      2   full  0.7654
12      3   full  0.7813
13      4   full  0.8328
14      5   full  0.8046
Friedman Test Result:
          Source     W  ddof1    Q     p-unc
Friedman  Within  0.64      2  6.4  0.040762
Results for LR-Comparison of :  short


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,wilcoxon,1000 vs 500,0.80892,0.003996,0.76172,0.031233,0.0,0.0625,0.1875,False
1,wilcoxon,1000 vs full,0.80892,0.003996,0.79868,0.023331,3.0,0.3125,0.3125,False
2,wilcoxon,500 vs full,0.76172,0.031233,0.79868,0.023331,1.0,0.125,0.25,False


Unnamed: 0,1000,500,full
1,0.8463,0.7871,0.7757
2,0.8105,0.787,0.7583
3,0.8052,0.8106,0.7819
4,0.7842,0.7596,0.7728
5,0.8283,0.7873,0.755


Unnamed: 0,W,pval,normal
1000,0.988034,0.972361,True


Unnamed: 0,W,pval,normal
500,0.883866,0.327197,True


Unnamed: 0,W,pval,normal
full,0.909621,0.465314,True


    split prompt      f1
0       1   1000  0.8463
1       2   1000  0.8105
2       3   1000  0.8052
3       4   1000  0.7842
4       5   1000  0.8283
5       1    500  0.7871
6       2    500  0.7870
7       3    500  0.8106
8       4    500  0.7596
9       5    500  0.7873
10      1   full  0.7757
11      2   full  0.7583
12      3   full  0.7819
13      4   full  0.7728
14      5   full  0.7550
Repeated Measures ANOVA Result:
   Source  ddof1  ddof2         F     p-unc       ng2       eps
0  prompt      2      8  9.523769  0.007653  0.571905  0.851074
Results for LR-Comparison of :  long


  W = np.prod(eig) / (eig.sum() / d) ** d


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,t-test,1000 vs 500,0.8149,0.021078,0.78632,0.016159,2.672004,0.05569,0.111379,False
1,t-test,1000 vs full,0.8149,0.021078,0.76874,0.010353,3.706936,0.020709,0.062128,False
2,t-test,500 vs full,0.78632,0.016159,0.76874,0.010353,2.06496,0.107844,0.111379,False


Unnamed: 0,1000,500,full
1,0.7792,0.7781,0.8147
2,0.6756,0.7203,0.7953
3,0.7559,0.7926,0.7974
4,0.7581,0.7727,0.8099
5,0.7387,0.7529,0.7763


Unnamed: 0,W,pval,normal
1000,0.864605,0.24526,True


Unnamed: 0,W,pval,normal
500,0.938402,0.65466,True


Unnamed: 0,W,pval,normal
full,0.941126,0.673908,True


    split prompt      f1
0       1   1000  0.7792
1       2   1000  0.6756
2       3   1000  0.7559
3       4   1000  0.7581
4       5   1000  0.7387
5       1    500  0.7781
6       2    500  0.7203
7       3    500  0.7926
8       4    500  0.7727
9       5    500  0.7529
10      1   full  0.8147
11      2   full  0.7953
12      3   full  0.7974
13      4   full  0.8099
14      5   full  0.7763
Repeated Measures ANOVA Result:
   Source  ddof1  ddof2          F     p-unc       ng2      eps
0  prompt      2      8  11.032157  0.005014  0.448012  0.66555
Results for LR-Comparison of :  cot


  W = np.prod(eig) / (eig.sum() / d) ** d


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,t-test,1000 vs 500,0.7415,0.035369,0.76332,0.024992,-2.626412,0.058405,0.074332,False
1,t-test,1000 vs full,0.7415,0.035369,0.79872,0.01339,-3.605613,0.022645,0.067934,False
2,t-test,500 vs full,0.76332,0.024992,0.79872,0.01339,-3.0735,0.037166,0.074332,False


Unnamed: 0,1000,500,full
1,0.8463,0.7871,0.8147
2,0.8105,0.787,0.7953
3,0.8052,0.8106,0.7974
4,0.7842,0.7596,0.8099
5,0.8283,0.7873,0.7763


Unnamed: 0,W,pval,normal
1000,0.988034,0.972361,True


Unnamed: 0,W,pval,normal
500,0.883866,0.327197,True


Unnamed: 0,W,pval,normal
full,0.941126,0.673908,True


    split prompt      f1
0       1   1000  0.8463
1       2   1000  0.8105
2       3   1000  0.8052
3       4   1000  0.7842
4       5   1000  0.8283
5       1    500  0.7871
6       2    500  0.7870
7       3    500  0.8106
8       4    500  0.7596
9       5    500  0.7873
10      1   full  0.8147
11      2   full  0.7953
12      3   full  0.7974
13      4   full  0.8099
14      5   full  0.7763
Repeated Measures ANOVA Result:
   Source  ddof1  ddof2         F     p-unc       ng2       eps
0  prompt      2      8  2.892451  0.113434  0.317096  0.955563
Results for LR-Comparison of best Prompt per LR-Setting


  W = np.prod(eig) / (eig.sum() / d) ** d


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,t-test,1000 vs 500,0.8149,0.021078,0.78632,0.016159,2.672004,0.05569,0.167069,False
1,t-test,1000 vs full,0.8149,0.021078,0.79872,0.01339,1.250929,0.279134,0.558269,False
2,t-test,500 vs full,0.78632,0.016159,0.79872,0.01339,-1.032048,0.360362,0.558269,False


## ACSD

In [43]:
# Collect the phrase-level metrics ('metrics_phrases.tsv') of every run folder
# below RESULTS_PATH. A run folder name encodes the run configuration as
# '_'-separated fields that line up with the first entries of col_names.
runs = []
skipped_runs = []  # (folder name, error) for every folder that could not be parsed
RESULTS_PATH = '../results_final/'
col_names = ['model_lang', 'dataset', 'model_shots', 'model_prompt', 'model_task', 'lr', 'lora_r', 'lora_alpha', 'lora_dropout', 'model_quant', 'split', 'lr_setting', 'model_name', 'lang', 'shots', 'prompt', 'task', 'quant', 'epoch', 'model_config', 'path', 'f1-micro', 'f1-macro', 'accuracy']
folder_names = [folder for folder in os.listdir(RESULTS_PATH) if os.path.isdir(os.path.join(RESULTS_PATH, folder)) and folder != '.ipynb_checkpoints']

for folder_name in folder_names:
    try:
        df = pd.read_csv(os.path.join(RESULTS_PATH, folder_name, 'metrics_phrases.tsv'), sep = '\t')
        df = df.set_index(df.columns[0])
        cond_name = folder_name.split('/')[-1]
        cond_parameters = cond_name.split('_')

        # 'path' column: the complete configuration string of this run
        model_config_full = cond_parameters.copy()

        # 'model_config' column: configuration without the split field
        # (position 10) and without the trailing epoch field, so that all
        # splits of the same configuration share one identifier
        model_config = cond_parameters.copy()
        model_config.pop(10)
        model_config.pop(-1)

        cond_parameters.append('_'.join(model_config))
        cond_parameters.append('_'.join(model_config_full))
        cond_parameters.append(df.loc['Micro-AVG', 'f1'])
        cond_parameters.append(df.loc['Macro-AVG', 'f1'])
        cond_parameters.append(df.loc['Micro-AVG', 'accuracy'])
        runs.append(cond_parameters)
    except (OSError, KeyError, IndexError, ValueError) as err:
        # Best effort: folders without a readable metrics file (OSError), with an
        # empty/malformed file (pandas parser errors derive from ValueError), with
        # missing metric rows (KeyError) or with too few name fields (IndexError)
        # are skipped, but remembered so missing runs can be inspected afterwards.
        skipped_runs.append((folder_name, err))

results_all = pd.DataFrame(runs, columns = col_names)


###
# Baselines
##

# Paraphrase Generation
# Reads 'metrics_phrases.tsv' from every run folder below RESULTS_PATH/METHOD and
# starts the `runs` list that collects the rows of all ACSD baselines.
METHOD = 'para'
RESULTS_PATH = ''

folder_names = [folder for folder in os.listdir(os.path.join(RESULTS_PATH, METHOD)) if os.path.isdir(os.path.join(RESULTS_PATH, METHOD, folder)) and folder != '.ipynb_checkpoints']
runs = []
skipped_baseline_runs = []  # (folder name, error) for every folder that could not be parsed

for folder_name in folder_names:
    try:
        df = pd.read_csv(os.path.join(RESULTS_PATH, METHOD, folder_name, 'metrics_phrases.tsv'), sep = '\t')
        df = df.set_index(df.columns[0])
        cond_name = folder_name.split('/')[-1]
        cond_parameters = cond_name.split('_')

        cond_parameters.append(df.loc['Micro-AVG', 'f1'])
        cond_parameters.append(df.loc['Macro-AVG', 'f1'])
        cond_parameters.append(df.loc['Micro-AVG', 'accuracy'])
        # The method name goes to position 1, directly after the first name field
        cond_parameters.insert(1, METHOD)
        runs.append(cond_parameters)
    except (OSError, KeyError, IndexError, ValueError) as err:
        # Best effort: folders without a readable metrics file (OSError), with an
        # empty/malformed file (pandas parser errors derive from ValueError) or
        # with missing metric rows (KeyError) are skipped, but remembered so
        # missing runs can be inspected afterwards.
        skipped_baseline_runs.append((folder_name, err))

# E2TP
# Appends one row per run folder to the `runs` list of the ACSD baselines.
METHOD = 'e2tp'
RESULTS_PATH = '../../../ABSA-Baselines/E2TP-custom/src/results_ref'

col_names = ['task', 'method', 'dataset', 'lr-setting', 'split', 'learning-rate', 'batch_size', 'epochs', 'f1-micro', 'f1-macro', 'accuracy']

folder_names = [folder for folder in os.listdir(RESULTS_PATH) if os.path.isdir(os.path.join(RESULTS_PATH, folder)) and folder != '.ipynb_checkpoints']

for folder_name in folder_names:
    metrics_dict = {}

    with open(os.path.join(RESULTS_PATH, folder_name, 'results.tsv'), 'r') as file:
        for line in file:
            line = line.strip()
            # Skip blank lines (e.g. a trailing empty line) instead of failing on the unpack below
            if not line:
                continue
            # Each remaining line is a tab-separated "<metric name>\t<value>" pair
            key, value = line.split('\t')
            # Convert the value to a float and store it in the dictionary
            metrics_dict[key.strip()] = float(value.strip())

    # The folder name carries the '_'-separated run parameters
    cond_name = folder_name.split('/')[-1]
    cond_parameters = cond_name.split('_')

    # The stored f1 is divided by 100 to match the 0-1 scale of the other results
    cond_parameters.append(metrics_dict['f1']/100)
    # No macro-F1 / accuracy is read for this baseline
    cond_parameters.extend([None,None])
    cond_parameters.insert(1, METHOD)

    # Position 3 is 'lr-setting'; the full-data setting is stored as the string
    # '0', the same representation the other baseline loaders use
    if cond_parameters[3] == 'full':
        cond_parameters[3] = '0'

    runs.append(cond_parameters)

# Build the baseline table; 'lr-setting' is cast to str because
# computePromptStatistics compares it against str(args.lr_setting).
results_baseline = pd.DataFrame(data = runs, columns = col_names).astype({'lr-setting': str})

# Expose both result tables to the statistics helpers through args
args.results, args.results_baseline = results_all, results_baseline

stats_dfs = dict()

### Full Dataset

In [44]:
# Full-data setting: computePromptStatistics maps lr_setting == 0 to 'full'
# for the LLM results and to '0' for the baseline results.
args.lr_setting = 0
args.task = 'acsd'

# Keep the returned statistics under key '0'; the bare expression on the last
# line is presumably there so the notebook displays the stored value.
stats_acsd['0'] = computePromptStatistics(args)
stats_acsd['0']

    task method  dataset lr-setting split learning-rate batch_size epochs  \
16  acsd   para  rest-16          0     4        0.0003         16     20   
71  acsd   e2tp  rest-16          0     4        0.0001          8     20   
8   acsd   para  rest-16          0     2        0.0003         16     20   
77  acsd   e2tp  rest-16          0     5        0.0001          8     20   
79  acsd   e2tp  rest-16          0     2        0.0001          8     20   
38  acsd   para  rest-16          0     5        0.0003         16     20   
27  acsd   para  rest-16          0     1        0.0003         16     20   
81  acsd   e2tp  rest-16          0     1        0.0001          8     20   
24  acsd   para  rest-16          0     3        0.0003         16     20   
55  acsd   e2tp  rest-16          0     3        0.0001          8     20   

    f1-micro  f1-macro  accuracy  
16  0.745100    0.7120    0.5938  
71  0.733831       NaN       NaN  
8   0.727100    0.6684    0.5712  
77  0.723192

Unnamed: 0,short,long,cot,para,e2tp
1,0.7333,0.7649,0.7035,0.7066,0.705018
2,0.7608,0.7132,0.6676,0.7271,0.720698
3,0.7503,0.7452,0.7001,0.6961,0.684755
4,0.7858,0.7696,0.7481,0.7451,0.733831
5,0.7523,0.7561,0.7077,0.7173,0.723192


Unnamed: 0,W,pval,normal
short,0.952113,0.75227,True


Unnamed: 0,W,pval,normal
para,0.986734,0.967055,True


Unnamed: 0,W,pval,normal
e2tp,0.939612,0.663198,True


    split prompt        f1
0       1  short  0.733300
1       2  short  0.760800
2       3  short  0.750300
3       4  short  0.785800
4       5  short  0.752300
5       1   para  0.706600
6       2   para  0.727100
7       3   para  0.696100
8       4   para  0.745100
9       5   para  0.717300
10      1   e2tp  0.705018
11      2   e2tp  0.720698
12      3   e2tp  0.684755
13      4   e2tp  0.733831
14      5   e2tp  0.723192
Repeated Measures ANOVA Result:
   Source  ddof1  ddof2          F     p-unc      ng2      eps
0  prompt      2      8  40.427148  0.000066  0.55974  0.57071


  W = np.prod(eig) / (eig.sum() / d) ** d


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,t-test,short vs para,0.7565,0.017154,0.71844,0.01689,8.257422,0.001173,0.00352,True
1,t-test,short vs e2tp,0.7565,0.017154,0.713499,0.017072,6.060829,0.003742,0.007483,True
2,t-test,para vs e2tp,0.71844,0.01689,0.713499,0.017072,1.518264,0.20356,0.20356,False


### 1000

In [45]:
# Low-resource setting: selects the runs whose lr_setting is '1000'
# (presumably 1000 training examples - TODO confirm).
args.lr_setting = 1000
args.task = 'acsd'

# Keep the returned statistics under key '1000'; the bare expression on the last
# line is presumably there so the notebook displays the stored value.
stats_acsd['1000'] = computePromptStatistics(args)
stats_acsd['1000']

    task method  dataset lr-setting split learning-rate batch_size epochs  \
75  acsd   e2tp  rest-16       1000     4        0.0001          8     20   
54  acsd   e2tp  rest-16       1000     5        0.0001          8     20   
4   acsd   para  rest-16       1000     4        0.0003         16     20   
73  acsd   e2tp  rest-16       1000     2        0.0001          8     20   
5   acsd   para  rest-16       1000     2        0.0003         16     20   
64  acsd   e2tp  rest-16       1000     3        0.0001          8     20   
47  acsd   e2tp  rest-16       1000     1        0.0001          8     20   
15  acsd   para  rest-16       1000     5        0.0003         16     20   
13  acsd   para  rest-16       1000     1        0.0003         16     20   
25  acsd   para  rest-16       1000     3        0.0003         16     20   

    f1-micro  f1-macro  accuracy  
75  0.720000       NaN       NaN  
54  0.717752       NaN       NaN  
4   0.699500    0.6900    0.5379  
73  0.698856

Unnamed: 0,short,long,cot,para,e2tp
1,0.702,0.7148,0.678,0.6811,0.692972
2,0.722,0.7295,0.7266,0.6945,0.698856
3,0.7677,0.7282,0.6761,0.6533,0.694408
4,0.7441,0.7821,0.6978,0.6995,0.72
5,0.7217,0.6927,0.7037,0.6906,0.717752


Unnamed: 0,W,pval,normal
short,0.953457,0.761849,True


Unnamed: 0,W,pval,normal
para,0.85663,0.216399,True


Unnamed: 0,W,pval,normal
e2tp,0.816518,0.109729,True


    split prompt        f1
0       1  short  0.702000
1       2  short  0.722000
2       3  short  0.767700
3       4  short  0.744100
4       5  short  0.721700
5       1   para  0.681100
6       2   para  0.694500
7       3   para  0.653300
8       4   para  0.699500
9       5   para  0.690600
10      1   e2tp  0.692972
11      2   e2tp  0.698856
12      3   e2tp  0.694408
13      4   e2tp  0.720000
14      5   e2tp  0.717752
Repeated Measures ANOVA Result:
   Source  ddof1  ddof2         F     p-unc       ng2       eps
0  prompt      2      8  7.082748  0.016969  0.556624  0.548689


  W = np.prod(eig) / (eig.sum() / d) ** d


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,t-test,short vs para,0.7315,0.022475,0.6838,0.0164,2.786583,0.049484,0.098969,False
1,t-test,short vs e2tp,0.7315,0.022475,0.704798,0.011679,2.172949,0.095495,0.098969,False
2,t-test,para vs e2tp,0.6838,0.0164,0.704798,0.011679,-3.314277,0.029536,0.088608,False


### 500

In [46]:
# Low-resource setting: selects the runs whose lr_setting is '500'
# (presumably 500 training examples - TODO confirm).
args.lr_setting = 500
args.task = 'acsd'

# Keep the returned statistics under key '500'; the bare expression on the last
# line is presumably there so the notebook displays the stored value.
stats_acsd['500'] = computePromptStatistics(args)
stats_acsd['500']

    task method  dataset lr-setting split learning-rate batch_size epochs  \
53  acsd   e2tp  rest-16        500     4        0.0001          8     68   
43  acsd   e2tp  rest-16        500     2        0.0001          8     68   
6   acsd   para  rest-16        500     2        0.0003         16     20   
74  acsd   e2tp  rest-16        500     1        0.0001          8     68   
80  acsd   e2tp  rest-16        500     3        0.0001          8     68   
23  acsd   para  rest-16        500     5        0.0003         16     20   
12  acsd   para  rest-16        500     4        0.0003         16     20   
63  acsd   e2tp  rest-16        500     5        0.0001          8     68   
31  acsd   para  rest-16        500     1        0.0003         16     20   
41  acsd   para  rest-16        500     3        0.0003         16     20   

    f1-micro  f1-macro  accuracy  
53  0.690355       NaN       NaN  
43  0.671717       NaN       NaN  
6   0.670800    0.5426    0.5046  
74  0.666667

Unnamed: 0,short,long,cot,para,e2tp
1,0.6947,0.6674,0.6002,0.6212,0.666667
2,0.7293,0.7377,0.614,0.6708,0.671717
3,0.6923,0.6979,0.6154,0.6189,0.641026
4,0.7307,0.7565,0.6863,0.637,0.690355
5,0.7323,0.6623,0.6182,0.6399,0.635779


Unnamed: 0,W,pval,normal
short,0.743326,0.025785,False


Unnamed: 0,W,pval,normal
para,0.881328,0.315371,True


Unnamed: 0,W,pval,normal
e2tp,0.933621,0.62126,True


    split prompt        f1
0       1  short  0.694700
1       2  short  0.729300
2       3  short  0.692300
3       4  short  0.730700
4       5  short  0.732300
5       1   para  0.621200
6       2   para  0.670800
7       3   para  0.618900
8       4   para  0.637000
9       5   para  0.639900
10      1   e2tp  0.666667
11      2   e2tp  0.671717
12      3   e2tp  0.641026
13      4   e2tp  0.690355
14      5   e2tp  0.635779
Friedman Test Result:
          Source     W  ddof1    Q     p-unc
Friedman  Within  0.84      2  8.4  0.014996


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,wilcoxon,short vs para,0.71586,0.018297,0.63756,0.018582,0.0,0.0625,0.1875,False
1,wilcoxon,short vs e2tp,0.71586,0.018297,0.661109,0.020218,0.0,0.0625,0.1875,False
2,wilcoxon,para vs e2tp,0.63756,0.018582,0.661109,0.020218,2.0,0.1875,0.1875,False


In [47]:
# Compare the resource settings against each other for the ACSD task.
# NOTE(review): computeLowResourceStatistics is defined outside this section;
# it presumably reads args.results as set above - confirm.
args.task = 'acsd'

computeLowResourceStatistics(args)

Unnamed: 0,1000,500,full
1,0.702,0.6947,0.7333
2,0.722,0.7293,0.7608
3,0.7677,0.6923,0.7503
4,0.7441,0.7307,0.7858
5,0.7217,0.7323,0.7523


Unnamed: 0,W,pval,normal
1000,0.953457,0.761849,True


Unnamed: 0,W,pval,normal
500,0.743326,0.025785,False


Unnamed: 0,W,pval,normal
full,0.952113,0.75227,True


    split prompt      f1
0       1   1000  0.7020
1       2   1000  0.7220
2       3   1000  0.7677
3       4   1000  0.7441
4       5   1000  0.7217
5       1    500  0.6947
6       2    500  0.7293
7       3    500  0.6923
8       4    500  0.7307
9       5    500  0.7323
10      1   full  0.7333
11      2   full  0.7608
12      3   full  0.7503
13      4   full  0.7858
14      5   full  0.7523
Friedman Test Result:
          Source     W  ddof1    Q     p-unc
Friedman  Within  0.52      2  5.2  0.074274
Results for LR-Comparison of :  short


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,wilcoxon,1000 vs 500,0.7315,0.022475,0.71586,0.018297,4.5,0.625,0.625,False
1,wilcoxon,1000 vs full,0.7315,0.022475,0.7565,0.017154,1.0,0.125,0.25,False
2,wilcoxon,500 vs full,0.71586,0.018297,0.7565,0.017154,0.0,0.0625,0.1875,False


Unnamed: 0,1000,500,full
1,0.7148,0.6674,0.7649
2,0.7295,0.7377,0.7132
3,0.7282,0.6979,0.7452
4,0.7821,0.7565,0.7696
5,0.6927,0.6623,0.7561


Unnamed: 0,W,pval,normal
1000,0.913809,0.490817,True


Unnamed: 0,W,pval,normal
500,0.903737,0.430882,True


Unnamed: 0,W,pval,normal
full,0.883094,0.323566,True


    split prompt      f1
0       1   1000  0.7148
1       2   1000  0.7295
2       3   1000  0.7282
3       4   1000  0.7821
4       5   1000  0.6927
5       1    500  0.6674
6       2    500  0.7377
7       3    500  0.6979
8       4    500  0.7565
9       5    500  0.6623
10      1   full  0.7649
11      2   full  0.7132
12      3   full  0.7452
13      4   full  0.7696
14      5   full  0.7561
Repeated Measures ANOVA Result:
   Source  ddof1  ddof2        F     p-unc       ng2       eps
0  prompt      2      8  3.48814  0.081422  0.279336  0.543913
Results for LR-Comparison of :  long


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,t-test,1000 vs 500,0.72946,0.029458,0.70436,0.037435,2.753789,0.051175,0.153524,False
1,t-test,1000 vs full,0.72946,0.029458,0.7498,0.020102,-1.264852,0.274596,0.274596,False
2,t-test,500 vs full,0.70436,0.037435,0.7498,0.020102,-1.938461,0.124598,0.249195,False


Unnamed: 0,1000,500,full
1,0.678,0.6002,0.7035
2,0.7266,0.614,0.6676
3,0.6761,0.6154,0.7001
4,0.6978,0.6863,0.7481
5,0.7037,0.6182,0.7077


Unnamed: 0,W,pval,normal
1000,0.920545,0.533459,True


Unnamed: 0,W,pval,normal
500,0.734861,0.021421,False


Unnamed: 0,W,pval,normal
full,0.931031,0.603416,True


    split prompt      f1
0       1   1000  0.6780
1       2   1000  0.7266
2       3   1000  0.6761
3       4   1000  0.6978
4       5   1000  0.7037
5       1    500  0.6002
6       2    500  0.6140
7       3    500  0.6154
8       4    500  0.6863
9       5    500  0.6182
10      1   full  0.7035
11      2   full  0.6676
12      3   full  0.7001
13      4   full  0.7481
14      5   full  0.7077
Friedman Test Result:
          Source     W  ddof1    Q     p-unc
Friedman  Within  0.84      2  8.4  0.014996
Results for LR-Comparison of :  cot


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,wilcoxon,1000 vs 500,0.69644,0.018536,0.62682,0.030383,0.0,0.0625,0.1875,False
1,wilcoxon,1000 vs full,0.69644,0.018536,0.7054,0.025648,5.0,0.625,0.625,False
2,wilcoxon,500 vs full,0.62682,0.030383,0.7054,0.025648,0.0,0.0625,0.1875,False


Unnamed: 0,1000,500,full
1,0.702,0.6947,0.7333
2,0.722,0.7293,0.7608
3,0.7677,0.6923,0.7503
4,0.7441,0.7307,0.7858
5,0.7217,0.7323,0.7523


Unnamed: 0,W,pval,normal
1000,0.953457,0.761849,True


Unnamed: 0,W,pval,normal
500,0.743326,0.025785,False


Unnamed: 0,W,pval,normal
full,0.952113,0.75227,True


    split prompt      f1
0       1   1000  0.7020
1       2   1000  0.7220
2       3   1000  0.7677
3       4   1000  0.7441
4       5   1000  0.7217
5       1    500  0.6947
6       2    500  0.7293
7       3    500  0.6923
8       4    500  0.7307
9       5    500  0.7323
10      1   full  0.7333
11      2   full  0.7608
12      3   full  0.7503
13      4   full  0.7858
14      5   full  0.7523
Friedman Test Result:
          Source     W  ddof1    Q     p-unc
Friedman  Within  0.52      2  5.2  0.074274
Results for LR-Comparison of best Prompt per LR-Setting


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,wilcoxon,1000 vs 500,0.7315,0.022475,0.71586,0.018297,4.5,0.625,0.625,False
1,wilcoxon,1000 vs full,0.7315,0.022475,0.7565,0.017154,1.0,0.125,0.25,False
2,wilcoxon,500 vs full,0.71586,0.018297,0.7565,0.017154,0.0,0.0625,0.1875,False


## Create Latex

In [50]:
import pandas as pd

def extract_means_and_stds(stats):
    """
    Collect the mean and std of every method named in the pairwise comparison tables.

    Parameters:
    - stats: A dictionary mapping task -> {resource setting -> comparison DataFrame or None}.
      Each DataFrame must provide the columns 'comparison' (formatted as
      '<method1> vs <method2>'), 'mean 1', 'std 1', 'mean 2' and 'std 2'.

    Returns:
    - A nested dictionary results[task][resource setting][method] = {'mean': ..., 'std': ...}
      where both values are the table values multiplied by 100. The keys 'acd', 'acsa',
      'e2e' and 'acsd' are always present (possibly empty); any other task found in
      `stats` is added as well. Resource settings whose DataFrame is None are omitted.
    """
    # The four standard tasks are always present so callers can index them unconditionally
    results = {
        "acd": {},
        "acsa": {},
        "e2e": {},
        "acsd": {}
    }
    for task, dfs in stats.items():
        # setdefault keeps the standard keys and accepts tasks beyond them without a KeyError
        task_results = results.setdefault(task, {})
        for lr_setting, df in dfs.items():
            if df is None:
                continue
            per_method = task_results[lr_setting] = {}
            for _, row in df.iterrows():
                method1, method2 = row['comparison'].split(' vs ')
                sides = (
                    (method1, row['mean 1'], row['std 1']),
                    (method2, row['mean 2'], row['std 2']),
                )
                for method, mean, std in sides:
                    # A method shows up in several comparisons; only its first occurrence is kept
                    if method not in per_method:
                        per_method[method] = {'mean': mean * 100, 'std': std * 100}

    return results

def create_full_latex_row(task_results, resource_setting):
    """
    Creates a full LaTeX row for a specific resource setting across all tasks.

    For every task that has results for `resource_setting`, the row contains one cell for
    the prompt-based result (the best of 'short', 'long', 'cot' if several are present)
    followed by one cell per baseline method of that task. The highest mean among these
    cells is set in bold. Baselines without a result are rendered as 'N/A'. Tasks without
    results for `resource_setting` contribute no cells at all.

    Parameters:
    - task_results: A dictionary mapping task -> {resource setting -> {method -> {'mean': ..., 'std': ...}}}
      (the structure returned by extract_means_and_stds).
    - resource_setting: The resource setting key (e.g., '0', '1000', '500'); '0' is labelled 'Full'.

    Returns:
    - A LaTeX formatted string representing a full row of the table, terminated by ' \\\\'.
    """
    # Baseline methods shown next to the prompt result, in column order.
    # Tasks that are not listed here get a prompt cell only.
    baselines_by_task = {
        'acd': ['mlcf', 'hier-gcn'],
        'acsa': ['mlcf', 'hier-gcn'],
        'e2e': ['instructAbsa', 'tas-bert'],
        'acsd': ['e2tp', 'para'],
    }
    prompt_styles = ('short', 'long', 'cot')

    def bold(value):
        return r"\scalebox{0.95}{\textbf{" + f"{value:.2f}" + "}}"

    rs_text = resource_setting if resource_setting != '0' else 'Full'
    cells = [r"\multicolumn{1}{r|}{" + rs_text + "}"]

    for task, results_dict in task_results.items():
        if resource_setting not in results_dict:
            continue

        methods = results_dict[resource_setting]
        task_methods = baselines_by_task.get(task, [])

        def mean_of(name):
            return methods[name]['mean']

        # Best prompt style among those available for this setting
        prompts = [name for name in methods if name in prompt_styles]
        highest_prompt = max(prompts, key=mean_of) if prompts else None

        # Only methods that actually have a result may compete for the bold cell;
        # missing baselines are rendered as N/A below instead of raising a KeyError.
        candidates = [name for name in task_methods if name in methods]
        if highest_prompt is not None:
            candidates.append(highest_prompt)
        highest_method = max(candidates, key=mean_of) if candidates else None

        if highest_prompt is None:
            cells.append("N/A")
        elif highest_prompt == highest_method:
            cells.append(bold(mean_of(highest_prompt)))
        else:
            cells.append(f"{mean_of(highest_prompt):.2f}")

        for i, method in enumerate(task_methods):
            if method not in methods:
                cells.append(r"\multicolumn{1}{l|}{N/A}")
                continue
            # The second baseline closes the task's column group with a vertical rule,
            # except for 'acsd'
            separator = '|' if (i == 1 and task != 'acsd') else ''
            value = mean_of(method)
            content = bold(value) if method == highest_method else f"{value:.2f}"
            cells.append(r"\multicolumn{1}{c" + separator + "}{" + content + "}")

    # Joining the cells avoids having to strip a trailing ' & ' afterwards
    return " & ".join(cells) + r" \\"
    
# Gather the per-task comparison statistics into one nested dictionary of means/stds
results_dict = extract_means_and_stds({
    'acd': stats_acd,
    'acsa': stats_acsa,
    'e2e': stats_e2e,
    'acsd': stats_acsd,
})

# One table row per resource setting ('0' is rendered as 'Full')
latex = [create_full_latex_row(results_dict, setting) for setting in ('0', '1000', '500')]

# Each row is followed by a line holding a single '&'
for latex_line in latex:
    print(latex_line)
    print("&")


\multicolumn{1}{r|}{Full} & \scalebox{0.95}{\textbf{85.21}} & \multicolumn{1}{c}{76.05} & \multicolumn{1}{c|}{82.31} & \scalebox{0.95}{\textbf{80.72}} & \multicolumn{1}{c}{51.24} & \multicolumn{1}{c|}{72.41} & \scalebox{0.95}{\textbf{80.02}} & \multicolumn{1}{c}{75.63} & \multicolumn{1}{c|}{70.96} & \scalebox{0.95}{\textbf{75.65}} & \multicolumn{1}{c}{71.35} & \multicolumn{1}{c}{71.84} \\
&
\multicolumn{1}{r|}{1000} & \scalebox{0.95}{\textbf{81.46}} & \multicolumn{1}{c}{68.70} & \multicolumn{1}{c|}{80.32} & \scalebox{0.95}{\textbf{80.18}} & \multicolumn{1}{c}{42.11} & \multicolumn{1}{c|}{71.08} & \scalebox{0.95}{\textbf{77.15}} & \multicolumn{1}{c}{74.95} & \multicolumn{1}{c|}{67.59} & \scalebox{0.95}{\textbf{73.15}} & \multicolumn{1}{c}{70.48} & \multicolumn{1}{c}{68.38} \\
&
\multicolumn{1}{r|}{500} & \scalebox{0.95}{\textbf{81.15}} & \multicolumn{1}{c}{58.32} & \multicolumn{1}{c|}{75.03} & \scalebox{0.95}{\textbf{77.63}} & \multicolumn{1}{c}{43.20} & \multicolumn{1}{c|}{63.59} & 69.

In [73]:
# Display the nested task -> resource setting -> method -> {'mean', 'std'} dictionary
results_dict

{'acd': {'0': {'long': {'mean': 85.21199999999999, 'std': 0.7523935140602945},
   'hier-gcn': {'mean': 82.30506555336025, 'std': 1.3929140032065679},
   'mlcf': {'mean': 76.04799999999999, 'std': 1.7736673870824817}},
  '1000': {'short': {'mean': 81.46000000000001, 'std': 3.298175253075555},
   'hier-gcn': {'mean': 80.32103523022734, 'std': 1.6855850694549288},
   'mlcf': {'mean': 68.704, 'std': 16.0120974266334}},
  '500': {'short': {'mean': 81.15, 'std': 2.819276502934751},
   'hier-gcn': {'mean': 75.03367637863283, 'std': 1.2388248748576085},
   'mlcf': {'mean': 58.31799999999999, 'std': 29.240526260654065}}},
 'acsa': {'0': {'short': {'mean': 80.716, 'std': 2.3123027483441674},
   'hier-gcn': {'mean': 72.4103622678853, 'std': 1.9285041110013303},
   'mlcf': {'mean': 51.242, 'std': 5.449188563446855}},
  '1000': {'long': {'mean': 80.17999999999998, 'std': 0.5774772722800448},
   'hier-gcn': {'mean': 71.08158058984833, 'std': 1.5863988295830342},
   'mlcf': {'mean': 42.114, 'std': 5.

## Performance Comparison of Extraction of ABSA-Tuple Elements over different ABSA Subtasks

In [87]:
# Additional Eval
# Collects the Micro-/Macro-AVG scores stored in each run's 'metrics_asp.tsv' and tabulates
# the mean micro-F1 (in %) per task, low-resource setting and prompt style.

runs = []
RESULTS_PATH = '../results_final/'
col_names = ['model_lang', 'dataset', 'model_shots', 'model_prompt', 'model_task', 'lr', 'lora_r', 'lora_alpha', 'lora_dropout', 'model_quant', 'split', 'lr_setting', 'model_name', 'lang', 'shots', 'prompt', 'task', 'quant', 'epoch', 'model_config', 'path', 'f1-micro', 'f1-macro', 'accuracy']
folder_names = [folder for folder in os.listdir(RESULTS_PATH) if os.path.isdir(os.path.join(RESULTS_PATH, folder)) and folder != '.ipynb_checkpoints']

for folder_name in folder_names:
    try:
        df = pd.read_csv(os.path.join(RESULTS_PATH, folder_name, 'metrics_asp.tsv'), sep = '\t')
        df = df.set_index(df.columns[0])
        cond_name = folder_name.split('/')[-1]
        cond_parameters = cond_name.split('_')

        # Complete run name (stored in the 'path' column)
        model_config_full = cond_parameters.copy()

        # Configuration name shared by all splits/epochs of a run (stored in 'model_config'):
        # drop the split (position 10) and the trailing epoch entry
        model_config = cond_parameters.copy()
        model_config.pop(10)
        model_config.pop(-1)

        cond_parameters.append('_'.join(model_config))
        cond_parameters.append('_'.join(model_config_full))
        cond_parameters.append(df.loc['Micro-AVG', 'f1'])
        cond_parameters.append(df.loc['Macro-AVG', 'f1'])
        cond_parameters.append(df.loc['Micro-AVG', 'accuracy'])
        runs.append(cond_parameters)
    except (OSError, KeyError, IndexError, ValueError):
        # Best-effort scan: folders without a readable/parsable metrics file, without the
        # expected rows/columns, or with an unexpected name layout are skipped.
        # Unlike a bare `except`, this does not swallow KeyboardInterrupt or programming errors.
        continue

results_all = pd.DataFrame(runs, columns = col_names)

# Define the columns as multi-index
columns = pd.MultiIndex.from_tuples([
    ('full', 'short'), ('full', 'long'), ('full', 'context'),
    ('1000', 'short'), ('1000', 'long'), ('1000', 'context'),
    ('500', 'short'), ('500', 'long'), ('500', 'context')
])

# Define the row indices
index = ['acd', 'acsa', 'acsd']

# Create the result table with every cell preset to 'N/A'
df = pd.DataFrame('N/A', index=index, columns=columns)

# Conditions shared by every cell of the table
base_mask = (
    (results_all['dataset'] == 'rest-16')
    & (results_all['model_name'] == args.model)
    & (results_all['split'] != str(0))
    & (results_all['model_lang'] == 'en')
)

for task in index:
    for lr_setting in ['full', '1000', '500']:
        subset = results_all[base_mask & (results_all['task'] == task) & (results_all['lr_setting'] == lr_setting)]
        for config_key, config_runs in subset.groupby(['model_config']):
            # The prompt style is the fourth part of the configuration name; 'cot' is shown as 'context'
            prompt_style = config_key[0].split('_')[3]
            prompt_name = prompt_style if prompt_style != 'cot' else 'context'
            df.at[task, (lr_setting, prompt_name)] = f"{config_runs['f1-micro'].mean()*100:.2f}"


def _row_mean(task):
    """Mean over the filled-in cells of a task row; cells still set to 'N/A' are ignored."""
    return np.mean([float(value) for value in df.loc[task] if value != 'N/A'])


print('Aspect Extraction')
display(df)

# NOTE: each average runs over the cells available for that task, so rows with missing
# prompt styles are averaged over fewer cells than complete rows.
print(f"Average difference ACSA to ACD: {(_row_mean('acsa') - _row_mean('acd')):.2f}")

print(f"Average difference ACSD to ACSA: {(_row_mean('acsd') - _row_mean('acsa')):.2f}")

print(f"Average difference ACSD to ACD: {(_row_mean('acsd') - _row_mean('acd')):.2f}")

Aspect Extraction


Unnamed: 0_level_0,full,full,full,1000,1000,1000,500,500,500
Unnamed: 0_level_1,short,long,context,short,long,context,short,long,context
acd,84.41,85.21,,81.46,79.74,,81.15,78.72,
acsa,85.33,85.1,84.02,84.37,84.94,84.04,81.72,80.32,78.74
acsd,86.37,85.94,82.54,84.9,84.89,84.47,82.38,81.24,79.4


Average difference ACSA to ACD: 1.39
Average difference ACSD to ACSA: 0.39
Average difference ACSD to ACD: 1.79


In [88]:
# Additional Eval
# Collects the Micro-/Macro-AVG scores stored in each run's 'metrics_asp_pol.tsv' and tabulates
# the mean micro-F1 (in %) per task, low-resource setting and prompt style.

runs = []
RESULTS_PATH = '../results_final/'
col_names = ['model_lang', 'dataset', 'model_shots', 'model_prompt', 'model_task', 'lr', 'lora_r', 'lora_alpha', 'lora_dropout', 'model_quant', 'split', 'lr_setting', 'model_name', 'lang', 'shots', 'prompt', 'task', 'quant', 'epoch', 'model_config', 'path', 'f1-micro', 'f1-macro', 'accuracy']
folder_names = [folder for folder in os.listdir(RESULTS_PATH) if os.path.isdir(os.path.join(RESULTS_PATH, folder)) and folder != '.ipynb_checkpoints']

for folder_name in folder_names:
    try:
        df = pd.read_csv(os.path.join(RESULTS_PATH, folder_name, 'metrics_asp_pol.tsv'), sep = '\t')
        df = df.set_index(df.columns[0])
        cond_name = folder_name.split('/')[-1]
        cond_parameters = cond_name.split('_')

        # Complete run name (stored in the 'path' column)
        model_config_full = cond_parameters.copy()

        # Configuration name shared by all splits/epochs of a run (stored in 'model_config'):
        # drop the split (position 10) and the trailing epoch entry
        model_config = cond_parameters.copy()
        model_config.pop(10)
        model_config.pop(-1)

        cond_parameters.append('_'.join(model_config))
        cond_parameters.append('_'.join(model_config_full))
        cond_parameters.append(df.loc['Micro-AVG', 'f1'])
        cond_parameters.append(df.loc['Macro-AVG', 'f1'])
        cond_parameters.append(df.loc['Micro-AVG', 'accuracy'])
        runs.append(cond_parameters)
    except (OSError, KeyError, IndexError, ValueError):
        # Best-effort scan: folders without a readable/parsable metrics file, without the
        # expected rows/columns, or with an unexpected name layout are skipped.
        # Unlike a bare `except`, this does not swallow KeyboardInterrupt or programming errors.
        continue

results_all = pd.DataFrame(runs, columns = col_names)

# Define the columns as multi-index
columns = pd.MultiIndex.from_tuples([
    ('full', 'short'), ('full', 'long'), ('full', 'context'),
    ('1000', 'short'), ('1000', 'long'), ('1000', 'context'),
    ('500', 'short'), ('500', 'long'), ('500', 'context')
])

# Define the row indices
index = ['acd', 'acsa', 'acsd']

# Create the result table with every cell preset to 'N/A'
df = pd.DataFrame('N/A', index=index, columns=columns)

# Conditions shared by every cell of the table
base_mask = (
    (results_all['dataset'] == 'rest-16')
    & (results_all['model_name'] == args.model)
    & (results_all['split'] != str(0))
    & (results_all['model_lang'] == 'en')
)

for task in index:
    for lr_setting in ['full', '1000', '500']:
        subset = results_all[base_mask & (results_all['task'] == task) & (results_all['lr_setting'] == lr_setting)]
        for config_key, config_runs in subset.groupby(['model_config']):
            # The prompt style is the fourth part of the configuration name; 'cot' is shown as 'context'
            prompt_style = config_key[0].split('_')[3]
            prompt_name = prompt_style if prompt_style != 'cot' else 'context'
            df.at[task, (lr_setting, prompt_name)] = f"{config_runs['f1-micro'].mean()*100:.2f}"


def _row_mean(task):
    """Mean over the filled-in cells of a task row; cells still set to 'N/A' are ignored."""
    return np.mean([float(value) for value in df.loc[task] if value != 'N/A'])


print('Aspect + Polarity Extraction')
display(df)

# Left as the cell's last expression so the notebook shows it as the cell result
f"Average difference: {(_row_mean('acsd') - _row_mean('acsa')):.2f}"


Aspect + Polarity Extraction


Unnamed: 0_level_0,full,full,full,1000,1000,1000,500,500,500
Unnamed: 0_level_1,short,long,context,short,long,context,short,long,context
acd,,,,,,,,,
acsa,80.72,80.7,79.4,79.32,80.18,79.92,77.63,76.7,75.25
acsd,82.29,82.1,78.82,80.17,80.04,80.34,78.82,77.33,75.65


'Average difference: 0.64'

In [9]:
# Eval for best parameter combination over all tasks and dataset sizes
# For every (lr_setting, task, prompt, learning_rate, lora_r, lora_alpha) group of the split-'0'
# runs, the row with the highest micro-F1 is kept; the micro-F1 of these rows is then
# averaged per (learning_rate, lora_r, lora_alpha).

RESULTS_PATH = '../results_final'
DATASET = 'rest-16'

col_names = ['lang', 'dataset', 'few_shot', 'prompt', 'task', 'learning_rate', 'lora_r', 'lora_alpha', 'lora_dropout', 'quant', 'split', 'lr_setting', 'model', 'prompt_lang', 'prompt_few_shot', 'prompt_prompt', 'prompt_task', 'prompt_quant', 'epoch', 'f1-micro', 'f1-macro', 'accuracy']
folder_names = [folder for folder in os.listdir(RESULTS_PATH) if os.path.isdir(os.path.join(RESULTS_PATH, folder)) and folder != '.ipynb_checkpoints']
runs = []

# Metrics file that is read for each task
METRICS_FILE_BY_TASK = {
    'acd': 'metrics_asp.tsv',
    'acsa': 'metrics_asp_pol.tsv',
    # NOTE(review): the other files are named 'metrics_*.tsv'; confirm that 'pol.tsv' is the
    # intended file name, since runs whose file cannot be read are skipped without a message.
    'e2e': 'pol.tsv',
    'acsd': 'metrics_phrases.tsv',
}

for folder_name in folder_names:
    cond_name = folder_name.split('/')[-1]
    cond_parameters = cond_name.split('_')

    # The task is the fifth part of the folder name; folders with another layout or an
    # unknown task are skipped
    if len(cond_parameters) < 5 or cond_parameters[4] not in METRICS_FILE_BY_TASK:
        continue
    filename = METRICS_FILE_BY_TASK[cond_parameters[4]]

    try:
        df = pd.read_csv(os.path.join(RESULTS_PATH, folder_name, filename), sep = '\t')
        df = df.set_index(df.columns[0])
        metrics = [
            df.loc['Micro-AVG', 'f1'],
            df.loc['Macro-AVG', 'f1'],
            df.loc['Micro-AVG', 'accuracy'],
        ]
    except (OSError, KeyError, IndexError, ValueError):
        # Best-effort scan: runs without a readable/parsable metrics file or without the
        # expected rows/columns are skipped. Unlike a bare `except`, this does not swallow
        # KeyboardInterrupt or programming errors.
        continue

    runs.append(cond_parameters + metrics)

results_all = pd.DataFrame(runs, columns = col_names)

# Keep the split-'0' runs of the selected dataset, best micro-F1 first
split_zero = (results_all['dataset'] == DATASET) & (results_all['split'] == '0')
results_sub = results_all[split_zero].sort_values(by = ['f1-micro'], ascending = False)
results_sub = results_sub[results_sub['lr_setting'] != 'orig']
results_sub = results_sub[['dataset', 'task', 'prompt', 'learning_rate', 'lr_setting', 'lora_r', 'lora_alpha', 'epoch', 'f1-micro', 'f1-macro']]
results_sub = results_sub.reset_index()

# Row with the highest micro-F1 within each hyperparameter group
idx_max = results_sub.groupby(['lr_setting', 'task', 'prompt', 'learning_rate', 'lora_r', 'lora_alpha'])['f1-micro'].idxmax()
results_per_epoch = results_sub.loc[idx_max]

# Left as the cell's last expression so the notebook shows it as the cell result
results_per_epoch.groupby(['learning_rate', 'lora_r', 'lora_alpha'])['f1-micro'].mean()

learning_rate  lora_r  lora_alpha
0.0003         32      32            0.778796
                       64            0.735858
               8       16            0.778783
                       8             0.795646
3e-05          32      32            0.770671
                       64            0.782188
               8       16            0.766229
                       8             0.750042
Name: f1-micro, dtype: float64