## Language

In [15]:
import pandas as pd
import os
import sys
import numpy as np
import pandas as pd
from scipy.stats import kruskal, mannwhitneyu
from sklearn.metrics import f1_score
from sklearn.utils import resample
from itertools import combinations

import random
import scikit_posthocs as sp
import scipy.stats as stats
import numpy as np

utils = os.path.abspath('../scripts/utils/')
sys.path.append(utils)

from preprocessing import loadDataset
from evaluation import extractAspects, convertLabels, createResults
from types import SimpleNamespace
from pingouin import kruskal
import pingouin as pg
import chardet
import codecs

pd.set_option('display.max_columns', None)
random.seed(42)

args = SimpleNamespace(**{
    'dataset': 'GERestaurant',
    'model': "meta-llama-Meta-Llama-3-8B",
    'lang': 'en'
})

stats_acd = {}
stats_acsa = {}
stats_e2e = {}
stats_acsd = {}

def computePromptStatistics(args):
    if args.lr_setting == 0:
        lr_setting = 'full'
    else:
        lr_setting = str(args.lr_setting)
    
    results_sub = args.results[np.logical_and.reduce([args.results['dataset'] == args.dataset, 
                                                         args.results['task'] == args.task, 
                                                         args.results['model_name'] == args.model,
                                                         args.results['split'] != str(0),
                                                         args.results['lr_setting'] == lr_setting,
                                                         args.results['model_lang'] == args.lang])].sort_values(by = ['f1-micro'], ascending = False)

    results_sub_baseline = args.results_baseline[np.logical_and.reduce([args.results_baseline['lr-setting'] == str(args.lr_setting), 
                                                                        args.results_baseline['dataset'] == args.dataset, 
                                                                        args.results_baseline['task'] == args.task, 
                                                                        args.results_baseline['split'] != str(0)])].sort_values(by = ['f1-micro'], ascending = False)

    display(results_sub_baseline)
    results_sub = results_sub[['lang', 'dataset', 'task', 'prompt', 'lr', 'lora_r', 'lora_alpha', 'lora_dropout', 'split', 'lr_setting', 'model_name', 'epoch', 'model_config', 'path', 'f1-micro', 'f1-macro', 'accuracy']]
    results_sub_baseline = results_sub_baseline[['task', 'method', 'dataset', 'learning-rate', 'batch_size', 'lr-setting', 'split', 'epochs', 'f1-micro', 'f1-macro', 'accuracy']]

    idx_max = results_sub.groupby(['model_config', 'split'])['f1-micro'].idxmax()
    results_per_epoch = results_sub.loc[idx_max]
    
    if args.task == 'acd':
        prompts = ['short', 'long']
        baselines = ['hier-gcn', 'mlcf']
    elif args.task == 'acsa':
        prompts = ['short', 'long', 'cot']
        baselines = ['hier-gcn', 'mlcf']
    elif args.task == 'e2e':
        prompts = ['short', 'long', 'cot']
        baselines = ['instructAbsa', 'tas-bert']
    else:
        prompts = ['short', 'long', 'cot']
        baselines = ['para', 'e2tp']
        
    f1_prompts = {}
    
    for prompt in prompts:
        f1 = {}
        try:
            for i in range(1, 6): 
                f1[i] = results_per_epoch[np.logical_and.reduce([results_per_epoch['split'] == str(i),results_per_epoch['prompt'] == prompt])].iloc[0,14]
            f1_prompts[prompt] = f1
        except:
            pass
    
    for method in baselines:
        f1 = {}
        # try:
        for i in range(1, 6): 
            f1[i] = results_sub_baseline[np.logical_and.reduce([results_sub_baseline['split'] == str(i), results_sub_baseline['method'] == method])].iloc[0,8]
        f1_prompts[method] = f1
        # except:
        #     pass
    
    df_prompts = pd.DataFrame(f1_prompts)
    
    display(df_prompts)


    # Only use the best performing FT-LLM prompt
    available_prompts = [prompt for prompt in prompts if prompt in df_prompts.columns]

    # Calculate the average F1 scores
    avg_f1 = df_prompts[available_prompts].mean()
    
    # Find the best prompt
    best_prompt = avg_f1.idxmax()
    
    # Identify prompts to drop
    prompts_to_drop = [prompt for prompt in prompts if prompt != best_prompt]
    
    # Drop the other prompts
    df_prompts = df_prompts.drop(columns=prompts_to_drop)

    normality_results = {col: pg.normality(df_prompts[col]) for col in df_prompts.columns}

    for key, item in normality_results.items():
        display(item)
    
    all_normal = all([result['normal'].iloc[0] for result in normality_results.values()])

    print(df_prompts.melt(var_name='prompt', value_name='f1', ignore_index=False).reset_index().rename(columns={'index': 'split'}))
    
    if all_normal:
        # Wenn alle Spalten normalverteilt sind, verwende repeated measures ANOVA
        rm_anova = pg.rm_anova(dv='f1', within='prompt', subject='split', data=df_prompts.melt(var_name='prompt', value_name='f1', ignore_index=False).reset_index().rename(columns={'index': 'split'}))
        print("Repeated Measures ANOVA Result:")
        print(rm_anova)
    else:
        # Wenn nicht alle Spalten normalverteilt sind, verwende den Friedman-Test
        friedman = pg.friedman(df_prompts)
        print("Friedman Test Result:")
        print(friedman)

    # Paarweise Vergleiche
    results = []
    columns = df_prompts.columns
    comb = combinations(columns, 2)
    
    for col1, col2 in comb:
        # if not ((col1 == baselines[0] and col2 == baselines[1]) or (col1 == baselines[1] and col2 == baselines[0])):
        # Falls beide Kolonnen normalverteilt sind, gepaarter t-Test
        if all_normal:
            test = 't-test'
            test_result = pg.ttest(df_prompts[col1], df_prompts[col2], paired=True, alternative = 'two-sided')
            statistic = test_result['T']['T-test']
        else:
            # Falls nicht, Wilcoxon-Test
            test = 'wilcoxon'
            test_result = pg.wilcoxon(df_prompts[col1], df_prompts[col2], alternative = 'two-sided')
            statistic = test_result['W-val']['Wilcoxon']
        
        result = {
            'test': test,
            'comparison': f'{col1} vs {col2}',
            'mean 1': np.mean(df_prompts[col1]),
            'std 1': np.std(df_prompts[col1]),
            'mean 2': np.mean(df_prompts[col2]),
            'std 2': np.std(df_prompts[col2]),
            'statistic': statistic,
            'p_value': test_result['p-val'].iloc[0]
        }
        results.append(result)
    
    # Erstellung eines DataFrames für die Testergebnisse
    results_df = pd.DataFrame(results)
    
    # Durchführung der Bonferroni-Holm-Korrektur
    corrected_p = pg.multicomp(results_df['p_value'], method='holm', alpha = 0.05)
    results_df['corrected_p_value'] = corrected_p[1]
    results_df['significant'] = corrected_p[0]
    
    return results_df

def computeLowResourceStatistics(args):
    results_sub = args.results[np.logical_and.reduce([args.results['dataset'] == args.dataset, 
                                                         args.results['task'] == args.task, 
                                                         args.results['model_name'] == args.model,
                                                         args.results['split'] != str(0),
                                                         args.results['model_lang'] == args.lang])].sort_values(by = ['f1-micro'], ascending = False)

    results_sub = results_sub[['lang', 'dataset', 'task', 'prompt', 'lr', 'lora_r', 'lora_alpha', 'lora_dropout', 'split', 'lr_setting', 'model_name', 'epoch', 'model_config', 'path', 'f1-micro', 'f1-macro', 'accuracy']]

    if args.task == 'acd':
        prompts = ['short', 'long']
    else:
        prompts = ['short', 'long', 'cot']

    for prompt in prompts:
        
        f1_splits = {}
        
        for lr_setting in ['1000','500','full']:
            f1 = {}
            try:
                for i in range(1, 6): 
                    f1[i] = results_sub[np.logical_and.reduce([results_sub['lr_setting'] == lr_setting, results_sub['split'] == str(i), results_sub['prompt'] == prompt])].iloc[0,14]
                f1_splits[lr_setting] = f1
            except:
                pass
        
        df_splits = pd.DataFrame(f1_splits)
        
        display(df_splits)
    
        normality_results = {col: pg.normality(df_splits[col]) for col in df_splits.columns}
    
        for key, item in normality_results.items():
            display(item)
        
        all_normal = all([result['normal'].iloc[0] for result in normality_results.values()])
    
        print(df_splits.melt(var_name='prompt', value_name='f1', ignore_index=False).reset_index().rename(columns={'index': 'split'}))
        
        if all_normal:
            # Wenn alle Spalten normalverteilt sind, verwende repeated measures ANOVA
            rm_anova = pg.rm_anova(dv='f1', within='prompt', subject='split', data=df_splits.melt(var_name='prompt', value_name='f1', ignore_index=False).reset_index().rename(columns={'index': 'split'}))
            print("Repeated Measures ANOVA Result:")
            print(rm_anova)
        else:
            # Wenn nicht alle Spalten normalverteilt sind, verwende den Friedman-Test
            friedman = pg.friedman(df_splits)
            print("Friedman Test Result:")
            print(friedman)
    
        # Paarweise Vergleiche
        results = []
        columns = df_splits.columns
        comb = combinations(columns, 2)
        
        for col1, col2 in comb:
            # Falls beide Kolonnen normalverteilt sind, gepaarter t-Test
            if all_normal:
                test = 't-test'
                test_result = pg.ttest(df_splits[col1], df_splits[col2], paired=True, alternative = 'two-sided')
                statistic = test_result['T']['T-test']
            else:
                # Falls nicht, Wilcoxon-Test
                test = 'wilcoxon'
                test_result = pg.wilcoxon(df_splits[col1], df_splits[col2], alternative = 'two-sided')
                statistic = test_result['W-val']['Wilcoxon']
            
            result = {
                'test': test,
                'comparison': f'{col1} vs {col2}',
                'mean 1': np.mean(df_splits[col1]),
                'std 1': np.std(df_splits[col1]),
                'mean 2': np.mean(df_splits[col2]),
                'std 2': np.std(df_splits[col2]),
                'statistic': statistic,
                'p_value': test_result['p-val'].iloc[0]
            }
            results.append(result)
        
        # Erstellung eines DataFrames für die Testergebnisse
        results_df = pd.DataFrame(results)
        
        # Durchführung der Bonferroni-Holm-Korrektur
        corrected_p = pg.multicomp(results_df['p_value'], method='holm', alpha = 0.05)
        results_df['corrected_p_value'] = corrected_p[1]
        results_df['significant'] = corrected_p[0]

        print('Results for LR-Comparison of : ', prompt)
        display(results_df)


    
    ####
    # Compute based on best performing prompt per low-resource setting
    ####


    
    f1_splits = {}
    
    for prompt in prompts:
        
        for lr_setting in ['1000','500','full']:
            f1 = {}
            try:
                for i in range(1, 6): 
                    f1[i] = results_sub[np.logical_and.reduce([results_sub['lr_setting'] == lr_setting, results_sub['split'] == str(i), results_sub['prompt'] == prompt])].iloc[0,14]
    
                if lr_setting not in f1_splits.keys() or np.mean(list(f1.values())) > np.mean(list(f1_splits[lr_setting].values())):
                    f1_splits[lr_setting] = f1
            except:
                pass

    df_splits = pd.DataFrame(f1_splits)
        
    display(df_splits)

    normality_results = {col: pg.normality(df_splits[col]) for col in df_splits.columns}

    for key, item in normality_results.items():
        display(item)
    
    all_normal = all([result['normal'].iloc[0] for result in normality_results.values()])

    print(df_splits.melt(var_name='prompt', value_name='f1', ignore_index=False).reset_index().rename(columns={'index': 'split'}))
    
    if all_normal:
        # Wenn alle Spalten normalverteilt sind, verwende repeated measures ANOVA
        rm_anova = pg.rm_anova(dv='f1', within='prompt', subject='split', data=df_splits.melt(var_name='prompt', value_name='f1', ignore_index=False).reset_index().rename(columns={'index': 'split'}))
        print("Repeated Measures ANOVA Result:")
        print(rm_anova)
    else:
        # Wenn nicht alle Spalten normalverteilt sind, verwende den Friedman-Test
        friedman = pg.friedman(df_splits)
        print("Friedman Test Result:")
        print(friedman)

    # Paarweise Vergleiche
    results = []
    columns = df_splits.columns
    comb = combinations(columns, 2)
    
    for col1, col2 in comb:
        # Falls beide Kolonnen normalverteilt sind, gepaarter t-Test
        if all_normal:
            test = 't-test'
            test_result = pg.ttest(df_splits[col1], df_splits[col2], paired=True, alternative = 'two-sided')
            statistic = test_result['T']['T-test']
        else:
            # Falls nicht, Wilcoxon-Test
            test = 'wilcoxon'
            test_result = pg.wilcoxon(df_splits[col1], df_splits[col2], alternative = 'two-sided')
            statistic = test_result['W-val']['Wilcoxon']
        
        result = {
            'test': test,
            'comparison': f'{col1} vs {col2}',
            'mean 1': np.mean(df_splits[col1]),
            'std 1': np.std(df_splits[col1]),
            'mean 2': np.mean(df_splits[col2]),
            'std 2': np.std(df_splits[col2]),
            'statistic': statistic,
            'p_value': test_result['p-val'].iloc[0]
        }
        results.append(result)
    
    # Erstellung eines DataFrames für die Testergebnisse
    results_df = pd.DataFrame(results)
    
    # Durchführung der Bonferroni-Holm-Korrektur
    corrected_p = pg.multicomp(results_df['p_value'], method='holm', alpha = 0.05)
    results_df['corrected_p_value'] = corrected_p[1]
    results_df['significant'] = corrected_p[0]

    print('Results for LR-Comparison of best Prompt per LR-Setting')
    display(results_df)



# GERestaurant

## ACD

In [16]:
# LLM-based Method

runs = []
RESULTS_PATH = '../results_final/'
col_names = ['model_lang', 'dataset', 'model_shots', 'model_prompt', 'model_task', 'lr', 'lora_r', 'lora_alpha', 'lora_dropout', 'model_quant', 'split', 'lr_setting', 'model_name', 'lang', 'shots', 'prompt', 'task', 'quant', 'epoch', 'model_config', 'path', 'f1-micro', 'f1-macro', 'accuracy']
folder_names = [folder for folder in os.listdir(RESULTS_PATH) if os.path.isdir(os.path.join(RESULTS_PATH, folder)) and folder != '.ipynb_checkpoints']

for folder_name in folder_names:
    try:
        df = pd.read_csv(RESULTS_PATH + folder_name + '/metrics_asp.tsv', sep = '\t')
        df = df.set_index(df.columns[0])
        cond_name = folder_name.split('/')[-1]
        cond_parameters = cond_name.split('_')

        model_config = cond_parameters.copy()
        
        # Remove split column from config string
        model_config_full = model_config.copy()
        model_config.pop(10)

        # Remove epoch column from config string
        model_config.pop(-1)
        
        cond_parameters.append('_'.join(model_config))
        cond_parameters.append('_'.join(model_config_full))
        cond_parameters.append(df.loc['Micro-AVG', 'f1'])
        cond_parameters.append(df.loc['Macro-AVG', 'f1'])
        cond_parameters.append(df.loc['Micro-AVG', 'accuracy'])
        runs.append(cond_parameters)
    except:
        pass

results_all = pd.DataFrame(runs, columns = col_names)


###
# Baselines
##

# Multi-label Classifiaction
METHOD = 'mlcf'
RESULTS_PATH = ''

# col_names = ['task', 'dataset', 'lr-setting', 'split', 'learning-rate', 'batch_size', 'epochs', 'f1-micro', 'f1-macro', 'accuracy']
col_names = ['task', 'method', 'dataset', 'lr-setting', 'split', 'learning-rate', 'batch_size', 'epochs', 'f1-micro', 'f1-macro', 'accuracy']
folder_names = [folder for folder in os.listdir(os.path.join(RESULTS_PATH, METHOD)) if os.path.isdir(os.path.join(RESULTS_PATH, METHOD, folder)) and folder != '.ipynb_checkpoints']
runs = []

for folder_name in folder_names:
    try:
        df = pd.read_csv(os.path.join(RESULTS_PATH, METHOD, folder_name, 'metrics_asp.tsv'), sep = '\t')
        df = df.set_index(df.columns[0])
        cond_name = folder_name.split('/')[-1]
        cond_parameters = cond_name.split('_')
        
        cond_parameters.append(df.loc['Micro-AVG', 'f1'])
        cond_parameters.append(df.loc['Macro-AVG', 'f1'])
        cond_parameters.append(df.loc['Micro-AVG', 'accuracy'])
        cond_parameters[1:1] = [METHOD]
        
        runs.append(cond_parameters)
    except:
        pass

# Hier-GCN
METHOD = 'hier-gcn'
RESULTS_PATH = '../../../ABSA-Baselines/ACSA-HGCN-custom/output_ref'

col_names = ['task', 'method', 'dataset', 'lr-setting', 'split', 'learning-rate', 'batch_size', 'epochs', 'f1-micro', 'f1-macro', 'accuracy']


folder_names = [folder for folder in os.listdir(RESULTS_PATH) if os.path.isdir(RESULTS_PATH) and folder != '.ipynb_checkpoints']

for folder_name in folder_names:
    # try:
    metrics_dict = {}

    with open(os.path.join(RESULTS_PATH, folder_name, 'cate_eval_results.txt'), 'r') as file:
        for line in file:
            # Strip any leading/trailing whitespace and split the line by '='
            key, value = line.strip().split('=')
            # Convert the value to a float and store it in the dictionary
            metrics_dict[key.strip()] = float(value.strip())
            
    cond_name = folder_name
    cond_parameters = cond_name.split('_')
    cond_parameters.append(metrics_dict['micro-f1'])
    cond_parameters.extend([None,None])
    cond_parameters[0] = 'acd'
    cond_parameters[1:1] = [METHOD]
    
    runs.append(cond_parameters)
    # except:
    #     pass

results_baseline = pd.DataFrame(runs, columns = col_names)

args.results = results_all
args.results_baseline = results_baseline

### Full Dataset

In [17]:
args.lr_setting = 0
args.task = 'acd'

stats_acd['0'] = computePromptStatistics(args)
stats_acd['0']

Unnamed: 0,task,method,dataset,lr-setting,split,learning-rate,batch_size,epochs,f1-micro,f1-macro,accuracy
213,acd,mlcf,GERestaurant,0,2,2e-05,16,3.0,0.9326,0.9189,0.8737
90,acd,mlcf,GERestaurant,0,4,2e-05,16,3.0,0.9264,0.9183,0.8628
257,acd,mlcf,GERestaurant,0,3,2e-05,16,3.0,0.9224,0.9094,0.8559
6,acd,mlcf,GERestaurant,0,5,2e-05,16,3.0,0.9186,0.8959,0.8495
220,acd,mlcf,GERestaurant,0,1,2e-05,16,3.0,0.9147,0.9076,0.8429
306,acd,hier-gcn,GERestaurant,0,2,5e-05,8,20.0,0.910734,,
294,acd,hier-gcn,GERestaurant,0,5,5e-05,8,20.0,0.901149,,
317,acd,hier-gcn,GERestaurant,0,3,5e-05,8,20.0,0.893182,,
302,acd,hier-gcn,GERestaurant,0,4,5e-05,8,20.0,0.892019,,
304,acd,hier-gcn,GERestaurant,0,1,5e-05,8,20.0,0.888636,,


Unnamed: 0,short,long,hier-gcn,mlcf
1,0.8748,0.8801,0.888636,0.9147
2,0.8663,0.8698,0.910734,0.9326
3,0.8757,0.8674,0.893182,0.9224
4,0.8943,0.8895,0.892019,0.9264
5,0.8827,0.8846,0.901149,0.9186


Unnamed: 0,W,pval,normal
short,0.962545,0.825588,True


Unnamed: 0,W,pval,normal
hier-gcn,0.905476,0.440887,True


Unnamed: 0,W,pval,normal
mlcf,0.987348,0.969604,True


    split    prompt        f1
0       1     short  0.874800
1       2     short  0.866300
2       3     short  0.875700
3       4     short  0.894300
4       5     short  0.882700
5       1  hier-gcn  0.888636
6       2  hier-gcn  0.910734
7       3  hier-gcn  0.893182
8       4  hier-gcn  0.892019
9       5  hier-gcn  0.901149
10      1      mlcf  0.914700
11      2      mlcf  0.932600
12      3      mlcf  0.922400
13      4      mlcf  0.926400
14      5      mlcf  0.918600
Repeated Measures ANOVA Result:
   Source  ddof1  ddof2          F     p-unc       ng2       eps
0  prompt      2      8  29.170487  0.000211  0.839065  0.597075


  W = np.prod(eig) / (eig.sum() / d) ** d


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,t-test,short vs hier-gcn,0.87876,0.009352,0.897144,0.007938,-2.448854,0.070532,0.070532,False
1,t-test,short vs mlcf,0.87876,0.009352,0.92294,0.006202,-7.320512,0.001853,0.003705,True
2,t-test,hier-gcn vs mlcf,0.897144,0.007938,0.92294,0.006202,-8.832748,0.000907,0.002721,True


### 1000

In [18]:
args.lr_setting = 1000
args.task = 'acd'

stats_acd['1000'] = computePromptStatistics(args)
stats_acd['1000']

Unnamed: 0,task,method,dataset,lr-setting,split,learning-rate,batch_size,epochs,f1-micro,f1-macro,accuracy
75,acd,mlcf,GERestaurant,1000,4,2e-05,16,6.0,0.9254,0.9198,0.8612
45,acd,mlcf,GERestaurant,1000,5,2e-05,16,6.0,0.9202,0.8965,0.8522
251,acd,mlcf,GERestaurant,1000,2,2e-05,16,6.0,0.9062,0.8848,0.8285
74,acd,mlcf,GERestaurant,1000,3,2e-05,16,6.0,0.9053,0.8866,0.827
248,acd,mlcf,GERestaurant,1000,1,2e-05,16,6.0,0.904,0.8944,0.8249
297,acd,hier-gcn,GERestaurant,1000,3,5e-05,8,43.0,0.890427,,
303,acd,hier-gcn,GERestaurant,1000,2,5e-05,8,43.0,0.878547,,
332,acd,hier-gcn,GERestaurant,1000,5,5e-05,8,43.0,0.871972,,
299,acd,hier-gcn,GERestaurant,1000,4,5e-05,8,43.0,0.868235,,
290,acd,hier-gcn,GERestaurant,1000,1,5e-05,8,43.0,0.861397,,


Unnamed: 0,short,long,hier-gcn,mlcf
1,0.8798,0.8698,0.861397,0.904
2,0.8423,0.8509,0.878547,0.9062
3,0.8625,0.8555,0.890427,0.9053
4,0.8952,0.8993,0.868235,0.9254
5,0.8527,0.8469,0.871972,0.9202


Unnamed: 0,W,pval,normal
short,0.970018,0.87537,True


Unnamed: 0,W,pval,normal
hier-gcn,0.97509,0.906801,True


Unnamed: 0,W,pval,normal
mlcf,0.816104,0.108917,True


    split    prompt        f1
0       1     short  0.879800
1       2     short  0.842300
2       3     short  0.862500
3       4     short  0.895200
4       5     short  0.852700
5       1  hier-gcn  0.861397
6       2  hier-gcn  0.878547
7       3  hier-gcn  0.890427
8       4  hier-gcn  0.868235
9       5  hier-gcn  0.871972
10      1      mlcf  0.904000
11      2      mlcf  0.906200
12      3      mlcf  0.905300
13      4      mlcf  0.925400
14      5      mlcf  0.920200
Repeated Measures ANOVA Result:
   Source  ddof1  ddof2          F   p-unc       ng2       eps
0  prompt      2      8  12.218227  0.0037  0.692187  0.696914


  W = np.prod(eig) / (eig.sum() / d) ** d


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,t-test,short vs hier-gcn,0.8665,0.018935,0.874116,0.009865,-0.598268,0.581889,0.581889,False
1,t-test,short vs mlcf,0.8665,0.018935,0.91222,0.008821,-5.248956,0.006302,0.018905,True
2,t-test,hier-gcn vs mlcf,0.874116,0.009865,0.91222,0.008821,-5.058287,0.007189,0.018905,True


### 500

In [19]:
args.lr_setting = 500
args.task = 'acd'

stats_acd['500'] = computePromptStatistics(args)
stats_acd['500']

Unnamed: 0,task,method,dataset,lr-setting,split,learning-rate,batch_size,epochs,f1-micro,f1-macro,accuracy
255,acd,mlcf,GERestaurant,500,4,2e-05,16,13.0,0.9085,0.9085,0.8323
276,acd,mlcf,GERestaurant,500,5,2e-05,16,13.0,0.9061,0.8819,0.8284
183,acd,mlcf,GERestaurant,500,3,2e-05,16,13.0,0.9031,0.8849,0.8233
243,acd,mlcf,GERestaurant,500,1,2e-05,16,13.0,0.8969,0.8879,0.8131
124,acd,mlcf,GERestaurant,500,2,2e-05,16,13.0,0.8917,0.8692,0.8046
311,acd,hier-gcn,GERestaurant,500,3,5e-05,8,86.0,0.882759,,
315,acd,hier-gcn,GERestaurant,500,2,5e-05,8,86.0,0.861397,,
326,acd,hier-gcn,GERestaurant,500,4,5e-05,8,20.0,0.861176,,
301,acd,hier-gcn,GERestaurant,500,5,5e-05,8,86.0,0.854801,,
313,acd,hier-gcn,GERestaurant,500,3,5e-05,8,20.0,0.852234,,


Unnamed: 0,short,long,hier-gcn,mlcf
1,0.887,0.8369,0.834091,0.8969
2,0.8264,0.8215,0.861397,0.8917
3,0.8558,0.8258,0.882759,0.9031
4,0.8674,0.8406,0.861176,0.9085
5,0.8694,0.837,0.854801,0.9061


Unnamed: 0,W,pval,normal
short,0.943288,0.689265,True


Unnamed: 0,W,pval,normal
hier-gcn,0.949624,0.734505,True


Unnamed: 0,W,pval,normal
mlcf,0.944973,0.701274,True


    split    prompt        f1
0       1     short  0.887000
1       2     short  0.826400
2       3     short  0.855800
3       4     short  0.867400
4       5     short  0.869400
5       1  hier-gcn  0.834091
6       2  hier-gcn  0.861397
7       3  hier-gcn  0.882759
8       4  hier-gcn  0.861176
9       5  hier-gcn  0.854801
10      1      mlcf  0.896900
11      2      mlcf  0.891700
12      3      mlcf  0.903100
13      4      mlcf  0.908500
14      5      mlcf  0.906100
Repeated Measures ANOVA Result:
   Source  ddof1  ddof2         F     p-unc       ng2       eps
0  prompt      2      8  8.810411  0.009506  0.624678  0.534023


  W = np.prod(eig) / (eig.sum() / d) ** d


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,t-test,short vs hier-gcn,0.8612,0.020058,0.858845,0.015575,0.149323,0.888525,0.888525,False
1,t-test,short vs mlcf,0.8612,0.020058,0.90126,0.006158,-4.462873,0.011136,0.022271,True
2,t-test,hier-gcn vs mlcf,0.858845,0.015575,0.90126,0.006158,-5.58651,0.005036,0.015108,True


In [20]:
args.task = 'acd'

computeLowResourceStatistics(args)

Unnamed: 0,1000,500,full
1,0.8798,0.887,0.8748
2,0.8423,0.8264,0.8663
3,0.8625,0.8558,0.8757
4,0.8952,0.8674,0.8943
5,0.8527,0.8694,0.8827


Unnamed: 0,W,pval,normal
1000,0.970018,0.87537,True


Unnamed: 0,W,pval,normal
500,0.943288,0.689265,True


Unnamed: 0,W,pval,normal
full,0.962545,0.825588,True


    split prompt      f1
0       1   1000  0.8798
1       2   1000  0.8423
2       3   1000  0.8625
3       4   1000  0.8952
4       5   1000  0.8527
5       1    500  0.8870
6       2    500  0.8264
7       3    500  0.8558
8       4    500  0.8674
9       5    500  0.8694
10      1   full  0.8748
11      2   full  0.8663
12      3   full  0.8757
13      4   full  0.8943
14      5   full  0.8827
Repeated Measures ANOVA Result:
   Source  ddof1  ddof2         F    p-unc       ng2       eps
0  prompt      2      8  2.644723  0.13132  0.160557  0.931991
Results for LR-Comparison of :  short


  W = np.prod(eig) / (eig.sum() / d) ** d


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,t-test,1000 vs 500,0.8665,0.018935,0.8612,0.020058,0.66752,0.540978,0.540978,False
1,t-test,1000 vs full,0.8665,0.018935,0.87876,0.009352,-1.803222,0.145691,0.336393,False
2,t-test,500 vs full,0.8612,0.020058,0.87876,0.009352,-2.03064,0.112131,0.336393,False


Unnamed: 0,1000,500,full
1,0.8698,0.8369,0.8801
2,0.8509,0.8215,0.8698
3,0.8555,0.8258,0.8674
4,0.8993,0.8406,0.8895
5,0.8469,0.837,0.8846


Unnamed: 0,W,pval,normal
1000,0.85488,0.210443,True


Unnamed: 0,W,pval,normal
500,0.874845,0.286601,True


Unnamed: 0,W,pval,normal
full,0.927728,0.580963,True


  W = np.prod(eig) / (eig.sum() / d) ** d


    split prompt      f1
0       1   1000  0.8698
1       2   1000  0.8509
2       3   1000  0.8555
3       4   1000  0.8993
4       5   1000  0.8469
5       1    500  0.8369
6       2    500  0.8215
7       3    500  0.8258
8       4    500  0.8406
9       5    500  0.8370
10      1   full  0.8801
11      2   full  0.8698
12      3   full  0.8674
13      4   full  0.8895
14      5   full  0.8846
Repeated Measures ANOVA Result:
   Source  ddof1  ddof2          F     p-unc       ng2       eps
0  prompt      2      8  27.441433  0.000262  0.694209  0.527279
Results for LR-Comparison of :  long


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,t-test,1000 vs 500,0.86448,0.019052,0.83236,0.007362,4.122545,0.014582,0.029164,True
1,t-test,1000 vs full,0.86448,0.019052,0.87828,0.008479,-1.804238,0.145519,0.145519,False
2,t-test,500 vs full,0.83236,0.007362,0.87828,0.008479,-31.162476,6e-06,1.9e-05,True


Unnamed: 0,1000,500,full
1,0.8798,0.887,0.8748
2,0.8423,0.8264,0.8663
3,0.8625,0.8558,0.8757
4,0.8952,0.8674,0.8943
5,0.8527,0.8694,0.8827


Unnamed: 0,W,pval,normal
1000,0.970018,0.87537,True


Unnamed: 0,W,pval,normal
500,0.943288,0.689265,True


Unnamed: 0,W,pval,normal
full,0.962545,0.825588,True


    split prompt      f1
0       1   1000  0.8798
1       2   1000  0.8423
2       3   1000  0.8625
3       4   1000  0.8952
4       5   1000  0.8527
5       1    500  0.8870
6       2    500  0.8264
7       3    500  0.8558
8       4    500  0.8674
9       5    500  0.8694
10      1   full  0.8748
11      2   full  0.8663
12      3   full  0.8757
13      4   full  0.8943
14      5   full  0.8827
Repeated Measures ANOVA Result:
   Source  ddof1  ddof2         F    p-unc       ng2       eps
0  prompt      2      8  2.644723  0.13132  0.160557  0.931991
Results for LR-Comparison of best Prompt per LR-Setting


  W = np.prod(eig) / (eig.sum() / d) ** d


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,t-test,1000 vs 500,0.8665,0.018935,0.8612,0.020058,0.66752,0.540978,0.540978,False
1,t-test,1000 vs full,0.8665,0.018935,0.87876,0.009352,-1.803222,0.145691,0.336393,False
2,t-test,500 vs full,0.8612,0.020058,0.87876,0.009352,-2.03064,0.112131,0.336393,False


## ACSA

In [21]:
runs = []
RESULTS_PATH = '../results_final/'
col_names = ['model_lang', 'dataset', 'model_shots', 'model_prompt', 'model_task', 'lr', 'lora_r', 'lora_alpha', 'lora_dropout', 'model_quant', 'split', 'lr_setting', 'model_name', 'lang', 'shots', 'prompt', 'task', 'quant', 'epoch', 'model_config', 'path', 'f1-micro', 'f1-macro', 'accuracy']
folder_names = [folder for folder in os.listdir(RESULTS_PATH) if os.path.isdir(os.path.join(RESULTS_PATH, folder)) and folder != '.ipynb_checkpoints']

for folder_name in folder_names:
    try:
        df = pd.read_csv(RESULTS_PATH + folder_name + '/metrics_asp_pol.tsv', sep = '\t')
        df = df.set_index(df.columns[0])
        cond_name = folder_name.split('/')[-1]
        cond_parameters = cond_name.split('_')

        model_config = cond_parameters.copy()
        
        # Remove split column from config string
        model_config_full = model_config.copy()
        model_config.pop(10)

        # Remove epoch column from config string
        model_config.pop(-1)
        
        cond_parameters.append('_'.join(model_config))
        cond_parameters.append('_'.join(model_config_full))
        cond_parameters.append(df.loc['Micro-AVG', 'f1'])
        cond_parameters.append(df.loc['Macro-AVG', 'f1'])
        cond_parameters.append(df.loc['Micro-AVG', 'accuracy'])
        runs.append(cond_parameters)
    except:
        pass

results_all = pd.DataFrame(runs, columns = col_names)


###
# Baselines
##

# Multi-label Classifiaction
METHOD = 'mlcf'
RESULTS_PATH = ''

col_names = ['task', 'dataset', 'lr-setting', 'split', 'learning-rate', 'batch_size', 'epochs', 'f1-micro', 'f1-macro', 'accuracy']
folder_names = [folder for folder in os.listdir(os.path.join(RESULTS_PATH, METHOD)) if os.path.isdir(os.path.join(RESULTS_PATH, METHOD, folder)) and folder != '.ipynb_checkpoints']
runs = []

for folder_name in folder_names:
    try:
        df = pd.read_csv(os.path.join(RESULTS_PATH, METHOD, folder_name, 'metrics_asp_pol.tsv'), sep = '\t')
        df = df.set_index(df.columns[0])
        cond_name = folder_name.split('/')[-1]
        cond_parameters = cond_name.split('_')
        
        cond_parameters.append(df.loc['Micro-AVG', 'f1'])
        cond_parameters.append(df.loc['Macro-AVG', 'f1'])
        cond_parameters.append(df.loc['Micro-AVG', 'accuracy'])
        cond_parameters[1:1] = [METHOD]
        
        runs.append(cond_parameters)
    except:
        pass

# Hier-GCN
METHOD = 'hier-gcn'
RESULTS_PATH = '../../../ABSA-Baselines/ACSA-HGCN-custom/output_ref'

col_names = ['task', 'method', 'dataset', 'lr-setting', 'split', 'learning-rate', 'batch_size', 'epochs', 'f1-micro', 'f1-macro', 'accuracy']


folder_names = [folder for folder in os.listdir(RESULTS_PATH) if os.path.isdir(RESULTS_PATH) and folder != '.ipynb_checkpoints']

for folder_name in folder_names:
    # try:
    metrics_dict = {}

    with open(os.path.join(RESULTS_PATH, folder_name, 'eval_results.txt'), 'r') as file:
        for line in file:
            # Strip any leading/trailing whitespace and split the line by '='
            key, value = line.strip().split('=')
            # Convert the value to a float and store it in the dictionary
            metrics_dict[key.strip()] = float(value.strip())
            
    cond_name = folder_name
    cond_parameters = cond_name.split('_')
    cond_parameters.append(metrics_dict['micro-f1'])
    cond_parameters.extend([None,None])
    cond_parameters[0] = 'acsa'
    cond_parameters[1:1] = [METHOD]
    
    runs.append(cond_parameters)

results_baseline = pd.DataFrame(runs, columns = col_names)

args.results = results_all
args.results_baseline = results_baseline

### Full Dataset

In [22]:
args.lr_setting = 0
args.task = 'acsa'

stats_acsa['0'] = computePromptStatistics(args)
stats_acsa['0']

Unnamed: 0,task,method,dataset,lr-setting,split,learning-rate,batch_size,epochs,f1-micro,f1-macro,accuracy
16,acsa,mlcf,GERestaurant,0,5,2e-05,16,3.0,0.8543,0.7975,0.7457
158,acsa,hier-gcn,GERestaurant,0,4,5e-05,8,20.0,0.833333,,
150,acsa,hier-gcn,GERestaurant,0,5,5e-05,8,20.0,0.832184,,
71,acsa,mlcf,GERestaurant,0,4,2e-05,16,3.0,0.8296,0.8076,0.7089
12,acsa,mlcf,GERestaurant,0,3,2e-05,16,3.0,0.8261,0.7755,0.7038
46,acsa,mlcf,GERestaurant,0,1,2e-05,16,3.0,0.8261,0.7812,0.7038
162,acsa,hier-gcn,GERestaurant,0,2,5e-05,8,20.0,0.824859,,
139,acsa,mlcf,GERestaurant,0,2,2e-05,16,3.0,0.8225,0.7574,0.6985
173,acsa,hier-gcn,GERestaurant,0,3,5e-05,8,20.0,0.820455,,
160,acsa,hier-gcn,GERestaurant,0,1,5e-05,8,20.0,0.813636,,


Unnamed: 0,short,long,cot,hier-gcn,mlcf
1,0.8348,0.8439,0.8532,0.813636,0.8261
2,0.8256,0.7776,0.8123,0.824859,0.8225
3,0.8226,0.8356,0.8187,0.820455,0.8261
4,0.8659,0.865,0.8234,0.833333,0.8296
5,0.8331,0.8387,0.8194,0.832184,0.8543


Unnamed: 0,W,pval,normal
short,0.807002,0.092301,True


Unnamed: 0,W,pval,normal
hier-gcn,0.936164,0.638962,True


Unnamed: 0,W,pval,normal
mlcf,0.729227,0.018894,False


    split    prompt        f1
0       1     short  0.834800
1       2     short  0.825600
2       3     short  0.822600
3       4     short  0.865900
4       5     short  0.833100
5       1  hier-gcn  0.813636
6       2  hier-gcn  0.824859
7       3  hier-gcn  0.820455
8       4  hier-gcn  0.833333
9       5  hier-gcn  0.832184
10      1      mlcf  0.826100
11      2      mlcf  0.822500
12      3      mlcf  0.826100
13      4      mlcf  0.829600
14      5      mlcf  0.854300
Friedman Test Result:
          Source     W  ddof1    Q     p-unc
Friedman  Within  0.36      2  3.6  0.165299


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,wilcoxon,short vs hier-gcn,0.8364,0.015432,0.824893,0.007359,0.0,0.0625,0.1875,False
1,wilcoxon,short vs mlcf,0.8364,0.015432,0.83172,0.011511,6.0,0.8125,0.8125,False
2,wilcoxon,hier-gcn vs mlcf,0.824893,0.007359,0.83172,0.011511,3.0,0.3125,0.625,False


### 1000

In [23]:
args.lr_setting = 1000
args.task = 'acsa'

stats_acsa['1000'] = computePromptStatistics(args)
stats_acsa['1000']

Unnamed: 0,task,method,dataset,lr-setting,split,learning-rate,batch_size,epochs,f1-micro,f1-macro,accuracy
59,acsa,mlcf,GERestaurant,1000,5,2e-05,16,6.0,0.868,0.8081,0.7667
122,acsa,mlcf,GERestaurant,1000,2,2e-05,16,6.0,0.8499,0.8077,0.7389
135,acsa,mlcf,GERestaurant,1000,4,2e-05,16,6.0,0.8444,0.8175,0.7308
30,acsa,mlcf,GERestaurant,1000,3,2e-05,16,6.0,0.8352,0.8014,0.717
34,acsa,mlcf,GERestaurant,1000,1,2e-05,16,6.0,0.8206,0.7892,0.6958
153,acsa,hier-gcn,GERestaurant,1000,3,5e-05,8,43.0,0.814302,,
159,acsa,hier-gcn,GERestaurant,1000,2,5e-05,8,43.0,0.799092,,
188,acsa,hier-gcn,GERestaurant,1000,5,5e-05,8,43.0,0.798155,,
155,acsa,hier-gcn,GERestaurant,1000,4,5e-05,8,43.0,0.776471,,
146,acsa,hier-gcn,GERestaurant,1000,1,5e-05,8,43.0,0.774341,,


Unnamed: 0,short,long,cot,hier-gcn,mlcf
1,0.8314,0.7744,0.773,0.774341,0.8206
2,0.7488,0.7457,0.802,0.799092,0.8499
3,0.8365,0.7368,0.802,0.814302,0.8352
4,0.8479,0.7973,0.8299,0.776471,0.8444
5,0.7157,0.7579,0.8216,0.798155,0.868


Unnamed: 0,W,pval,normal
cot,0.935542,0.634617,True


Unnamed: 0,W,pval,normal
hier-gcn,0.901363,0.417463,True


Unnamed: 0,W,pval,normal
mlcf,0.994647,0.993171,True


    split    prompt        f1
0       1       cot  0.773000
1       2       cot  0.802000
2       3       cot  0.802000
3       4       cot  0.829900
4       5       cot  0.821600
5       1  hier-gcn  0.774341
6       2  hier-gcn  0.799092
7       3  hier-gcn  0.814302
8       4  hier-gcn  0.776471
9       5  hier-gcn  0.798155
10      1      mlcf  0.820600
11      2      mlcf  0.849900
12      3      mlcf  0.835200
13      4      mlcf  0.844400
14      5      mlcf  0.868000
Repeated Measures ANOVA Result:
   Source  ddof1  ddof2          F     p-unc       ng2       eps
0  prompt      2      8  16.592801  0.001424  0.620663  0.712711


  W = np.prod(eig) / (eig.sum() / d) ** d


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,t-test,cot vs hier-gcn,0.8057,0.019673,0.792472,0.015083,1.140354,0.317783,0.317783,False
1,t-test,cot vs mlcf,0.8057,0.019673,0.84362,0.015718,-5.865206,0.004219,0.012657,True
2,t-test,hier-gcn vs mlcf,0.792472,0.015083,0.84362,0.015718,-5.772321,0.004472,0.012657,True


### 500

In [24]:
args.lr_setting = 500
args.task = 'acsa'

stats_acsa['500'] = computePromptStatistics(args)
stats_acsa['500']

Unnamed: 0,task,method,dataset,lr-setting,split,learning-rate,batch_size,epochs,f1-micro,f1-macro,accuracy
5,acsa,mlcf,GERestaurant,500,5,2e-05,16,13.0,0.8426,0.7523,0.728
67,acsa,mlcf,GERestaurant,500,3,2e-05,16,13.0,0.8154,0.767,0.6883
0,acsa,mlcf,GERestaurant,500,4,2e-05,16,13.0,0.8148,0.779,0.6875
10,acsa,mlcf,GERestaurant,500,2,2e-05,16,13.0,0.8075,0.757,0.6771
43,acsa,mlcf,GERestaurant,500,1,2e-05,16,13.0,0.8044,0.7525,0.6728
167,acsa,hier-gcn,GERestaurant,500,3,5e-05,8,86.0,0.8,,
157,acsa,hier-gcn,GERestaurant,500,5,5e-05,8,86.0,0.775176,,
156,acsa,hier-gcn,GERestaurant,500,4,5e-05,8,86.0,0.761104,,
182,acsa,hier-gcn,GERestaurant,500,4,5e-05,8,20.0,0.757647,,
171,acsa,hier-gcn,GERestaurant,500,2,5e-05,8,86.0,0.756014,,


Unnamed: 0,short,long,cot,hier-gcn,mlcf
1,0.7748,0.8249,0.8168,0.738636,0.8044
2,0.7871,0.7988,0.7859,0.756014,0.8075
3,0.7951,0.7935,0.827,0.8,0.8154
4,0.8316,0.8387,0.8436,0.761104,0.8148
5,0.7568,0.7496,0.8182,0.775176,0.8426


Unnamed: 0,W,pval,normal
cot,0.942179,0.681379,True


Unnamed: 0,W,pval,normal
hier-gcn,0.976119,0.912869,True


Unnamed: 0,W,pval,normal
mlcf,0.817185,0.111049,True


    split    prompt        f1
0       1       cot  0.816800
1       2       cot  0.785900
2       3       cot  0.827000
3       4       cot  0.843600
4       5       cot  0.818200
5       1  hier-gcn  0.738636
6       2  hier-gcn  0.756014
7       3  hier-gcn  0.800000
8       4  hier-gcn  0.761104
9       5  hier-gcn  0.775176
10      1      mlcf  0.804400
11      2      mlcf  0.807500
12      3      mlcf  0.815400
13      4      mlcf  0.814800
14      5      mlcf  0.842600
Repeated Measures ANOVA Result:
   Source  ddof1  ddof2          F     p-unc       ng2       eps
0  prompt      2      8  15.704178  0.001698  0.648029  0.931542


  W = np.prod(eig) / (eig.sum() / d) ** d


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,t-test,cot vs hier-gcn,0.8183,0.018803,0.766186,0.020555,4.397349,0.011715,0.023431,True
1,t-test,cot vs mlcf,0.8183,0.018803,0.81694,0.013504,0.130551,0.902433,0.902433,False
2,t-test,hier-gcn vs mlcf,0.766186,0.020555,0.81694,0.013504,-5.406883,0.005666,0.016999,True


In [25]:
args.task = 'acsa'

computeLowResourceStatistics(args)

Unnamed: 0,1000,500,full
1,0.8314,0.7748,0.8348
2,0.7488,0.7871,0.8256
3,0.8365,0.7951,0.8226
4,0.8479,0.8316,0.8659
5,0.7157,0.7568,0.8331


Unnamed: 0,W,pval,normal
1000,0.833497,0.147732,True


Unnamed: 0,W,pval,normal
500,0.964037,0.835776,True


Unnamed: 0,W,pval,normal
full,0.807002,0.092301,True


    split prompt      f1
0       1   1000  0.8314
1       2   1000  0.7488
2       3   1000  0.8365
3       4   1000  0.8479
4       5   1000  0.7157
5       1    500  0.7748
6       2    500  0.7871
7       3    500  0.7951
8       4    500  0.8316
9       5    500  0.7568
10      1   full  0.8348
11      2   full  0.8256
12      3   full  0.8226
13      4   full  0.8659
14      5   full  0.8331
Repeated Measures ANOVA Result:
   Source  ddof1  ddof2         F     p-unc       ng2       eps
0  prompt      2      8  3.590351  0.077125  0.260254  0.588467
Results for LR-Comparison of :  short


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,t-test,1000 vs 500,0.79606,0.05341,0.78908,0.024876,0.346931,0.746127,0.746127,False
1,t-test,1000 vs full,0.79606,0.05341,0.8364,0.015432,-1.641874,0.175961,0.351921,False
2,t-test,500 vs full,0.78908,0.024876,0.8364,0.015432,-5.224397,0.006408,0.019225,True


Unnamed: 0,1000,500,full
1,0.7744,0.8249,0.8439
2,0.7457,0.7988,0.7776
3,0.7368,0.7935,0.8356
4,0.7973,0.8387,0.865
5,0.7579,0.7496,0.8387


Unnamed: 0,W,pval,normal
1000,0.959187,0.802308,True


Unnamed: 0,W,pval,normal
500,0.950268,0.7391,True


Unnamed: 0,W,pval,normal
full,0.850142,0.194967,True


    split prompt      f1
0       1   1000  0.7744
1       2   1000  0.7457
2       3   1000  0.7368
3       4   1000  0.7973
4       5   1000  0.7579
5       1    500  0.8249
6       2    500  0.7988
7       3    500  0.7935
8       4    500  0.8387
9       5    500  0.7496
10      1   full  0.8439
11      2   full  0.7776
12      3   full  0.8356
13      4   full  0.8650
14      5   full  0.8387
Repeated Measures ANOVA Result:
   Source  ddof1  ddof2          F     p-unc       ng2       eps
0  prompt      2      8  12.542534  0.003418  0.520195  0.706028
Results for LR-Comparison of :  long


  W = np.prod(eig) / (eig.sum() / d) ** d


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,t-test,1000 vs 500,0.76242,0.021524,0.8011,0.030645,-3.219486,0.032296,0.064593,False
1,t-test,1000 vs full,0.76242,0.021524,0.83216,0.029147,-6.36253,0.003128,0.009385,True
2,t-test,500 vs full,0.8011,0.030645,0.83216,0.029147,-1.737701,0.15726,0.15726,False


Unnamed: 0,1000,500,full
1,0.773,0.8168,0.8532
2,0.802,0.7859,0.8123
3,0.802,0.827,0.8187
4,0.8299,0.8436,0.8234
5,0.8216,0.8182,0.8194


Unnamed: 0,W,pval,normal
1000,0.935542,0.634617,True


Unnamed: 0,W,pval,normal
500,0.942179,0.681379,True


Unnamed: 0,W,pval,normal
full,0.779059,0.054104,True


    split prompt      f1
0       1   1000  0.7730
1       2   1000  0.8020
2       3   1000  0.8020
3       4   1000  0.8299
4       5   1000  0.8216
5       1    500  0.8168
6       2    500  0.7859
7       3    500  0.8270
8       4    500  0.8436
9       5    500  0.8182
10      1   full  0.8532
11      2   full  0.8123
12      3   full  0.8187
13      4   full  0.8234
14      5   full  0.8194
Repeated Measures ANOVA Result:
   Source  ddof1  ddof2         F     p-unc       ng2      eps
0  prompt      2      8  1.273475  0.331018  0.173793  0.75167
Results for LR-Comparison of :  cot


  W = np.prod(eig) / (eig.sum() / d) ** d


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,t-test,1000 vs 500,0.8057,0.019673,0.8183,0.018803,-1.199157,0.296645,0.832851,False
1,t-test,1000 vs full,0.8057,0.019673,0.8254,0.014348,-1.255559,0.277617,0.832851,False
2,t-test,500 vs full,0.8183,0.018803,0.8254,0.014348,-0.669656,0.539749,0.832851,False


Unnamed: 0,1000,500,full
1,0.773,0.8168,0.8348
2,0.802,0.7859,0.8256
3,0.802,0.827,0.8226
4,0.8299,0.8436,0.8659
5,0.8216,0.8182,0.8331


Unnamed: 0,W,pval,normal
1000,0.935542,0.634617,True


Unnamed: 0,W,pval,normal
500,0.942179,0.681379,True


Unnamed: 0,W,pval,normal
full,0.807002,0.092301,True


    split prompt      f1
0       1   1000  0.7730
1       2   1000  0.8020
2       3   1000  0.8020
3       4   1000  0.8299
4       5   1000  0.8216
5       1    500  0.8168
6       2    500  0.7859
7       3    500  0.8270
8       4    500  0.8436
9       5    500  0.8182
10      1   full  0.8348
11      2   full  0.8256
12      3   full  0.8226
13      4   full  0.8659
14      5   full  0.8331
Repeated Measures ANOVA Result:
   Source  ddof1  ddof2        F     p-unc       ng2       eps
0  prompt      2      8  6.04788  0.025116  0.327339  0.835234
Results for LR-Comparison of best Prompt per LR-Setting


  W = np.prod(eig) / (eig.sum() / d) ** d


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,t-test,1000 vs 500,0.8057,0.019673,0.8183,0.018803,-1.199157,0.296645,0.296645,False
1,t-test,1000 vs full,0.8057,0.019673,0.8364,0.015432,-3.525756,0.024323,0.07297,False
2,t-test,500 vs full,0.8183,0.018803,0.8364,0.015432,-2.558831,0.062716,0.125431,False


## E2E

In [26]:
runs = []
RESULTS_PATH = '../results_final/'
col_names = ['model_lang', 'dataset', 'model_shots', 'model_prompt', 'model_task', 'lr', 'lora_r', 'lora_alpha', 'lora_dropout', 'model_quant', 'split', 'lr_setting', 'model_name', 'lang', 'shots', 'prompt', 'task', 'quant', 'epoch', 'model_config', 'path', 'f1-micro', 'f1-macro', 'accuracy']
folder_names = [folder for folder in os.listdir(RESULTS_PATH) if os.path.isdir(os.path.join(RESULTS_PATH, folder)) and folder != '.ipynb_checkpoints']

for folder_name in folder_names:
    try:
        df = pd.read_csv(RESULTS_PATH + folder_name + '/metrics_pol.tsv', sep = '\t')
        df = df.set_index(df.columns[0])
        cond_name = folder_name.split('/')[-1]
        cond_parameters = cond_name.split('_')

        model_config = cond_parameters.copy()
        
        # Remove split column from config string
        model_config_full = model_config.copy()
        model_config.pop(10)

        # Remove epoch column from config string
        model_config.pop(-1)
        
        cond_parameters.append('_'.join(model_config))
        cond_parameters.append('_'.join(model_config_full))
        cond_parameters.append(df.loc['Micro-AVG', 'f1'])
        cond_parameters.append(df.loc['Macro-AVG', 'f1'])
        cond_parameters.append(df.loc['Micro-AVG', 'accuracy'])
        runs.append(cond_parameters)
    except:
        pass

results_all = pd.DataFrame(runs, columns = col_names)


###
# Baselines
##

# InstructABSA
METHOD = 'instructAbsa'
RESULTS_PATH = '../../../ABSA-Baselines/InstructABSA-Custom/Output_ref'

col_names = ['task', 'method', 'dataset', 'lr-setting', 'split', 'learning-rate', 'batch_size', 'epochs', 'f1-micro', 'f1-macro', 'accuracy']
runs = []

file_names = [file for file in os.listdir(RESULTS_PATH) if len(file.split('.tsv')) > 1 and file != '.ipynb_checkpoints']

for file_name in file_names:
    # try:
    metrics_dict = {}

    with open(os.path.join(RESULTS_PATH, file_name), 'r') as file:
        for line in file:
            # Strip any leading/trailing whitespace and split the line by '='
            key, value = line.strip().split('\t')
            # Convert the value to a float and store it in the dictionary
            metrics_dict[key.strip()] = float(value.strip())
            
    cond_name = file_name.split('.tsv')[0]
    cond_parameters = cond_name.split('_')
    
    cond_parameters.append(metrics_dict['F1-Score'])
    cond_parameters.extend([None,None])
    cond_parameters.insert(0, 'e2e')   # Task
    cond_parameters.insert(1, METHOD)  # Method
    cond_parameters.insert(6, 8)       # Batch Size

    if cond_parameters[3] == 'full':
        cond_parameters[3] = '0'
    
    runs.append(cond_parameters)


# TAS-BERT

METHOD = 'tas-bert'
RESULTS_PATH = '../../../ABSA-Baselines/TAS-BERT-Custom/results/GERestaurant/three_joint/BIO'

col_names = ['task', 'method', 'dataset', 'lr-setting', 'split', 'learning-rate', 'batch_size', 'epochs', 'f1-micro', 'f1-macro', 'accuracy']

folder_names = [file for file in os.listdir(RESULTS_PATH) if file != '.ipynb_checkpoints']

for folder_name in folder_names:
    # try:
    metrics_dict = {}

    with open(os.path.join(RESULTS_PATH, folder_name, 'results.txt'), 'r') as file:
        lines = file.readlines()

        epoch, p, r, f1 = lines[-1].strip().split('\t')
            
    cond_parameters = folder_name.split('_')
    
    cond_parameters.append(float(f1))
    cond_parameters.extend([None,None])
    cond_parameters.insert(0, 'e2e')   # Task
    cond_parameters.insert(1, METHOD)  # Method

    if cond_parameters[3] == 'full':
        cond_parameters[3] = '0'

    runs.append(cond_parameters)


results_baseline = pd.DataFrame(runs, columns = col_names)

args.results = results_all
args.results_baseline = results_baseline

### Full Dataset

In [27]:
args.lr_setting = 0
args.task = 'e2e'

stats_e2e['0'] = computePromptStatistics(args)
stats_e2e['0']

Unnamed: 0,task,method,dataset,lr-setting,split,learning-rate,batch_size,epochs,f1-micro,f1-macro,accuracy
34,e2e,instructAbsa,GERestaurant,0,5,5e-05,8,4.0,0.73706,,
42,e2e,tas-bert,GERestaurant,0,3,2e-05,24,30.0,0.7271,,
45,e2e,tas-bert,GERestaurant,0,5,2e-05,24,30.0,0.7253,,
15,e2e,instructAbsa,GERestaurant,0,4,5e-05,8,4.0,0.721713,,
41,e2e,tas-bert,GERestaurant,0,4,2e-05,24,30.0,0.7163,,
35,e2e,instructAbsa,GERestaurant,0,2,5e-05,8,4.0,0.713026,,
12,e2e,instructAbsa,GERestaurant,0,1,5e-05,8,4.0,0.707113,,
38,e2e,instructAbsa,GERestaurant,0,3,5e-05,8,4.0,0.695992,,
54,e2e,tas-bert,GERestaurant,0,2,2e-05,24,30.0,0.6944,,
51,e2e,tas-bert,GERestaurant,0,1,2e-05,24,30.0,0.6896,,


Unnamed: 0,short,long,cot,instructAbsa,tas-bert
1,0.7923,0.7876,0.7668,0.707113,0.6896
2,0.7865,0.75,0.7689,0.713026,0.6944
3,0.8,0.7861,0.765,0.695992,0.7271
4,0.8335,0.8187,0.7888,0.721713,0.7163
5,0.8178,0.8004,0.7719,0.73706,0.7253


Unnamed: 0,W,pval,normal
short,0.9325,0.613512,True


Unnamed: 0,W,pval,normal
instructAbsa,0.99032,0.980802,True


Unnamed: 0,W,pval,normal
tas-bert,0.856518,0.216014,True


    split        prompt        f1
0       1         short  0.792300
1       2         short  0.786500
2       3         short  0.800000
3       4         short  0.833500
4       5         short  0.817800
5       1  instructAbsa  0.707113
6       2  instructAbsa  0.713026
7       3  instructAbsa  0.695992
8       4  instructAbsa  0.721713
9       5  instructAbsa  0.737060
10      1      tas-bert  0.689600
11      2      tas-bert  0.694400
12      3      tas-bert  0.727100
13      4      tas-bert  0.716300
14      5      tas-bert  0.725300
Repeated Measures ANOVA Result:
   Source  ddof1  ddof2         F     p-unc       ng2       eps
0  prompt      2      8  91.90187  0.000003  0.887457  0.897836


  W = np.prod(eig) / (eig.sum() / d) ** d


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,t-test,short vs instructAbsa,0.80602,0.017317,0.714981,0.01385,12.579285,0.00023,0.000579,True
1,t-test,short vs tas-bert,0.80602,0.017317,0.71054,0.015648,13.152164,0.000193,0.000579,True
2,t-test,instructAbsa vs tas-bert,0.714981,0.01385,0.71054,0.015648,0.483051,0.654308,0.654308,False


### 1000

In [28]:
args.lr_setting = 1000
args.task = 'e2e'

stats_e2e['1000'] = computePromptStatistics(args)
stats_e2e['1000']

Unnamed: 0,task,method,dataset,lr-setting,split,learning-rate,batch_size,epochs,f1-micro,f1-macro,accuracy
23,e2e,instructAbsa,GERestaurant,1000,4,5e-05,8,9.0,0.713258,,
25,e2e,instructAbsa,GERestaurant,1000,5,5e-05,8,9.0,0.707946,,
10,e2e,instructAbsa,GERestaurant,1000,3,5e-05,8,9.0,0.700389,,
37,e2e,instructAbsa,GERestaurant,1000,1,5e-05,8,9.0,0.688935,,
28,e2e,instructAbsa,GERestaurant,1000,2,5e-05,8,9.0,0.687685,,
50,e2e,tas-bert,GERestaurant,1000,4,2e-05,24,13.0,0.6725,,
44,e2e,tas-bert,GERestaurant,1000,2,2e-05,24,13.0,0.6716,,
46,e2e,tas-bert,GERestaurant,1000,3,2e-05,24,13.0,0.6708,,
49,e2e,tas-bert,GERestaurant,1000,5,2e-05,24,13.0,0.6624,,
52,e2e,tas-bert,GERestaurant,1000,1,2e-05,24,13.0,0.6555,,


Unnamed: 0,short,long,cot,instructAbsa,tas-bert
1,0.7992,0.7785,0.7288,0.688935,0.6555
2,0.7455,0.7703,0.7223,0.687685,0.6716
3,0.7953,0.7758,0.7618,0.700389,0.6708
4,0.8136,0.8069,0.7614,0.713258,0.6725
5,0.7681,0.7893,0.7713,0.707946,0.6624


Unnamed: 0,W,pval,normal
short,0.937736,0.649977,True


Unnamed: 0,W,pval,normal
instructAbsa,0.909094,0.462164,True


Unnamed: 0,W,pval,normal
tas-bert,0.83729,0.157568,True


    split        prompt        f1
0       1         short  0.799200
1       2         short  0.745500
2       3         short  0.795300
3       4         short  0.813600
4       5         short  0.768100
5       1  instructAbsa  0.688935
6       2  instructAbsa  0.687685
7       3  instructAbsa  0.700389
8       4  instructAbsa  0.713258
9       5  instructAbsa  0.707946
10      1      tas-bert  0.655500
11      2      tas-bert  0.671600
12      3      tas-bert  0.670800
13      4      tas-bert  0.672500
14      5      tas-bert  0.662400
Repeated Measures ANOVA Result:
   Source  ddof1  ddof2          F     p-unc       ng2       eps
0  prompt      2      8  71.703176  0.000008  0.908864  0.609305


  W = np.prod(eig) / (eig.sum() / d) ** d


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,t-test,short vs instructAbsa,0.78434,0.02437,0.699643,0.010124,7.850759,0.001422,0.002844,True
1,t-test,short vs tas-bert,0.78434,0.02437,0.66656,0.006607,9.12474,0.0008,0.002401,True
2,t-test,instructAbsa vs tas-bert,0.699643,0.010124,0.66656,0.006607,6.516226,0.002863,0.002863,True


### 500

In [29]:
args.lr_setting = 500
args.task = 'e2e'

stats_e2e['500'] = computePromptStatistics(args)
stats_e2e['500']

Unnamed: 0,task,method,dataset,lr-setting,split,learning-rate,batch_size,epochs,f1-micro,f1-macro,accuracy
13,e2e,instructAbsa,GERestaurant,500,3,5e-05,8,17.0,0.689792,,
8,e2e,instructAbsa,GERestaurant,500,4,5e-05,8,17.0,0.683417,,
26,e2e,instructAbsa,GERestaurant,500,1,5e-05,8,17.0,0.681818,,
30,e2e,instructAbsa,GERestaurant,500,5,5e-05,8,17.0,0.672165,,
6,e2e,instructAbsa,GERestaurant,500,2,5e-05,8,17.0,0.669941,,
40,e2e,tas-bert,GERestaurant,500,3,2e-05,24,24.0,0.6164,,
53,e2e,tas-bert,GERestaurant,500,4,2e-05,24,24.0,0.6131,,
57,e2e,tas-bert,GERestaurant,500,5,2e-05,24,24.0,0.6087,,
47,e2e,tas-bert,GERestaurant,500,1,2e-05,24,24.0,0.6072,,
55,e2e,tas-bert,GERestaurant,500,2,2e-05,24,24.0,0.5978,,


Unnamed: 0,short,long,cot,instructAbsa,tas-bert
1,0.7458,0.7271,0.6693,0.681818,0.6072
2,0.7606,0.7129,0.7179,0.669941,0.5978
3,0.6998,0.6775,0.6729,0.689792,0.6164
4,0.7563,0.7069,0.7082,0.683417,0.6131
5,0.7301,0.7094,0.7154,0.672165,0.6087


Unnamed: 0,W,pval,normal
short,0.899971,0.409725,True


Unnamed: 0,W,pval,normal
instructAbsa,0.931274,0.605081,True


Unnamed: 0,W,pval,normal
tas-bert,0.951778,0.749884,True


    split        prompt        f1
0       1         short  0.745800
1       2         short  0.760600
2       3         short  0.699800
3       4         short  0.756300
4       5         short  0.730100
5       1  instructAbsa  0.681818
6       2  instructAbsa  0.669941
7       3  instructAbsa  0.689792
8       4  instructAbsa  0.683417
9       5  instructAbsa  0.672165
10      1      tas-bert  0.607200
11      2      tas-bert  0.597800
12      3      tas-bert  0.616400
13      4      tas-bert  0.613100
14      5      tas-bert  0.608700
Repeated Measures ANOVA Result:
   Source  ddof1  ddof2          F     p-unc       ng2       eps
0  prompt      2      8  69.819606  0.000009  0.935857  0.516132


  W = np.prod(eig) / (eig.sum() / d) ** d


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,t-test,short vs instructAbsa,0.73852,0.022031,0.679427,0.007373,4.391686,0.011767,0.011767,True
1,t-test,short vs tas-bert,0.73852,0.022031,0.60864,0.00632,9.721971,0.000627,0.001254,True
2,t-test,instructAbsa vs tas-bert,0.679427,0.007373,0.60864,0.00632,36.045778,4e-06,1.1e-05,True


In [30]:
args.task = 'e2e'

computeLowResourceStatistics(args)

Unnamed: 0,1000,500,full
1,0.7992,0.7458,0.7923
2,0.7455,0.7606,0.7865
3,0.7953,0.6998,0.8
4,0.8136,0.7563,0.8335
5,0.7681,0.7301,0.8178


Unnamed: 0,W,pval,normal
1000,0.937736,0.649977,True


Unnamed: 0,W,pval,normal
500,0.899971,0.409725,True


Unnamed: 0,W,pval,normal
full,0.9325,0.613512,True


    split prompt      f1
0       1   1000  0.7992
1       2   1000  0.7455
2       3   1000  0.7953
3       4   1000  0.8136
4       5   1000  0.7681
5       1    500  0.7458
6       2    500  0.7606
7       3    500  0.6998
8       4    500  0.7563
9       5    500  0.7301
10      1   full  0.7923
11      2   full  0.7865
12      3   full  0.8000
13      4   full  0.8335
14      5   full  0.8178
Repeated Measures ANOVA Result:
   Source  ddof1  ddof2          F     p-unc       ng2       eps
0  prompt      2      8  11.456719  0.004485  0.632663  0.743232


  W = np.prod(eig) / (eig.sum() / d) ** d


Results for LR-Comparison of :  short


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,t-test,1000 vs 500,0.78434,0.02437,0.73852,0.022031,2.555484,0.062938,0.125877,False
1,t-test,1000 vs full,0.78434,0.02437,0.80602,0.017317,-2.036472,0.11139,0.125877,False
2,t-test,500 vs full,0.73852,0.022031,0.80602,0.017317,-4.934124,0.00785,0.023549,True


Unnamed: 0,1000,500,full
1,0.7785,0.7271,0.7876
2,0.7703,0.7129,0.75
3,0.7758,0.6775,0.7861
4,0.8069,0.7069,0.8187
5,0.7893,0.7094,0.8004


Unnamed: 0,W,pval,normal
1000,0.908464,0.458414,True


Unnamed: 0,W,pval,normal
500,0.900947,0.415142,True


Unnamed: 0,W,pval,normal
full,0.948692,0.727844,True


    split prompt      f1
0       1   1000  0.7785
1       2   1000  0.7703
2       3   1000  0.7758
3       4   1000  0.8069
4       5   1000  0.7893
5       1    500  0.7271
6       2    500  0.7129
7       3    500  0.6775
8       4    500  0.7069
9       5    500  0.7094
10      1   full  0.7876
11      2   full  0.7500
12      3   full  0.7861
13      4   full  0.8187
14      5   full  0.8004
Repeated Measures ANOVA Result:
   Source  ddof1  ddof2          F     p-unc       ng2       eps
0  prompt      2      8  36.544787  0.000095  0.818487  0.578683
Results for LR-Comparison of :  long


  W = np.prod(eig) / (eig.sum() / d) ** d


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,t-test,1000 vs 500,0.78416,0.012943,0.70676,0.016215,7.682936,0.001544,0.004631,True
1,t-test,1000 vs full,0.78416,0.012943,0.78856,0.02255,-0.710676,0.516527,0.516527,False
2,t-test,500 vs full,0.70676,0.016215,0.78856,0.02255,-5.675453,0.004756,0.009511,True


Unnamed: 0,1000,500,full
1,0.7288,0.6693,0.7668
2,0.7223,0.7179,0.7689
3,0.7618,0.6729,0.765
4,0.7614,0.7082,0.7888
5,0.7713,0.7154,0.7719


Unnamed: 0,W,pval,normal
1000,0.848695,0.190429,True


Unnamed: 0,W,pval,normal
500,0.80553,0.089826,True


Unnamed: 0,W,pval,normal
full,0.79095,0.068237,True


    split prompt      f1
0       1   1000  0.7288
1       2   1000  0.7223
2       3   1000  0.7618
3       4   1000  0.7614
4       5   1000  0.7713
5       1    500  0.6693
6       2    500  0.7179
7       3    500  0.6729
8       4    500  0.7082
9       5    500  0.7154
10      1   full  0.7668
11      2   full  0.7689
12      3   full  0.7650
13      4   full  0.7888
14      5   full  0.7719


  W = np.prod(eig) / (eig.sum() / d) ** d


Repeated Measures ANOVA Result:
   Source  ddof1  ddof2         F     p-unc      ng2       eps
0  prompt      2      8  25.16706  0.000354  0.76692  0.764978
Results for LR-Comparison of :  cot


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,t-test,1000 vs 500,0.74912,0.019676,0.69674,0.021206,3.851289,0.018282,0.036565,True
1,t-test,1000 vs full,0.74912,0.019676,0.77228,0.008573,-2.515763,0.065653,0.065653,False
2,t-test,500 vs full,0.69674,0.021206,0.77228,0.008573,-8.082931,0.001273,0.003819,True


Unnamed: 0,1000,500,full
1,0.7992,0.7458,0.7923
2,0.7455,0.7606,0.7865
3,0.7953,0.6998,0.8
4,0.8136,0.7563,0.8335
5,0.7681,0.7301,0.8178


Unnamed: 0,W,pval,normal
1000,0.937736,0.649977,True


Unnamed: 0,W,pval,normal
500,0.899971,0.409725,True


Unnamed: 0,W,pval,normal
full,0.9325,0.613512,True


    split prompt      f1
0       1   1000  0.7992
1       2   1000  0.7455
2       3   1000  0.7953
3       4   1000  0.8136
4       5   1000  0.7681
5       1    500  0.7458
6       2    500  0.7606
7       3    500  0.6998
8       4    500  0.7563
9       5    500  0.7301
10      1   full  0.7923
11      2   full  0.7865
12      3   full  0.8000
13      4   full  0.8335
14      5   full  0.8178
Repeated Measures ANOVA Result:
   Source  ddof1  ddof2          F     p-unc       ng2       eps
0  prompt      2      8  11.456719  0.004485  0.632663  0.743232
Results for LR-Comparison of best Prompt per LR-Setting


  W = np.prod(eig) / (eig.sum() / d) ** d


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,t-test,1000 vs 500,0.78434,0.02437,0.73852,0.022031,2.555484,0.062938,0.125877,False
1,t-test,1000 vs full,0.78434,0.02437,0.80602,0.017317,-2.036472,0.11139,0.125877,False
2,t-test,500 vs full,0.73852,0.022031,0.80602,0.017317,-4.934124,0.00785,0.023549,True


## E2E - without Implicit

In [31]:
runs = []
RESULTS_PATH = '../results_final/filtered/'
col_names = ['model_lang', 'dataset', 'model_shots', 'model_prompt', 'model_task', 'lr', 'lora_r', 'lora_alpha', 'lora_dropout', 'model_quant', 'split', 'lr_setting', 'model_name', 'lang', 'shots', 'prompt', 'task', 'quant', 'epoch', 'model_config', 'path', 'f1-micro', 'f1-macro', 'accuracy']
folder_names = [folder for folder in os.listdir(RESULTS_PATH) if os.path.isdir(os.path.join(RESULTS_PATH, folder)) and folder != '.ipynb_checkpoints']

for folder_name in folder_names:
    try:
        df = pd.read_csv(RESULTS_PATH + folder_name + '/metrics_pol.tsv', sep = '\t')
        df = df.set_index(df.columns[0])
        cond_name = folder_name.split('/')[-1]
        cond_parameters = cond_name.split('_')

        model_config = cond_parameters.copy()
        
        # Remove split column from config string
        model_config_full = model_config.copy()
        model_config.pop(10)

        # Remove epoch column from config string
        model_config.pop(-1)
        
        cond_parameters.append('_'.join(model_config))
        cond_parameters.append('_'.join(model_config_full))
        cond_parameters.append(df.loc['Micro-AVG', 'f1'])
        cond_parameters.append(df.loc['Macro-AVG', 'f1'])
        cond_parameters.append(df.loc['Micro-AVG', 'accuracy'])
        runs.append(cond_parameters)
    except:
        pass

results_all = pd.DataFrame(runs, columns = col_names)


###
# Baselines
##

# InstructABSA
METHOD = 'instructAbsa'
RESULTS_PATH = '../../../ABSA-Baselines/InstructABSA-Custom/Output_filtered'

col_names = ['task', 'method', 'dataset', 'lr-setting', 'split', 'learning-rate', 'batch_size', 'epochs', 'f1-micro', 'f1-macro', 'accuracy']
runs = []

file_names = [file for file in os.listdir(RESULTS_PATH) if len(file.split('.tsv')) > 1 and file != '.ipynb_checkpoints']

for file_name in file_names:
    # try:
    metrics_dict = {}

    with open(os.path.join(RESULTS_PATH, file_name), 'r') as file:
        for line in file:
            # Strip any leading/trailing whitespace and split the line by '='
            key, value = line.strip().split('\t')
            # Convert the value to a float and store it in the dictionary
            metrics_dict[key.strip()] = float(value.strip())
            
    cond_name = file_name.split('.tsv')[0]
    cond_parameters = cond_name.split('_')
    
    cond_parameters.append(metrics_dict['F1-Score'])
    cond_parameters.extend([None,None])
    cond_parameters.insert(0, 'e2e')   # Task
    cond_parameters.insert(1, METHOD)  # Method
    cond_parameters.insert(6, 8)       # Batch Size

    if cond_parameters[3] == 'full':
        cond_parameters[3] = '0'
    
    runs.append(cond_parameters)


# TAS-BERT

METHOD = 'tas-bert'
RESULTS_PATH = '../../../ABSA-Baselines/TAS-BERT-Custom/results_filtered/GERestaurant/three_joint/BIO'

col_names = ['task', 'method', 'dataset', 'lr-setting', 'split', 'learning-rate', 'batch_size', 'epochs', 'f1-micro', 'f1-macro', 'accuracy']

folder_names = [file for file in os.listdir(RESULTS_PATH) if file != '.ipynb_checkpoints']

for folder_name in folder_names:
    # try:
    metrics_dict = {}

    with open(os.path.join(RESULTS_PATH, folder_name, 'results.txt'), 'r') as file:
        lines = file.readlines()

        epoch, p, r, f1 = lines[-1].strip().split('\t')
            
    cond_parameters = folder_name.split('_')
    
    cond_parameters.append(float(f1))
    cond_parameters.extend([None,None])
    cond_parameters.insert(0, 'e2e')   # Task
    cond_parameters.insert(1, METHOD)  # Method

    if cond_parameters[3] == 'full':
        cond_parameters[3] = '0'

    runs.append(cond_parameters)


results_baseline = pd.DataFrame(runs, columns = col_names)

args.results = results_all
args.results_baseline = results_baseline

### Full Dataset

In [32]:
args.lr_setting = 0
args.task = 'e2e'

computePromptStatistics(args)

Unnamed: 0,task,method,dataset,lr-setting,split,learning-rate,batch_size,epochs,f1-micro,f1-macro,accuracy
54,e2e,tas-bert,GERestaurant,0,3,2e-05,24,25.0,0.7708,,
40,e2e,tas-bert,GERestaurant,0,5,2e-05,24,25.0,0.7427,,
51,e2e,tas-bert,GERestaurant,0,1,2e-05,24,25.0,0.7346,,
48,e2e,tas-bert,GERestaurant,0,2,2e-05,24,25.0,0.7211,,
57,e2e,tas-bert,GERestaurant,0,4,2e-05,24,25.0,0.6926,,
38,e2e,instructAbsa,GERestaurant,0,3,5e-05,8,4.0,0.623529,,
35,e2e,instructAbsa,GERestaurant,0,2,5e-05,8,4.0,0.6141,,
34,e2e,instructAbsa,GERestaurant,0,5,5e-05,8,4.0,0.601457,,
15,e2e,instructAbsa,GERestaurant,0,4,5e-05,8,4.0,0.592284,,
12,e2e,instructAbsa,GERestaurant,0,1,5e-05,8,4.0,0.570265,,


Unnamed: 0,short,long,cot,instructAbsa,tas-bert
1,0.8237,0.8284,0.8173,0.570265,0.7346
2,0.7497,0.6943,0.7918,0.6141,0.7211
3,0.8431,0.8365,0.852,0.623529,0.7708
4,0.7857,0.8154,0.7652,0.592284,0.6926
5,0.8039,0.7256,0.8055,0.601457,0.7427


Unnamed: 0,W,pval,normal
cot,0.991887,0.985867,True


Unnamed: 0,W,pval,normal
instructAbsa,0.974128,0.901028,True


Unnamed: 0,W,pval,normal
tas-bert,0.991894,0.98589,True


    split        prompt        f1
0       1           cot  0.817300
1       2           cot  0.791800
2       3           cot  0.852000
3       4           cot  0.765200
4       5           cot  0.805500
5       1  instructAbsa  0.570265
6       2  instructAbsa  0.614100
7       3  instructAbsa  0.623529
8       4  instructAbsa  0.592284
9       5  instructAbsa  0.601457
10      1      tas-bert  0.734600
11      2      tas-bert  0.721100
12      3      tas-bert  0.770800
13      4      tas-bert  0.692600
14      5      tas-bert  0.742700
Repeated Measures ANOVA Result:
   Source  ddof1  ddof2           F         p-unc       ng2       eps
0  prompt      2      8  177.919903  2.337325e-07  0.922866  0.537248


  W = np.prod(eig) / (eig.sum() / d) ** d


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,t-test,cot vs instructAbsa,0.80636,0.028669,0.600327,0.018427,14.410956,0.000135,0.00027,True
1,t-test,cot vs tas-bert,0.80636,0.028669,0.73236,0.02568,20.297983,3.5e-05,0.000104,True
2,t-test,instructAbsa vs tas-bert,0.600327,0.018427,0.73236,0.02568,-10.79296,0.000418,0.000418,True


### 1000

In [33]:
args.lr_setting = 1000
args.task = 'e2e'

computePromptStatistics(args)

Unnamed: 0,task,method,dataset,lr-setting,split,learning-rate,batch_size,epochs,f1-micro,f1-macro,accuracy
46,e2e,tas-bert,GERestaurant,1000,3,2e-05,24,25.0,0.7644,,
44,e2e,tas-bert,GERestaurant,1000,2,2e-05,24,25.0,0.7173,,
47,e2e,tas-bert,GERestaurant,1000,5,2e-05,24,25.0,0.7087,,
42,e2e,tas-bert,GERestaurant,1000,1,2e-05,24,25.0,0.7064,,
52,e2e,tas-bert,GERestaurant,1000,4,2e-05,24,25.0,0.6827,,
25,e2e,instructAbsa,GERestaurant,1000,5,5e-05,8,9.0,0.67364,,
28,e2e,instructAbsa,GERestaurant,1000,2,5e-05,8,9.0,0.656834,,
10,e2e,instructAbsa,GERestaurant,1000,3,5e-05,8,9.0,0.630435,,
37,e2e,instructAbsa,GERestaurant,1000,1,5e-05,8,9.0,0.623053,,
23,e2e,instructAbsa,GERestaurant,1000,4,5e-05,8,9.0,0.617464,,


Unnamed: 0,short,long,cot,instructAbsa,tas-bert
1,0.804,0.8152,0.8085,0.623053,0.7064
2,0.8161,0.6942,0.8241,0.656834,0.7173
3,0.8259,0.8583,0.8108,0.630435,0.7644
4,0.7888,0.7506,0.7668,0.617464,0.6827
5,0.7688,0.7923,0.7867,0.67364,0.7087


Unnamed: 0,W,pval,normal
short,0.971359,0.883902,True


Unnamed: 0,W,pval,normal
instructAbsa,0.899257,0.405792,True


Unnamed: 0,W,pval,normal
tas-bert,0.900709,0.413814,True


    split        prompt        f1
0       1         short  0.804000
1       2         short  0.816100
2       3         short  0.825900
3       4         short  0.788800
4       5         short  0.768800
5       1  instructAbsa  0.623053
6       2  instructAbsa  0.656834
7       3  instructAbsa  0.630435
8       4  instructAbsa  0.617464
9       5  instructAbsa  0.673640
10      1      tas-bert  0.706400
11      2      tas-bert  0.717300
12      3      tas-bert  0.764400
13      4      tas-bert  0.682700
14      5      tas-bert  0.708700
Repeated Measures ANOVA Result:
   Source  ddof1  ddof2          F     p-unc       ng2       eps
0  prompt      2      8  57.508952  0.000018  0.890244  0.757303


  W = np.prod(eig) / (eig.sum() / d) ** d


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,t-test,short vs instructAbsa,0.80072,0.02021,0.640285,0.021453,9.240477,0.000762,0.002287,True
1,t-test,short vs tas-bert,0.80072,0.02021,0.7159,0.026827,8.553924,0.001025,0.002287,True
2,t-test,instructAbsa vs tas-bert,0.640285,0.021453,0.7159,0.026827,-4.58211,0.010167,0.010167,True


### 500

In [34]:
args.lr_setting = 500
args.task = 'e2e'

computePromptStatistics(args)

Unnamed: 0,task,method,dataset,lr-setting,split,learning-rate,batch_size,epochs,f1-micro,f1-macro,accuracy
53,e2e,tas-bert,GERestaurant,500,3,2e-05,24,19.0,0.7168,,
49,e2e,tas-bert,GERestaurant,500,1,2e-05,24,19.0,0.6741,,
55,e2e,tas-bert,GERestaurant,500,5,2e-05,24,19.0,0.6667,,
43,e2e,tas-bert,GERestaurant,500,2,2e-05,24,19.0,0.6628,,
45,e2e,tas-bert,GERestaurant,500,4,2e-05,24,19.0,0.6344,,
13,e2e,instructAbsa,GERestaurant,500,3,5e-05,8,17.0,0.618537,,
6,e2e,instructAbsa,GERestaurant,500,2,5e-05,8,17.0,0.616132,,
8,e2e,instructAbsa,GERestaurant,500,4,5e-05,8,17.0,0.609582,,
30,e2e,instructAbsa,GERestaurant,500,5,5e-05,8,17.0,0.593588,,
26,e2e,instructAbsa,GERestaurant,500,1,5e-05,8,17.0,0.567318,,


Unnamed: 0,short,long,cot,instructAbsa,tas-bert
1,0.7968,0.7556,0.7626,0.567318,0.6741
2,0.7248,0.7984,0.7737,0.616132,0.6628
3,0.8251,0.8159,0.7947,0.618537,0.7168
4,0.7984,0.7291,0.7694,0.609582,0.6344
5,0.7431,0.7102,0.7796,0.593588,0.6667


Unnamed: 0,W,pval,normal
short,0.909897,0.466973,True


Unnamed: 0,W,pval,normal
instructAbsa,0.866267,0.251635,True


Unnamed: 0,W,pval,normal
tas-bert,0.935736,0.635974,True


    split        prompt        f1
0       1         short  0.796800
1       2         short  0.724800
2       3         short  0.825100
3       4         short  0.798400
4       5         short  0.743100
5       1  instructAbsa  0.567318
6       2  instructAbsa  0.616132
7       3  instructAbsa  0.618537
8       4  instructAbsa  0.609582
9       5  instructAbsa  0.593588
10      1      tas-bert  0.674100
11      2      tas-bert  0.662800
12      3      tas-bert  0.716800
13      4      tas-bert  0.634400
14      5      tas-bert  0.666700
Repeated Measures ANOVA Result:
   Source  ddof1  ddof2          F     p-unc       ng2       eps
0  prompt      2      8  46.536437  0.000039  0.864789  0.873377


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,t-test,short vs instructAbsa,0.77764,0.037512,0.601031,0.018972,8.237915,0.001184,0.003552,True
1,t-test,short vs tas-bert,0.77764,0.037512,0.67096,0.02658,5.93674,0.004036,0.008072,True
2,t-test,instructAbsa vs tas-bert,0.601031,0.018972,0.67096,0.02658,-4.538892,0.010506,0.010506,True


In [35]:
args.task = 'e2e'

computeLowResourceStatistics(args)

Unnamed: 0,1000,500,full
1,0.804,0.7968,0.8237
2,0.8161,0.7248,0.7497
3,0.8259,0.8251,0.8431
4,0.7888,0.7984,0.7857
5,0.7688,0.7431,0.8039


Unnamed: 0,W,pval,normal
1000,0.971359,0.883902,True


Unnamed: 0,W,pval,normal
500,0.909897,0.466973,True


Unnamed: 0,W,pval,normal
full,0.983488,0.952373,True


    split prompt      f1
0       1   1000  0.8040
1       2   1000  0.8161
2       3   1000  0.8259
3       4   1000  0.7888
4       5   1000  0.7688
5       1    500  0.7968
6       2    500  0.7248
7       3    500  0.8251
8       4    500  0.7984
9       5    500  0.7431
10      1   full  0.8237
11      2   full  0.7497
12      3   full  0.8431
13      4   full  0.7857
14      5   full  0.8039


  W = np.prod(eig) / (eig.sum() / d) ** d


Repeated Measures ANOVA Result:
   Source  ddof1  ddof2         F     p-unc       ng2       eps
0  prompt      2      8  1.400032  0.301061  0.113048  0.818557
Results for LR-Comparison of :  short


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,t-test,1000 vs 500,0.80072,0.02021,0.77764,0.037512,1.282481,0.268951,0.537901,False
1,t-test,1000 vs full,0.80072,0.02021,0.80122,0.03213,-0.028098,0.97893,0.97893,False
2,t-test,500 vs full,0.77764,0.037512,0.80122,0.03213,-2.012568,0.114463,0.34339,False


Unnamed: 0,1000,500,full
1,0.8152,0.7556,0.8284
2,0.6942,0.7984,0.6943
3,0.8583,0.8159,0.8365
4,0.7506,0.7291,0.8154
5,0.7923,0.7102,0.7256


Unnamed: 0,W,pval,normal
1000,0.988245,0.973192,True


Unnamed: 0,W,pval,normal
500,0.941347,0.675478,True


Unnamed: 0,W,pval,normal
full,0.833036,0.146574,True


    split prompt      f1
0       1   1000  0.8152
1       2   1000  0.6942
2       3   1000  0.8583
3       4   1000  0.7506
4       5   1000  0.7923
5       1    500  0.7556
6       2    500  0.7984
7       3    500  0.8159
8       4    500  0.7291
9       5    500  0.7102
10      1   full  0.8284
11      2   full  0.6943
12      3   full  0.8365
13      4   full  0.8154
14      5   full  0.7256
Repeated Measures ANOVA Result:
   Source  ddof1  ddof2         F    p-unc       ng2       eps
0  prompt      2      8  0.280667  0.76242  0.029568  0.813498
Results for LR-Comparison of :  long


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,t-test,1000 vs 500,0.78212,0.05608,0.76184,0.040092,0.620648,0.568446,1.0,False
1,t-test,1000 vs full,0.78212,0.05608,0.78004,0.058467,0.096583,0.927703,1.0,False
2,t-test,500 vs full,0.76184,0.040092,0.78004,0.058467,-0.541541,0.616888,1.0,False


Unnamed: 0,1000,500,full
1,0.8085,0.7626,0.8173
2,0.8241,0.7737,0.7918
3,0.8108,0.7947,0.852
4,0.7668,0.7694,0.7652
5,0.7867,0.7796,0.8055


Unnamed: 0,W,pval,normal
1000,0.941549,0.676906,True


Unnamed: 0,W,pval,normal
500,0.958088,0.794608,True


Unnamed: 0,W,pval,normal
full,0.991887,0.985867,True


    split prompt      f1
0       1   1000  0.8085
1       2   1000  0.8241
2       3   1000  0.8108
3       4   1000  0.7668
4       5   1000  0.7867
5       1    500  0.7626
6       2    500  0.7737
7       3    500  0.7947
8       4    500  0.7694
9       5    500  0.7796
10      1   full  0.8173
11      2   full  0.7918
12      3   full  0.8520
13      4   full  0.7652
14      5   full  0.8055
Repeated Measures ANOVA Result:
   Source  ddof1  ddof2         F    p-unc       ng2       eps
0  prompt      2      8  3.868317  0.06679  0.272581  0.975992
Results for LR-Comparison of :  cot


  W = np.prod(eig) / (eig.sum() / d) ** d


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,t-test,1000 vs 500,0.79938,0.020233,0.776,0.010873,2.214037,0.091215,0.182431,False
1,t-test,1000 vs full,0.79938,0.020233,0.80636,0.028669,-0.576626,0.595087,0.595087,False
2,t-test,500 vs full,0.776,0.010873,0.80636,0.028669,-2.621339,0.058717,0.17615,False


Unnamed: 0,1000,500,full
1,0.804,0.7968,0.8173
2,0.8161,0.7248,0.7918
3,0.8259,0.8251,0.852
4,0.7888,0.7984,0.7652
5,0.7688,0.7431,0.8055


Unnamed: 0,W,pval,normal
1000,0.971359,0.883902,True


Unnamed: 0,W,pval,normal
500,0.909897,0.466973,True


Unnamed: 0,W,pval,normal
full,0.991887,0.985867,True


    split prompt      f1
0       1   1000  0.8040
1       2   1000  0.8161
2       3   1000  0.8259
3       4   1000  0.7888
4       5   1000  0.7688
5       1    500  0.7968
6       2    500  0.7248
7       3    500  0.8251
8       4    500  0.7984
9       5    500  0.7431
10      1   full  0.8173
11      2   full  0.7918
12      3   full  0.8520
13      4   full  0.7652
14      5   full  0.8055
Repeated Measures ANOVA Result:
   Source  ddof1  ddof2         F     p-unc       ng2       eps
0  prompt      2      8  1.717831  0.239505  0.149361  0.857445
Results for LR-Comparison of best Prompt per LR-Setting


  W = np.prod(eig) / (eig.sum() / d) ** d


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,t-test,1000 vs 500,0.80072,0.02021,0.77764,0.037512,1.282481,0.268951,0.559451,False
1,t-test,1000 vs full,0.80072,0.02021,0.80636,0.028669,-0.446342,0.678447,0.678447,False
2,t-test,500 vs full,0.77764,0.037512,0.80636,0.028669,-1.592527,0.186484,0.559451,False


## ACSD

In [36]:
runs = []
RESULTS_PATH = '../results_final/'
col_names = ['model_lang', 'dataset', 'model_shots', 'model_prompt', 'model_task', 'lr', 'lora_r', 'lora_alpha', 'lora_dropout', 'model_quant', 'split', 'lr_setting', 'model_name', 'lang', 'shots', 'prompt', 'task', 'quant', 'epoch', 'model_config', 'path', 'f1-micro', 'f1-macro', 'accuracy']
folder_names = [folder for folder in os.listdir(RESULTS_PATH) if os.path.isdir(os.path.join(RESULTS_PATH, folder)) and folder != '.ipynb_checkpoints']

for folder_name in folder_names:
    try:
        df = pd.read_csv(RESULTS_PATH + folder_name + '/metrics_phrases.tsv', sep = '\t')
        df = df.set_index(df.columns[0])
        cond_name = folder_name.split('/')[-1]
        cond_parameters = cond_name.split('_')

        model_config = cond_parameters.copy()
        
        # Remove split column from config string
        model_config_full = model_config.copy()
        model_config.pop(10)

        # Remove epoch column from config string
        model_config.pop(-1)
        
        cond_parameters.append('_'.join(model_config))
        cond_parameters.append('_'.join(model_config_full))
        cond_parameters.append(df.loc['Micro-AVG', 'f1'])
        cond_parameters.append(df.loc['Macro-AVG', 'f1'])
        cond_parameters.append(df.loc['Micro-AVG', 'accuracy'])
        runs.append(cond_parameters)
    except:
        pass

results_all = pd.DataFrame(runs, columns = col_names)


###
# Baselines
##

# Paraphrase Generation
METHOD = 'para'
RESULTS_PATH = ''

folder_names = [folder for folder in os.listdir(os.path.join(RESULTS_PATH, METHOD)) if os.path.isdir(os.path.join(RESULTS_PATH, METHOD, folder)) and folder != '.ipynb_checkpoints']
runs = []

for folder_name in folder_names:
    try:
        df = pd.read_csv(os.path.join(RESULTS_PATH, METHOD, folder_name, 'metrics_phrases.tsv'), sep = '\t')
        df = df.set_index(df.columns[0])
        cond_name = folder_name.split('/')[-1]
        cond_parameters = cond_name.split('_')
        
        # Fix for the wrong output name format
        # cond_parameters[0], cond_parameters[1] = cond_parameters[1], cond_parameters[0]
        cond_parameters.append(df.loc['Micro-AVG', 'f1'])
        cond_parameters.append(df.loc['Macro-AVG', 'f1'])
        cond_parameters.append(df.loc['Micro-AVG', 'accuracy'])
        cond_parameters[1:1] = [METHOD]
        runs.append(cond_parameters)
    except:
        pass

# E2TP 
METHOD = 'e2tp'
RESULTS_PATH = '../../../ABSA-Baselines/E2TP-custom/src/results_ref'

col_names = ['task', 'method', 'dataset', 'lr-setting', 'split', 'learning-rate', 'batch_size', 'epochs', 'f1-micro', 'f1-macro', 'accuracy']

folder_names = [folder for folder in os.listdir(RESULTS_PATH) if os.path.isdir(os.path.join(RESULTS_PATH, folder)) and folder != '.ipynb_checkpoints']

for folder_name in folder_names:
    # try:
    metrics_dict = {}

    with open(os.path.join(RESULTS_PATH, folder_name, 'results.tsv'), 'r') as file:
        for line in file:
            # Strip any leading/trailing whitespace and split the line by '='
            key, value = line.strip().split('\t')
            # Convert the value to a float and store it in the dictionary
            metrics_dict[key.strip()] = float(value.strip())
            
    cond_name = folder_name.split('/')[-1]
    cond_parameters = cond_name.split('_')

    cond_parameters.append(metrics_dict['f1']/100)
    cond_parameters.extend([None,None])
    cond_parameters[1:1] = [METHOD]
    cond_parameters[3] = 0 if cond_parameters[3] == 'full' else cond_parameters[3]
    
    runs.append(cond_parameters)
    # except:
    #     pass

results_baseline = pd.DataFrame(runs, columns = col_names)
results_baseline['lr-setting'] = results_baseline['lr-setting'].astype(str)

args.results = results_all
args.results_baseline = results_baseline

stats_dfs = {}

### Full Dataset

In [37]:
args.lr_setting = 0
args.task = 'acsd'

stats_acsd['0'] = computePromptStatistics(args)
stats_acsd['0']

Unnamed: 0,task,method,dataset,lr-setting,split,learning-rate,batch_size,epochs,f1-micro,f1-macro,accuracy
9,acsd,para,GERestaurant,0,4,0.0003,16,20,0.7342,0.7304,0.58
49,acsd,e2tp,GERestaurant,0,4,0.0001,8,20,0.714434,,
10,acsd,para,GERestaurant,0,5,0.0003,16,20,0.7043,0.6538,0.5436
7,acsd,para,GERestaurant,0,3,0.0003,16,20,0.7028,0.6728,0.5417
57,acsd,e2tp,GERestaurant,0,3,0.0001,8,20,0.702,,
46,acsd,e2tp,GERestaurant,0,5,0.0001,8,20,0.698947,,
37,acsd,para,GERestaurant,0,2,0.0003,16,20,0.6914,0.6669,0.5284
35,acsd,para,GERestaurant,0,1,0.0003,16,20,0.6867,0.6539,0.5229
66,acsd,e2tp,GERestaurant,0,1,0.0001,8,20,0.686378,,
58,acsd,e2tp,GERestaurant,0,2,0.0001,8,20,0.673807,,


Unnamed: 0,short,long,cot,para,e2tp
1,0.7123,0.7433,0.7502,0.6867,0.686378
2,0.7362,0.7346,0.7242,0.6914,0.673807
3,0.7672,0.7663,0.7386,0.7028,0.702
4,0.7578,0.7625,0.7365,0.7342,0.714434
5,0.7832,0.7751,0.6755,0.7043,0.698947


Unnamed: 0,W,pval,normal
long,0.931518,0.606756,True


Unnamed: 0,W,pval,normal
para,0.877092,0.29634,True


Unnamed: 0,W,pval,normal
e2tp,0.980622,0.93792,True


    split prompt        f1
0       1   long  0.743300
1       2   long  0.734600
2       3   long  0.766300
3       4   long  0.762500
4       5   long  0.775100
5       1   para  0.686700
6       2   para  0.691400
7       3   para  0.702800
8       4   para  0.734200
9       5   para  0.704300
10      1   e2tp  0.686378
11      2   e2tp  0.673807
12      3   e2tp  0.702000
13      4   e2tp  0.714434
14      5   e2tp  0.698947
Repeated Measures ANOVA Result:
   Source  ddof1  ddof2          F     p-unc       ng2       eps
0  prompt      2      8  68.768304  0.000009  0.759754  0.611354


  W = np.prod(eig) / (eig.sum() / d) ** d


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,t-test,long vs para,0.75636,0.015045,0.70388,0.016562,6.936473,0.002268,0.004537,True
1,t-test,long vs e2tp,0.75636,0.015045,0.695113,0.013897,13.299554,0.000185,0.000554,True
2,t-test,para vs e2tp,0.70388,0.016562,0.695113,0.013897,2.109784,0.102515,0.102515,False


### 1000

In [38]:
args.lr_setting = 1000
args.task = 'acsd'

stats_acsd['1000'] = computePromptStatistics(args)
stats_acsd['1000']

Unnamed: 0,task,method,dataset,lr-setting,split,learning-rate,batch_size,epochs,f1-micro,f1-macro,accuracy
45,acsd,e2tp,GERestaurant,1000,5,0.0001,8,20,0.677215,,
33,acsd,para,GERestaurant,1000,3,0.0003,16,20,0.6759,0.6349,0.5104
21,acsd,para,GERestaurant,1000,5,0.0003,16,20,0.6756,0.6092,0.5101
78,acsd,e2tp,GERestaurant,1000,3,0.0001,8,20,0.674044,,
32,acsd,para,GERestaurant,1000,2,0.0003,16,20,0.67,0.657,0.5037
1,acsd,para,GERestaurant,1000,4,0.0003,16,20,0.6646,0.6748,0.4977
67,acsd,e2tp,GERestaurant,1000,4,0.0001,8,20,0.663918,,
62,acsd,e2tp,GERestaurant,1000,1,0.0001,8,20,0.661088,,
28,acsd,para,GERestaurant,1000,1,0.0003,16,20,0.6431,0.6199,0.4739
68,acsd,e2tp,GERestaurant,1000,2,0.0001,8,20,0.633466,,


Unnamed: 0,short,long,cot,para,e2tp
1,0.7067,0.7324,0.6451,0.6431,0.661088
2,0.7114,0.72,0.6805,0.67,0.633466
3,0.7405,0.7426,0.6989,0.6759,0.674044
4,0.7855,0.7792,0.7119,0.6646,0.663918
5,0.7572,0.716,0.6999,0.6756,0.677215


Unnamed: 0,W,pval,normal
short,0.937161,0.645943,True


Unnamed: 0,W,pval,normal
para,0.816553,0.109799,True


Unnamed: 0,W,pval,normal
e2tp,0.867715,0.257294,True


    split prompt        f1
0       1  short  0.706700
1       2  short  0.711400
2       3  short  0.740500
3       4  short  0.785500
4       5  short  0.757200
5       1   para  0.643100
6       2   para  0.670000
7       3   para  0.675900
8       4   para  0.664600
9       5   para  0.675600
10      1   e2tp  0.661088
11      2   e2tp  0.633466
12      3   e2tp  0.674044
13      4   e2tp  0.663918
14      5   e2tp  0.677215
Repeated Measures ANOVA Result:
   Source  ddof1  ddof2         F     p-unc       ng2      eps
0  prompt      2      8  28.53896  0.000228  0.757939  0.84533


  W = np.prod(eig) / (eig.sum() / d) ** d


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,t-test,short vs para,0.74026,0.029301,0.66584,0.012103,5.612994,0.00495,0.009901,True
1,t-test,short vs e2tp,0.74026,0.029301,0.661946,0.015461,6.30546,0.003234,0.009702,True
2,t-test,para vs e2tp,0.66584,0.012103,0.661946,0.015461,0.436845,0.684771,0.684771,False


### 500

In [39]:
args.lr_setting = 500
args.task = 'acsd'

stats_acsd['500'] = computePromptStatistics(args)
stats_acsd['500']

Unnamed: 0,task,method,dataset,lr-setting,split,learning-rate,batch_size,epochs,f1-micro,f1-macro,accuracy
11,acsd,para,GERestaurant,500,4,0.0003,16,86,0.6486,0.6585,0.48
40,acsd,para,GERestaurant,500,1,0.0003,16,86,0.6379,0.5965,0.4683
60,acsd,e2tp,GERestaurant,500,4,0.0001,8,20,0.626582,,
29,acsd,para,GERestaurant,500,3,0.0003,16,86,0.6233,0.566,0.4527
56,acsd,e2tp,GERestaurant,500,5,0.0001,8,20,0.621505,,
36,acsd,para,GERestaurant,500,5,0.0003,16,86,0.6208,0.5703,0.4502
50,acsd,e2tp,GERestaurant,500,1,0.0001,8,20,0.611345,,
48,acsd,e2tp,GERestaurant,500,2,0.0001,8,20,0.610718,,
26,acsd,para,GERestaurant,500,2,0.0003,16,86,0.6098,0.5943,0.4387
70,acsd,e2tp,GERestaurant,500,3,0.0001,8,20,0.603622,,


Unnamed: 0,short,long,cot,para,e2tp
1,0.733,0.7354,0.6502,0.6379,0.611345
2,0.7087,0.7284,0.703,0.6098,0.610718
3,0.6768,0.7221,0.6869,0.6233,0.603622
4,0.722,0.7495,0.6749,0.6486,0.626582
5,0.6932,0.71,0.7015,0.6208,0.621505


Unnamed: 0,W,pval,normal
long,0.997633,0.998378,True


Unnamed: 0,W,pval,normal
para,0.964868,0.841405,True


Unnamed: 0,W,pval,normal
e2tp,0.946037,0.708869,True


    split prompt        f1
0       1   long  0.735400
1       2   long  0.728400
2       3   long  0.722100
3       4   long  0.749500
4       5   long  0.710000
5       1   para  0.637900
6       2   para  0.609800
7       3   para  0.623300
8       4   para  0.648600
9       5   para  0.620800
10      1   e2tp  0.611345
11      2   e2tp  0.610718
12      3   e2tp  0.603622
13      4   e2tp  0.626582
14      5   e2tp  0.621505
Repeated Measures ANOVA Result:
   Source  ddof1  ddof2          F         p-unc       ng2       eps
0  prompt      2      8  232.04935  8.245723e-08  0.948201  0.895203


  W = np.prod(eig) / (eig.sum() / d) ** d


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,t-test,long vs para,0.72908,0.013192,0.62808,0.01362,20.921693,3.1e-05,9.3e-05,True
1,t-test,long vs e2tp,0.72908,0.013192,0.614754,0.008214,17.391627,6.4e-05,0.000128,True
2,t-test,para vs e2tp,0.62808,0.01362,0.614754,0.008214,2.267583,0.085957,0.085957,False


In [40]:
args.task = 'acsd'

computeLowResourceStatistics(args)

Unnamed: 0,1000,500,full
1,0.7067,0.733,0.7123
2,0.7114,0.7087,0.7362
3,0.7405,0.6768,0.7672
4,0.7855,0.722,0.7578
5,0.7572,0.6932,0.7832


Unnamed: 0,W,pval,normal
1000,0.937161,0.645943,True


Unnamed: 0,W,pval,normal
500,0.978466,0.926253,True


Unnamed: 0,W,pval,normal
full,0.973885,0.89955,True


    split prompt      f1
0       1   1000  0.7067
1       2   1000  0.7114
2       3   1000  0.7405
3       4   1000  0.7855
4       5   1000  0.7572
5       1    500  0.7330
6       2    500  0.7087
7       3    500  0.6768
8       4    500  0.7220
9       5    500  0.6932
10      1   full  0.7123
11      2   full  0.7362
12      3   full  0.7672
13      4   full  0.7578
14      5   full  0.7832
Repeated Measures ANOVA Result:
   Source  ddof1  ddof2         F     p-unc       ng2       eps
0  prompt      2      8  3.546072  0.078951  0.365519  0.698911
Results for LR-Comparison of :  short


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,t-test,1000 vs 500,0.74026,0.029301,0.70674,0.02003,1.7585,0.153485,0.306971,False
1,t-test,1000 vs full,0.74026,0.029301,0.75134,0.024745,-1.059149,0.349252,0.349252,False
2,t-test,500 vs full,0.70674,0.02003,0.75134,0.024745,-2.127099,0.100535,0.301606,False


Unnamed: 0,1000,500,full
1,0.7324,0.7354,0.7433
2,0.72,0.7284,0.7346
3,0.7426,0.7221,0.7663
4,0.7792,0.7495,0.7625
5,0.716,0.71,0.7751


Unnamed: 0,W,pval,normal
1000,0.880566,0.311884,True


Unnamed: 0,W,pval,normal
500,0.997633,0.998378,True


Unnamed: 0,W,pval,normal
full,0.931518,0.606756,True


    split prompt      f1
0       1   1000  0.7324
1       2   1000  0.7200
2       3   1000  0.7426
3       4   1000  0.7792
4       5   1000  0.7160
5       1    500  0.7354
6       2    500  0.7284
7       3    500  0.7221
8       4    500  0.7495
9       5    500  0.7100
10      1   full  0.7433
11      2   full  0.7346
12      3   full  0.7663
13      4   full  0.7625
14      5   full  0.7751


  W = np.prod(eig) / (eig.sum() / d) ** d


Repeated Measures ANOVA Result:
   Source  ddof1  ddof2         F     p-unc       ng2       eps
0  prompt      2      8  3.440272  0.083538  0.297767  0.766786
Results for LR-Comparison of :  long


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,t-test,1000 vs 500,0.73804,0.022618,0.72908,0.013192,1.256814,0.277207,0.416486,False
1,t-test,1000 vs full,0.73804,0.022618,0.75636,0.015045,-1.499011,0.208243,0.416486,False
2,t-test,500 vs full,0.72908,0.013192,0.75636,0.015045,-2.330472,0.080211,0.240632,False


Unnamed: 0,1000,500,full
1,0.6451,0.6502,0.7502
2,0.6805,0.703,0.7242
3,0.6989,0.6869,0.7386
4,0.7119,0.6749,0.7365
5,0.6999,0.7015,0.6755


Unnamed: 0,W,pval,normal
1000,0.883812,0.326942,True


Unnamed: 0,W,pval,normal
500,0.907757,0.454228,True


Unnamed: 0,W,pval,normal
full,0.829285,0.137413,True


    split prompt      f1
0       1   1000  0.6451
1       2   1000  0.6805
2       3   1000  0.6989
3       4   1000  0.7119
4       5   1000  0.6999
5       1    500  0.6502
6       2    500  0.7030
7       3    500  0.6869
8       4    500  0.6749
9       5    500  0.7015
10      1   full  0.7502
11      2   full  0.7242
12      3   full  0.7386
13      4   full  0.7365
14      5   full  0.6755
Repeated Measures ANOVA Result:
   Source  ddof1  ddof2         F     p-unc     ng2       eps
0  prompt      2      8  3.263967  0.091948  0.3976  0.672939
Results for LR-Comparison of :  cot


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,t-test,1000 vs 500,0.68726,0.023348,0.6833,0.019486,0.399063,0.710234,0.710234,False
1,t-test,1000 vs full,0.68726,0.023348,0.725,0.026089,-1.818846,0.143069,0.357654,False
2,t-test,500 vs full,0.6833,0.019486,0.725,0.026089,-1.976943,0.119218,0.357654,False


Unnamed: 0,1000,500,full
1,0.7067,0.7354,0.7433
2,0.7114,0.7284,0.7346
3,0.7405,0.7221,0.7663
4,0.7855,0.7495,0.7625
5,0.7572,0.71,0.7751


Unnamed: 0,W,pval,normal
1000,0.937161,0.645943,True


Unnamed: 0,W,pval,normal
500,0.997633,0.998378,True


Unnamed: 0,W,pval,normal
full,0.931518,0.606756,True


    split prompt      f1
0       1   1000  0.7067
1       2   1000  0.7114
2       3   1000  0.7405
3       4   1000  0.7855
4       5   1000  0.7572
5       1    500  0.7354
6       2    500  0.7284
7       3    500  0.7221
8       4    500  0.7495
9       5    500  0.7100
10      1   full  0.7433
11      2   full  0.7346
12      3   full  0.7663
13      4   full  0.7625
14      5   full  0.7751
Repeated Measures ANOVA Result:
   Source  ddof1  ddof2       F     p-unc       ng2       eps
0  prompt      2      8  2.4564  0.147326  0.230039  0.839269
Results for LR-Comparison of best Prompt per LR-Setting


  W = np.prod(eig) / (eig.sum() / d) ** d


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,t-test,1000 vs 500,0.74026,0.029301,0.72908,0.013192,0.758064,0.490621,0.490621,False
1,t-test,1000 vs full,0.74026,0.029301,0.75636,0.015045,-1.572398,0.190962,0.381923,False
2,t-test,500 vs full,0.72908,0.013192,0.75636,0.015045,-2.330472,0.080211,0.240632,False


## Create Latex

In [42]:
import pandas as pd

def extract_means_and_stds(stats):
    """ 
    Extract the mean and std values for two methods from the statistical dataframe.
    """
    # Initialize a dictionary to store extracted values
    results = {
        "acd": {},
        "acsa": {},
        "e2e": {},
        "acsd": {}
    }
    for task, dfs in stats.items():
        for lr_setting, df in dfs.items():
            if df is not None:
                results[task][lr_setting] = {}
                # Find the row in the dataframe that corresponds to the comparison between method1 and method2
                for _, row in df.iterrows():
                    comparison = row['comparison']
                    mean1, mean2 = row['mean 1'], row['mean 2']
                    std1, std2 = row['std 1'], row['std 2']
            
                    # Map the means and stds to the correct methods
                    method1, method2 = comparison.split(' vs ')
                    
                    if method1 not in results[task][lr_setting].keys():
                        results[task][lr_setting][method1] = {'mean': None, 'std': None}
                        results[task][lr_setting][method1]['mean'], results[task][lr_setting][method1]['std'] = mean1*100, std1*100
                        
                    if method2 not in results[task][lr_setting].keys():
                        results[task][lr_setting][method2] = {'mean': None, 'std': None} 
                        results[task][lr_setting][method2]['mean'], results[task][lr_setting][method2]['std'] = mean2*100, std2*100
    
    return results

def create_full_latex_row(task_results, resource_setting):
    """
    Creates a full LaTeX row for a specific resource setting across all tasks.
    
    Parameters:
    - task_results: A dictionary containing results_dicts for all tasks (e.g., {'ACD': results_dict1, 'ACSA': results_dict2, 'ACSD': results_dict3}).
    - resource_setting: The resource setting (e.g., 'Full', '1000', '500').
    
    Returns:
    - A LaTeX formatted string representing a full row of the table.
    """

    rs_text = resource_setting if resource_setting != '0' else 'Full'
    latex_row = r"\multicolumn{1}{r|}{" + rs_text + "} & "

    for task, results_dict in task_results.items():
        if task in ['acd', 'acsa']:
            task_methods = ['mlcf', 'hier-gcn']
        elif task == 'e2e':
            task_methods = ['instructAbsa', 'tas-bert']
        elif task == 'acsd':
            task_methods = ['e2tp', 'para']
            
        if resource_setting in results_dict.keys():
            methods = results_dict[resource_setting]
            # Find the method with the highest mean value among short, long, cot
            highest_prompt = [prompt_style for prompt_style in methods if prompt_style in ['short', 'long', 'cot']][0]
            highest_method = max(task_methods + [highest_prompt], key=lambda x: methods[x]['mean'])
            # Initialize LaTeX row string
            if highest_prompt == highest_method:
                latex_row += (
                    r"\scalebox{0.95}{\textbf{" + f"{methods[highest_prompt]['mean']:.2f}" + "}} & "
                )
            else:
                latex_row += (
                    f"{methods[highest_prompt]['mean']:.2f}" + " & "
                )
            
            # Add the remaining methods
            for i, method in enumerate(task_methods):
                if method in methods.keys():
                    if method == highest_method:
                        latex_row += (
                            r"\multicolumn{1}{c" + f"{'|' if(i == 1 and task != 'acsd') else ''}" +
                            r"}{\scalebox{0.95}{\textbf{" + f"{methods[method]['mean']:.2f}" +
                            r"}}} & "
                        )
                    else:
                        latex_row += (
                            r"\multicolumn{1}{c" + f"{'|' if(i == 1 and task != 'acsd') else ''}" +
                            r"}{" + f"{methods[method]['mean']:.2f}" +
                            r"} & "
                        )
                else:
                    latex_row += r"\multicolumn{1}{l|}{N/A} & "
            
    # Remove the trailing '&' and replace with '\\'
    latex_row = latex_row.rstrip(" & ") + r" \\"
    
    return latex_row
    
results_dict = extract_means_and_stds({'acd':stats_acd, 'acsa':stats_acsa, 'e2e': stats_e2e, 'acsd':stats_acsd})

latex = []
latex.append(create_full_latex_row(results_dict, '0'))
latex.append(create_full_latex_row(results_dict, '1000'))
latex.append(create_full_latex_row(results_dict, '500'))

for l in latex:
    print(l)
    print("&")


\multicolumn{1}{r|}{Full} & 87.88 & \multicolumn{1}{c}{\scalebox{0.95}{\textbf{92.29}}} & \multicolumn{1}{c|}{89.71} & \scalebox{0.95}{\textbf{83.64}} & \multicolumn{1}{c}{83.17} & \multicolumn{1}{c|}{82.49} & \scalebox{0.95}{\textbf{80.60}} & \multicolumn{1}{c}{71.50} & \multicolumn{1}{c|}{71.05} & \scalebox{0.95}{\textbf{75.64}} & \multicolumn{1}{c}{69.51} & \multicolumn{1}{c}{70.39} \\
&
\multicolumn{1}{r|}{1000} & 86.65 & \multicolumn{1}{c}{\scalebox{0.95}{\textbf{91.22}}} & \multicolumn{1}{c|}{87.41} & 80.57 & \multicolumn{1}{c}{\scalebox{0.95}{\textbf{84.36}}} & \multicolumn{1}{c|}{79.25} & \scalebox{0.95}{\textbf{78.43}} & \multicolumn{1}{c}{69.96} & \multicolumn{1}{c|}{66.66} & \scalebox{0.95}{\textbf{74.03}} & \multicolumn{1}{c}{66.19} & \multicolumn{1}{c}{66.58} \\
&
\multicolumn{1}{r|}{500} & 86.12 & \multicolumn{1}{c}{\scalebox{0.95}{\textbf{90.13}}} & \multicolumn{1}{c|}{85.88} & \scalebox{0.95}{\textbf{81.83}} & \multicolumn{1}{c}{81.69} & \multicolumn{1}{c|}{76.62} & \sc

In [38]:
results_dict

{'acd': {'0': {'short': {'mean': 87.876, 'std': 0.9352133446438845},
   'long': {'mean': 87.82799999999999, 'std': 0.8478537609753235},
   'hier-gcn': {'mean': 89.71441699450195, 'std': 0.7938397466859362},
   'mlcf': {'mean': 92.294, 'std': 0.6201806188522835}},
  '1000': {'short': {'mean': 86.64999999999999, 'std': 1.8934941246277988},
   'long': {'mean': 86.44800000000001, 'std': 1.9051761073454596},
   'hier-gcn': {'mean': 87.41157913823318, 'std': 0.98652357127297},
   'mlcf': {'mean': 91.22200000000001, 'std': 0.882142845575477}},
  '500': {'short': {'mean': 86.11999999999999, 'std': 2.0057517294022196},
   'long': {'mean': 83.23599999999999, 'std': 0.7362227923665495},
   'hier-gcn': {'mean': 85.8844883418226, 'std': 1.5575183001170727},
   'mlcf': {'mean': 90.126, 'std': 0.615844136125365}}},
 'acsa': {'0': {'short': {'mean': 83.64, 'std': 1.5432303781354233},
   'long': {'mean': 83.21600000000001, 'std': 2.9146842024480124},
   'cot': {'mean': 82.53999999999999, 'std': 1.43481

## Performance Comparison of Extraction of ABSA-Tuple Elements over different ABSA Subtasks

In [39]:
# Additional Eval

runs = []
RESULTS_PATH = '../results_final/'
col_names = ['model_lang', 'dataset', 'model_shots', 'model_prompt', 'model_task', 'lr', 'lora_r', 'lora_alpha', 'lora_dropout', 'model_quant', 'split', 'lr_setting', 'model_name', 'lang', 'shots', 'prompt', 'task', 'quant', 'epoch', 'model_config', 'path', 'f1-micro', 'f1-macro', 'accuracy']
folder_names = [folder for folder in os.listdir(RESULTS_PATH) if os.path.isdir(os.path.join(RESULTS_PATH, folder)) and folder != '.ipynb_checkpoints']

for folder_name in folder_names:
    try:
        df = pd.read_csv(RESULTS_PATH + folder_name + '/metrics_asp.tsv', sep = '\t')
        df = df.set_index(df.columns[0])
        cond_name = folder_name.split('/')[-1]
        cond_parameters = cond_name.split('_')

        model_config = cond_parameters.copy()
        
        # Remove split column from config string
        model_config_full = model_config.copy()
        model_config.pop(10)

        # Remove epoch column from config string
        model_config.pop(-1)
        
        cond_parameters.append('_'.join(model_config))
        cond_parameters.append('_'.join(model_config_full))
        cond_parameters.append(df.loc['Micro-AVG', 'f1'])
        cond_parameters.append(df.loc['Macro-AVG', 'f1'])
        cond_parameters.append(df.loc['Micro-AVG', 'accuracy'])
        runs.append(cond_parameters)
    except:
        pass

results_all = pd.DataFrame(runs, columns = col_names)

# Define the columns as multi-index
columns = pd.MultiIndex.from_tuples([
    ('full', 'short'), ('full', 'long'), ('full', 'context'),
    ('1000', 'short'), ('1000', 'long'), ('1000', 'context'),
    ('500', 'short'), ('500', 'long'), ('500', 'context')
])

# Define the row indices
index = ['acd', 'acsa', 'acsd']

# Create an empty DataFrame with the defined structure
df = pd.DataFrame(np.nan, index=index, columns=columns)

for col in df.columns:
    df[col] = 'N/A'

for task in ['acd', 'acsa', 'acsd']:
    for lr_setting in ['full', '1000', '500']:
        for a, b in results_all[np.logical_and.reduce([results_all['dataset'] == 'GERestaurant', 
                                                         results_all['task'] == task, 
                                                         results_all['model_name'] == args.model,
                                                         results_all['split'] != str(0),
                                                         results_all['lr_setting'] == lr_setting,
                                                         results_all['model_lang'] == 'en'])].groupby(['model_config']):

            prompt_name = a[0].split('_')[3] if a[0].split('_')[3] != 'cot' else 'context'
            df.at[task, (lr_setting, prompt_name)] = f"{b['f1-micro'].mean()*100:.2f}"
            
print('Aspect Extraction')
display(df)

print(f"Average difference ACSA to ACD: {(np.mean([float(i) for i in list(df.loc['acsa'])]) - np.mean([float(i) for i in list(df.loc['acd']) if i != 'N/A'])):.2f}")

print(f"Average difference ACSD to ACSA: {(np.mean([float(i) for i in list(df.loc['acsd'])]) - np.mean([float(i) for i in list(df.loc['acsa'])])):.2f}")

print(f"Average difference ACSD to ACD: {(np.mean([float(i) for i in list(df.loc['acsd'])]) - np.mean([float(i) for i in list(df.loc['acd'])  if i != 'N/A'])):.2f}")

Aspect Extraction


Unnamed: 0_level_0,full,full,full,1000,1000,1000,500,500,500
Unnamed: 0_level_1,short,long,context,short,long,context,short,long,context
acd,87.87,87.83,,86.65,86.45,,86.12,83.24,
acsa,86.58,86.34,85.77,83.16,79.17,83.52,82.52,82.96,85.09
acsd,87.01,87.69,86.63,87.14,86.11,85.32,86.6,86.16,83.81


Average difference ACSA to ACD: -2.46
Average difference ACSD to ACSA: 2.37
Average difference ACSD to ACD: -0.09


In [40]:
# Additional Eval

runs = []
RESULTS_PATH = '../results_final/'
col_names = ['model_lang', 'dataset', 'model_shots', 'model_prompt', 'model_task', 'lr', 'lora_r', 'lora_alpha', 'lora_dropout', 'model_quant', 'split', 'lr_setting', 'model_name', 'lang', 'shots', 'prompt', 'task', 'quant', 'epoch', 'model_config', 'path', 'f1-micro', 'f1-macro', 'accuracy']
folder_names = [folder for folder in os.listdir(RESULTS_PATH) if os.path.isdir(os.path.join(RESULTS_PATH, folder)) and folder != '.ipynb_checkpoints']

for folder_name in folder_names:
    try:
        df = pd.read_csv(RESULTS_PATH + folder_name + '/metrics_asp_pol.tsv', sep = '\t')
        df = df.set_index(df.columns[0])
        cond_name = folder_name.split('/')[-1]
        cond_parameters = cond_name.split('_')

        model_config = cond_parameters.copy()
        
        # Remove split column from config string
        model_config_full = model_config.copy()
        model_config.pop(10)

        # Remove epoch column from config string
        model_config.pop(-1)
        
        cond_parameters.append('_'.join(model_config))
        cond_parameters.append('_'.join(model_config_full))
        cond_parameters.append(df.loc['Micro-AVG', 'f1'])
        cond_parameters.append(df.loc['Macro-AVG', 'f1'])
        cond_parameters.append(df.loc['Micro-AVG', 'accuracy'])
        runs.append(cond_parameters)
    except:
        pass

results_all = pd.DataFrame(runs, columns = col_names)

# Define the columns as multi-index
columns = pd.MultiIndex.from_tuples([
    ('full', 'short'), ('full', 'long'), ('full', 'context'),
    ('1000', 'short'), ('1000', 'long'), ('1000', 'context'),
    ('500', 'short'), ('500', 'long'), ('500', 'context')
])

# Define the row indices
index = ['acd', 'acsa', 'acsd']

# Create an empty DataFrame with the defined structure
df = pd.DataFrame(np.nan, index=index, columns=columns)

for col in df.columns:
    df[col] = 'N/A'

for task in ['acd', 'acsa', 'acsd']:
    for lr_setting in ['full', '1000', '500']:
        for a, b in results_all[np.logical_and.reduce([results_all['dataset'] == 'GERestaurant', 
                                                         results_all['task'] == task, 
                                                         results_all['model_name'] == args.model,
                                                         results_all['split'] != str(0),
                                                         results_all['lr_setting'] == lr_setting,
                                                         results_all['model_lang'] == 'en'])].groupby(['model_config']):

            prompt_name = a[0].split('_')[3] if a[0].split('_')[3] != 'cot' else 'context'
            df.at[task, (lr_setting, prompt_name)] = f"{b['f1-micro'].mean()*100:.2f}"
            
print('Aspect + Polarity Extraction')
display(df)

f"Average difference: {(np.mean([float(i) for i in list(df.loc['acsd'])]) - np.mean([float(i) for i in list(df.loc['acsa'])])):.2f}"


Aspect + Polarity Extraction


Unnamed: 0_level_0,full,full,full,1000,1000,1000,500,500,500
Unnamed: 0_level_1,short,long,context,short,long,context,short,long,context
acd,,,,,,,,,
acsa,83.64,83.22,82.54,79.61,76.24,80.57,78.91,80.11,81.83
acsd,83.75,85.1,83.46,83.89,83.27,82.25,82.34,82.96,80.19


'Average difference: 2.28'

In [10]:
# Eval for best parameter combination over all tasks and dataset sizes

RESULTS_PATH = '../results_final'
DATASET = 'GERestaurant'

col_names = ['lang', 'dataset', 'few_shot', 'prompt', 'task', 'learning_rate', 'lora_r', 'lora_alpha', 'lora_dropout', 'quant', 'split', 'lr_setting', 'model', 'prompt_lang', 'prompt_few_shot', 'prompt_prompt', 'prompt_task', 'prompt_quant', 'epoch', 'f1-micro', 'f1-macro', 'accuracy']
folder_names = [folder for folder in os.listdir(os.path.join(RESULTS_PATH)) if os.path.isdir(os.path.join(RESULTS_PATH, folder)) and folder != '.ipynb_checkpoints']
runs = []

for folder_name in folder_names:
    try:
        cond_name = folder_name.split('/')[-1]
        cond_parameters = cond_name.split('_')
        
        filename = ''
        
        if cond_parameters[4] == 'acd':
            filename = 'metrics_asp.tsv'
        elif cond_parameters[4] == 'acsa':
            filename = 'metrics_asp_pol.tsv'
        elif cond_parameters[4] == 'e2e':
            filename = 'pol.tsv'
        elif cond_parameters[4] == 'acsd':
            filename = 'metrics_phrases.tsv'
            
        df = pd.read_csv(os.path.join(RESULTS_PATH, folder_name, filename), sep = '\t')
        df = df.set_index(df.columns[0])
        
        cond_parameters.append(df.loc['Micro-AVG', 'f1'])
        cond_parameters.append(df.loc['Macro-AVG', 'f1'])
        cond_parameters.append(df.loc['Micro-AVG', 'accuracy'])
        runs.append(cond_parameters)
    except:
        pass

results_all = pd.DataFrame(runs, columns = col_names)

results_sub = results_all[np.logical_and.reduce([results_all['dataset'] == DATASET, results_all['split'] == '0'])].sort_values(by = ['f1-micro'], ascending = False)
results_sub = results_sub[results_sub['lr_setting'] != 'orig']
results_sub = results_sub[['dataset', 'task', 'prompt', 'learning_rate', 'lr_setting', 'lora_r', 'lora_alpha', 'epoch', 'f1-micro', 'f1-macro']]
results_sub = results_sub.reset_index()

idx_max = results_sub.groupby(['lr_setting', 'task', 'prompt', 'learning_rate', 'lora_r', 'lora_alpha'])['f1-micro'].idxmax()
results_per_epoch = results_sub.loc[idx_max]

results_per_epoch.groupby(['learning_rate', 'lora_r', 'lora_alpha'])['f1-micro'].mean()

learning_rate  lora_r  lora_alpha
0.0003         32      32            0.830133
                       64            0.794217
               8       16            0.838121
                       8             0.844250
3e-05          32      32            0.834604
                       64            0.842721
               8       16            0.830458
                       8             0.818429
Name: f1-micro, dtype: float64