## Language

In [186]:
import pandas as pd
import os
import sys
import numpy as np
import pandas as pd
from scipy.stats import kruskal, mannwhitneyu
from sklearn.metrics import f1_score
from sklearn.utils import resample
from itertools import combinations

import random
import scikit_posthocs as sp
import scipy.stats as stats
import numpy as np

utils = os.path.abspath('../src/utils/')
sys.path.append(utils)

from preprocessing import loadDataset
from evaluation import extractAspects, convertLabels, createResults
from types import SimpleNamespace
from pingouin import kruskal
import pingouin as pg
import chardet
import codecs

pd.set_option('display.max_columns', None)
random.seed(42)

args = SimpleNamespace(**{
    'dataset': 'GERestaurant',
})

stats_acd = {}
stats_acsa = {}
stats_e2e = {}
stats_tasd = {}

def computePromptStatistics(args):
    if args.lr_setting == 0:
        lr_setting = 'full'
    else:
        lr_setting = str(args.lr_setting)
    
    results_sub = args.results[np.logical_and.reduce([args.results['dataset'] == args.dataset, 
                                                         args.results['task'] == args.task,
                                                         args.results['split'] != str(0),
                                                         args.results['lr_setting'] == lr_setting])].sort_values(by = ['f1-micro'], ascending = False)

    results_sub_baseline = args.results_baseline[np.logical_and.reduce([args.results_baseline['lr_setting'] == str(lr_setting), 
                                                                        args.results_baseline['dataset'] == args.dataset, 
                                                                        args.results_baseline['task'] == args.task, 
                                                                        args.results_baseline['split'] != str(0)])].sort_values(by = ['f1-micro'], ascending = False)

    display(results_sub_baseline)
    results_sub = results_sub[['dataset', 'task', 'prompt', 'learning_rate', 'lora_r', 'lora_alpha', 'lora_dropout', 'split', 'lr_setting', 'epoch', 'model_config', 'path', 'f1-micro', 'f1-macro', 'accuracy']]
    results_sub_baseline = results_sub_baseline[['task', 'method', 'dataset', 'learning_rate', 'batch_size', 'lr_setting', 'split', 'epochs', 'f1-micro', 'f1-macro', 'accuracy']]

    idx_max = results_sub.groupby(['model_config', 'split'])['f1-micro'].idxmax()
    results_per_epoch = results_sub.loc[idx_max]
    
    if args.task == 'acd':
        prompts = ['basic', 'context']
        baselines = ['hier_gcn', 'bert_clf']
    elif args.task == 'acsa':
        prompts = ['basic', 'context', 'cot']
        baselines = ['hier_gcn', 'bert_clf']
    elif args.task == 'e2e' or args.task == 'e2e-e':
        prompts = ['basic', 'context', 'cot']
        baselines = ['instructABSA', 'tas_bert']
    else:
        prompts = ['basic', 'context', 'cot']
        baselines = ['para', 'mvp']
        
    f1_prompts = {}
    
    for prompt in prompts:
        f1 = {}
        try:
            for i in range(1, 6): 
                f1[i] = results_per_epoch[np.logical_and.reduce([results_per_epoch['split'] == str(i),results_per_epoch['prompt'] == prompt])].iloc[0,12]
            f1_prompts[prompt] = f1
        except:
            pass
    
    for method in baselines:
        f1 = {}
        # try:
        for i in range(1, 6): 
            f1[i] = results_sub_baseline[np.logical_and.reduce([results_sub_baseline['split'] == str(i), results_sub_baseline['method'] == method])].iloc[0,8]
        f1_prompts[method] = f1
        # except:
        #     pass
    
    df_prompts = pd.DataFrame(f1_prompts)
    
    display(df_prompts)


    # Only use the best performing FT-LLM prompt
    available_prompts = [prompt for prompt in prompts if prompt in df_prompts.columns]

    # Calculate the average F1 scores
    avg_f1 = df_prompts[available_prompts].mean()
    
    # Find the best prompt
    best_prompt = avg_f1.idxmax()
    
    # Identify prompts to drop
    prompts_to_drop = [prompt for prompt in prompts if prompt != best_prompt]
    
    # Drop the other prompts
    df_prompts = df_prompts.drop(columns=prompts_to_drop)

    normality_results = {col: pg.normality(df_prompts[col]) for col in df_prompts.columns}

    for key, item in normality_results.items():
        display(item)
    
    all_normal = all([result['normal'].iloc[0] for result in normality_results.values()])

    print(df_prompts.melt(var_name='prompt', value_name='f1', ignore_index=False).reset_index().rename(columns={'index': 'split'}))
    
    if all_normal:
        # Wenn alle Spalten normalverteilt sind, verwende repeated measures ANOVA
        rm_anova = pg.rm_anova(dv='f1', within='prompt', subject='split', data=df_prompts.melt(var_name='prompt', value_name='f1', ignore_index=False).reset_index().rename(columns={'index': 'split'}))
        print("Repeated Measures ANOVA Result:")
        print(rm_anova)
    else:
        # Wenn nicht alle Spalten normalverteilt sind, verwende den Friedman-Test
        friedman = pg.friedman(df_prompts)
        print("Friedman Test Result:")
        print(friedman)

    # Paarweise Vergleiche
    results = []
    columns = df_prompts.columns
    comb = combinations(columns, 2)
    
    for col1, col2 in comb:
        # if not ((col1 == baselines[0] and col2 == baselines[1]) or (col1 == baselines[1] and col2 == baselines[0])):
        # Falls beide Kolonnen normalverteilt sind, gepaarter t-Test
        if all_normal:
            test = 't-test'
            test_result = pg.ttest(df_prompts[col1], df_prompts[col2], paired=True, alternative = 'two-sided')
            statistic = test_result['T']['T-test']
        else:
            # Falls nicht, Wilcoxon-Test
            test = 'wilcoxon'
            test_result = pg.wilcoxon(df_prompts[col1], df_prompts[col2], alternative = 'two-sided')
            statistic = test_result['W-val']['Wilcoxon']
        
        result = {
            'test': test,
            'comparison': f'{col1} vs {col2}',
            'mean 1': np.mean(df_prompts[col1]),
            'std 1': np.std(df_prompts[col1]),
            'mean 2': np.mean(df_prompts[col2]),
            'std 2': np.std(df_prompts[col2]),
            'statistic': statistic,
            'p_value': test_result['p-val'].iloc[0]
        }
        results.append(result)
    
    # Erstellung eines DataFrames für die Testergebnisse
    results_df = pd.DataFrame(results)
    
    # Durchführung der Bonferroni-Holm-Korrektur
    corrected_p = pg.multicomp(results_df['p_value'], method='holm', alpha = 0.05)
    results_df['corrected_p_value'] = corrected_p[1]
    results_df['significant'] = corrected_p[0]
    
    return results_df

def computeLowResourceStatistics(args):
    results_sub = args.results[np.logical_and.reduce([args.results['dataset'] == args.dataset, 
                                                         args.results['task'] == args.task,
                                                         args.results['split'] != str(0)])].sort_values(by = ['f1-micro'], ascending = False)

    results_sub = results_sub[['dataset', 'task', 'prompt', 'lr', 'lora_r', 'lora_alpha', 'lora_dropout', 'split', 'lr_setting', 'epoch', 'model_config', 'path', 'f1-micro', 'f1-macro', 'accuracy']]

    if args.task == 'acd':
        prompts = ['short', 'long']
    else:
        prompts = ['short', 'long', 'cot']

    for prompt in prompts:
        
        f1_splits = {}
        
        for lr_setting in ['1000','500','full']:
            f1 = {}
            try:
                for i in range(1, 6): 
                    f1[i] = results_sub[np.logical_and.reduce([results_sub['lr_setting'] == lr_setting, results_sub['split'] == str(i), results_sub['prompt'] == prompt])].iloc[0,12]
                f1_splits[lr_setting] = f1
            except:
                pass
        
        df_splits = pd.DataFrame(f1_splits)
        
        display(df_splits)
    
        normality_results = {col: pg.normality(df_splits[col]) for col in df_splits.columns}
    
        for key, item in normality_results.items():
            display(item)
        
        all_normal = all([result['normal'].iloc[0] for result in normality_results.values()])
    
        print(df_splits.melt(var_name='prompt', value_name='f1', ignore_index=False).reset_index().rename(columns={'index': 'split'}))
        
        if all_normal:
            # Wenn alle Spalten normalverteilt sind, verwende repeated measures ANOVA
            rm_anova = pg.rm_anova(dv='f1', within='prompt', subject='split', data=df_splits.melt(var_name='prompt', value_name='f1', ignore_index=False).reset_index().rename(columns={'index': 'split'}))
            print("Repeated Measures ANOVA Result:")
            print(rm_anova)
        else:
            # Wenn nicht alle Spalten normalverteilt sind, verwende den Friedman-Test
            friedman = pg.friedman(df_splits)
            print("Friedman Test Result:")
            print(friedman)
    
        # Paarweise Vergleiche
        results = []
        columns = df_splits.columns
        comb = combinations(columns, 2)
        
        for col1, col2 in comb:
            # Falls beide Kolonnen normalverteilt sind, gepaarter t-Test
            if all_normal:
                test = 't-test'
                test_result = pg.ttest(df_splits[col1], df_splits[col2], paired=True, alternative = 'two-sided')
                statistic = test_result['T']['T-test']
            else:
                # Falls nicht, Wilcoxon-Test
                test = 'wilcoxon'
                test_result = pg.wilcoxon(df_splits[col1], df_splits[col2], alternative = 'two-sided')
                statistic = test_result['W-val']['Wilcoxon']
            
            result = {
                'test': test,
                'comparison': f'{col1} vs {col2}',
                'mean 1': np.mean(df_splits[col1]),
                'std 1': np.std(df_splits[col1]),
                'mean 2': np.mean(df_splits[col2]),
                'std 2': np.std(df_splits[col2]),
                'statistic': statistic,
                'p_value': test_result['p-val'].iloc[0]
            }
            results.append(result)
        
        # Erstellung eines DataFrames für die Testergebnisse
        results_df = pd.DataFrame(results)
        
        # Durchführung der Bonferroni-Holm-Korrektur
        corrected_p = pg.multicomp(results_df['p_value'], method='holm', alpha = 0.05)
        results_df['corrected_p_value'] = corrected_p[1]
        results_df['significant'] = corrected_p[0]

        print('Results for LR-Comparison of : ', prompt)
        display(results_df)


    
    ####
    # Compute based on best performing prompt per low-resource setting
    ####


    
    f1_splits = {}
    
    for prompt in prompts:
        
        for lr_setting in ['1000','500','full']:
            f1 = {}
            try:
                for i in range(1, 6): 
                    f1[i] = results_sub[np.logical_and.reduce([results_sub['lr_setting'] == lr_setting, results_sub['split'] == str(i), results_sub['prompt'] == prompt])].iloc[0,14]
    
                if lr_setting not in f1_splits.keys() or np.mean(list(f1.values())) > np.mean(list(f1_splits[lr_setting].values())):
                    f1_splits[lr_setting] = f1
            except:
                pass

    df_splits = pd.DataFrame(f1_splits)
        
    display(df_splits)

    normality_results = {col: pg.normality(df_splits[col]) for col in df_splits.columns}

    for key, item in normality_results.items():
        display(item)
    
    all_normal = all([result['normal'].iloc[0] for result in normality_results.values()])

    print(df_splits.melt(var_name='prompt', value_name='f1', ignore_index=False).reset_index().rename(columns={'index': 'split'}))
    
    if all_normal:
        # Wenn alle Spalten normalverteilt sind, verwende repeated measures ANOVA
        rm_anova = pg.rm_anova(dv='f1', within='prompt', subject='split', data=df_splits.melt(var_name='prompt', value_name='f1', ignore_index=False).reset_index().rename(columns={'index': 'split'}))
        print("Repeated Measures ANOVA Result:")
        print(rm_anova)
    else:
        # Wenn nicht alle Spalten normalverteilt sind, verwende den Friedman-Test
        friedman = pg.friedman(df_splits)
        print("Friedman Test Result:")
        print(friedman)

    # Paarweise Vergleiche
    results = []
    columns = df_splits.columns
    comb = combinations(columns, 2)
    
    for col1, col2 in comb:
        # Falls beide Kolonnen normalverteilt sind, gepaarter t-Test
        if all_normal:
            test = 't-test'
            test_result = pg.ttest(df_splits[col1], df_splits[col2], paired=True, alternative = 'two-sided')
            statistic = test_result['T']['T-test']
        else:
            # Falls nicht, Wilcoxon-Test
            test = 'wilcoxon'
            test_result = pg.wilcoxon(df_splits[col1], df_splits[col2], alternative = 'two-sided')
            statistic = test_result['W-val']['Wilcoxon']
        
        result = {
            'test': test,
            'comparison': f'{col1} vs {col2}',
            'mean 1': np.mean(df_splits[col1]),
            'std 1': np.std(df_splits[col1]),
            'mean 2': np.mean(df_splits[col2]),
            'std 2': np.std(df_splits[col2]),
            'statistic': statistic,
            'p_value': test_result['p-val'].iloc[0]
        }
        results.append(result)
    
    # Erstellung eines DataFrames für die Testergebnisse
    results_df = pd.DataFrame(results)
    
    # Durchführung der Bonferroni-Holm-Korrektur
    corrected_p = pg.multicomp(results_df['p_value'], method='holm', alpha = 0.05)
    results_df['corrected_p_value'] = corrected_p[1]
    results_df['significant'] = corrected_p[0]

    print('Results for LR-Comparison of best Prompt per LR-Setting')
    display(results_df)



## ACD

In [154]:
# LLM-based Method

runs = []
RESULTS_PATH = '../results/ft_llm/'
col_names = ['task', 'dataset', 'prompt', 'learning_rate', 'lora_r', 'lora_alpha', 'lora_dropout', 'split', 'lr_setting', 'epoch', 'model_config', 'path', 'f1-micro', 'f1-macro', 'accuracy']
folder_names = [folder for folder in os.listdir(os.path.join(RESULTS_PATH)) if os.path.isdir(os.path.join(RESULTS_PATH, folder)) and folder != '.ipynb_checkpoints']
runs = []

for folder_name in folder_names:
    try:
        cond_parameters = folder_name.split('_')
        filename = ''
        
        if cond_parameters[0] == 'acd':
            filename = 'metrics_asp.tsv'
        elif cond_parameters[0] == 'acsa':
            filename = 'metrics_asp_pol.tsv'
        elif cond_parameters[0] == 'e2e':
            filename = 'metrics_pol.tsv'
        elif cond_parameters[0] == 'tasd':
            filename = 'metrics_phrases.tsv'
            
        df = pd.read_csv(os.path.join(RESULTS_PATH, folder_name, filename), sep = '\t')
        df = df.set_index(df.columns[0])

        model_config = cond_parameters.copy()
        
        # Remove split column from config string
        model_config_full = model_config.copy()
        model_config.pop(7)

        # Remove epoch column from config string
        model_config.pop(-1)
        
        cond_parameters.append('_'.join(model_config))
        cond_parameters.append('_'.join(model_config_full))
        cond_parameters.append(df.loc['Micro-AVG', 'f1'])
        cond_parameters.append(df.loc['Macro-AVG', 'f1'])
        cond_parameters.append(df.loc['Micro-AVG', 'accuracy'])
        runs.append(cond_parameters)
    except:
        pass

results_all = pd.DataFrame(runs, columns = col_names)

###
# Baselines
##

# Multi-label Classifiaction
METHOD = 'bert_clf'
RESULTS_PATH = '../results/'

# col_names = ['task', 'dataset', 'lr-setting', 'split', 'learning-rate', 'batch_size', 'epochs', 'f1-micro', 'f1-macro', 'accuracy']
col_names = ['task', 'method', 'dataset', 'lr_setting', 'split', 'learning_rate', 'batch_size', 'epochs', 'f1-micro', 'f1-macro', 'accuracy']
folder_names = [folder for folder in os.listdir(os.path.join(RESULTS_PATH, METHOD)) if os.path.isdir(os.path.join(RESULTS_PATH, METHOD, folder)) and folder != '.ipynb_checkpoints']
runs = []

for folder_name in folder_names:
    try:
        df = pd.read_csv(os.path.join(RESULTS_PATH, METHOD, folder_name, 'metrics_asp.tsv'), sep = '\t')
        df = df.set_index(df.columns[0])
        cond_parameters = folder_name.split('_')
        
        cond_parameters.append(df.loc['Micro-AVG', 'f1'])
        cond_parameters.append(df.loc['Macro-AVG', 'f1'])
        cond_parameters.append(df.loc['Micro-AVG', 'accuracy'])
        cond_parameters[1:1] = [METHOD]
        runs.append(cond_parameters)
    except:
        pass

# Hier-GCN
METHOD = 'hier_gcn'
RESULTS_PATH = '../results/'

col_names = ['task', 'method', 'dataset', 'lr_setting', 'split', 'learning_rate', 'batch_size', 'epochs', 'f1-micro', 'f1-macro', 'accuracy']


folder_names = [folder for folder in os.listdir(os.path.join(RESULTS_PATH, METHOD)) if os.path.isdir(os.path.join(RESULTS_PATH, METHOD, folder)) and folder != '.ipynb_checkpoints']

for folder_name in folder_names:
    cond_parameters = folder_name.split('_')
    cond_params = cond_parameters.copy()
    if cond_params[0] == 'acd':
        with open(os.path.join(RESULTS_PATH, METHOD, folder_name, 'cate_eval_results.txt'), 'r') as f:
            f1 = f.readlines()[3].split(' = ')[1]
                
        cond_params[1:1] = [METHOD]
        cond_params.extend([round(float(f1), 4), None, None])
        runs.append(cond_params)


results_baseline = pd.DataFrame(runs, columns = col_names)

args.results = results_all
args.results_baseline = results_baseline

### Full Dataset

In [88]:
args.lr_setting = 0
args.task = 'acd'

stats_acd['0'] = computePromptStatistics(args)
stats_acd['0']

Unnamed: 0,task,method,dataset,lr_setting,split,learning_rate,batch_size,epochs,f1-micro,f1-macro,accuracy
114,acd,bert_clf,GERestaurant,0,2,2e-05,16,3.0,0.9326,0.9189,0.8737
51,acd,bert_clf,GERestaurant,0,4,2e-05,16,3.0,0.9264,0.9183,0.8628
139,acd,bert_clf,GERestaurant,0,3,2e-05,16,3.0,0.9224,0.9094,0.8559
3,acd,bert_clf,GERestaurant,0,5,2e-05,16,3.0,0.9186,0.8959,0.8495
117,acd,bert_clf,GERestaurant,0,1,2e-05,16,3.0,0.9147,0.9076,0.8429
160,acd,hier_gcn,GERestaurant,0,2,5e-05,8,20.0,0.9107,,
171,acd,hier_gcn,GERestaurant,0,5,5e-05,8,20.0,0.9011,,
176,acd,hier_gcn,GERestaurant,0,3,5e-05,8,20.0,0.8932,,
167,acd,hier_gcn,GERestaurant,0,4,5e-05,8,20.0,0.892,,
168,acd,hier_gcn,GERestaurant,0,1,5e-05,8,20.0,0.8886,,


Unnamed: 0,short,long,hier_gcn,bert_clf
1,0.8748,0.8801,0.8886,0.9147
2,0.8663,0.8698,0.9107,0.9326
3,0.8757,0.8674,0.8932,0.9224
4,0.8943,0.8895,0.892,0.9264
5,0.8827,0.8846,0.9011,0.9186


Unnamed: 0,W,pval,normal
short,0.962545,0.825588,True


Unnamed: 0,W,pval,normal
hier_gcn,0.906726,0.448167,True


Unnamed: 0,W,pval,normal
bert_clf,0.987348,0.969604,True


    split    prompt      f1
0       1     short  0.8748
1       2     short  0.8663
2       3     short  0.8757
3       4     short  0.8943
4       5     short  0.8827
5       1  hier_gcn  0.8886
6       2  hier_gcn  0.9107
7       3  hier_gcn  0.8932
8       4  hier_gcn  0.8920
9       5  hier_gcn  0.9011
10      1  bert_clf  0.9147
11      2  bert_clf  0.9326
12      3  bert_clf  0.9224
13      4  bert_clf  0.9264
14      5  bert_clf  0.9186
Repeated Measures ANOVA Result:
   Source  ddof1  ddof2          F     p-unc       ng2       eps
0  prompt      2      8  29.197103  0.000211  0.839176  0.596575


  W = np.prod(eig) / (eig.sum() / d) ** d


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,t-test,short vs hier_gcn,0.87876,0.009352,0.89712,0.00793,-2.446422,0.070717,0.070717,False
1,t-test,short vs bert_clf,0.87876,0.009352,0.92294,0.006202,-7.320512,0.001853,0.003705,True
2,t-test,hier_gcn vs bert_clf,0.89712,0.00793,0.92294,0.006202,-8.863803,0.000895,0.002684,True


### 1000

In [89]:
args.lr_setting = 1000
args.task = 'acd'

stats_acd['1000'] = computePromptStatistics(args)
stats_acd['1000']

Unnamed: 0,task,method,dataset,lr_setting,split,learning_rate,batch_size,epochs,f1-micro,f1-macro,accuracy
46,acd,bert_clf,GERestaurant,1000,4,2e-05,16,6.0,0.9254,0.9198,0.8612
28,acd,bert_clf,GERestaurant,1000,5,2e-05,16,6.0,0.9202,0.8965,0.8522
134,acd,bert_clf,GERestaurant,1000,2,2e-05,16,6.0,0.9062,0.8848,0.8285
45,acd,bert_clf,GERestaurant,1000,3,2e-05,16,6.0,0.9053,0.8866,0.827
133,acd,bert_clf,GERestaurant,1000,1,2e-05,16,6.0,0.904,0.8944,0.8249
178,acd,hier_gcn,GERestaurant,1000,3,5e-05,8,43.0,0.8904,,
169,acd,hier_gcn,GERestaurant,1000,2,5e-05,8,43.0,0.8785,,
183,acd,hier_gcn,GERestaurant,1000,5,5e-05,8,43.0,0.872,,
170,acd,hier_gcn,GERestaurant,1000,4,5e-05,8,43.0,0.8682,,
180,acd,hier_gcn,GERestaurant,1000,1,5e-05,8,43.0,0.8614,,


Unnamed: 0,short,long,hier_gcn,bert_clf
1,0.8798,0.8698,0.8614,0.904
2,0.8423,0.8509,0.8785,0.9062
3,0.8625,0.8555,0.8904,0.9053
4,0.8952,0.8993,0.8682,0.9254
5,0.8527,0.8469,0.872,0.9202


Unnamed: 0,W,pval,normal
short,0.970018,0.87537,True


Unnamed: 0,W,pval,normal
hier_gcn,0.975207,0.907495,True


Unnamed: 0,W,pval,normal
bert_clf,0.816104,0.108917,True


    split    prompt      f1
0       1     short  0.8798
1       2     short  0.8423
2       3     short  0.8625
3       4     short  0.8952
4       5     short  0.8527
5       1  hier_gcn  0.8614
6       2  hier_gcn  0.8785
7       3  hier_gcn  0.8904
8       4  hier_gcn  0.8682
9       5  hier_gcn  0.8720
10      1  bert_clf  0.9040
11      2  bert_clf  0.9062
12      3  bert_clf  0.9053
13      4  bert_clf  0.9254
14      5  bert_clf  0.9202
Repeated Measures ANOVA Result:
   Source  ddof1  ddof2          F     p-unc       ng2       eps
0  prompt      2      8  12.226242  0.003693  0.692328  0.696723


  W = np.prod(eig) / (eig.sum() / d) ** d


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,t-test,short vs hier_gcn,0.8665,0.018935,0.8741,0.009855,-0.597103,0.582594,0.582594,False
1,t-test,short vs bert_clf,0.8665,0.018935,0.91222,0.008821,-5.248956,0.006302,0.018905,True
2,t-test,hier_gcn vs bert_clf,0.8741,0.009855,0.91222,0.008821,-5.063645,0.007162,0.018905,True


### 500

In [90]:
args.lr_setting = 500
args.task = 'acd'

stats_acd['500'] = computePromptStatistics(args)
stats_acd['500']

Unnamed: 0,task,method,dataset,lr_setting,split,learning_rate,batch_size,epochs,f1-micro,f1-macro,accuracy
137,acd,bert_clf,GERestaurant,500,4,2e-05,16,13.0,0.9085,0.9085,0.8323
149,acd,bert_clf,GERestaurant,500,5,2e-05,16,13.0,0.9061,0.8819,0.8284
101,acd,bert_clf,GERestaurant,500,3,2e-05,16,13.0,0.9031,0.8849,0.8233
131,acd,bert_clf,GERestaurant,500,1,2e-05,16,13.0,0.8969,0.8879,0.8131
69,acd,bert_clf,GERestaurant,500,2,2e-05,16,13.0,0.8917,0.8692,0.8046
156,acd,hier_gcn,GERestaurant,500,3,5e-05,8,86.0,0.8828,,
181,acd,hier_gcn,GERestaurant,500,2,5e-05,8,86.0,0.8614,,
185,acd,hier_gcn,GERestaurant,500,5,5e-05,8,86.0,0.8548,,
172,acd,hier_gcn,GERestaurant,500,4,5e-05,8,86.0,0.8427,,
173,acd,hier_gcn,GERestaurant,500,1,5e-05,8,86.0,0.8341,,


Unnamed: 0,short,long,hier_gcn,bert_clf
1,0.887,0.8369,0.8341,0.8969
2,0.8264,0.8215,0.8614,0.8917
3,0.8558,0.8258,0.8828,0.9031
4,0.8674,0.8406,0.8427,0.9085
5,0.8694,0.837,0.8548,0.9061


Unnamed: 0,W,pval,normal
short,0.943288,0.689265,True


Unnamed: 0,W,pval,normal
hier_gcn,0.970341,0.877435,True


Unnamed: 0,W,pval,normal
bert_clf,0.944973,0.701274,True


    split    prompt      f1
0       1     short  0.8870
1       2     short  0.8264
2       3     short  0.8558
3       4     short  0.8674
4       5     short  0.8694
5       1  hier_gcn  0.8341
6       2  hier_gcn  0.8614
7       3  hier_gcn  0.8828
8       4  hier_gcn  0.8427
9       5  hier_gcn  0.8548
10      1  bert_clf  0.8969
11      2  bert_clf  0.8917
12      3  bert_clf  0.9031
13      4  bert_clf  0.9085
14      5  bert_clf  0.9061
Repeated Measures ANOVA Result:
   Source  ddof1  ddof2         F     p-unc       ng2       eps
0  prompt      2      8  8.745031  0.009702  0.635344  0.564519


  W = np.prod(eig) / (eig.sum() / d) ** d


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,t-test,short vs hier_gcn,0.8612,0.020058,0.85516,0.016744,0.367813,0.731649,0.731649,False
1,t-test,short vs bert_clf,0.8612,0.020058,0.90126,0.006158,-4.462873,0.011136,0.022271,True
2,t-test,hier_gcn vs bert_clf,0.85516,0.016744,0.90126,0.006158,-5.139222,0.006795,0.020385,True


In [91]:
args.task = 'acd'

computeLowResourceStatistics(args)

Unnamed: 0,1000,500,full
1,0.8798,0.887,0.8748
2,0.8423,0.8264,0.8663
3,0.8625,0.8558,0.8757
4,0.8952,0.8674,0.8943
5,0.8527,0.8694,0.8827


Unnamed: 0,W,pval,normal
1000,0.970018,0.87537,True


Unnamed: 0,W,pval,normal
500,0.943288,0.689265,True


Unnamed: 0,W,pval,normal
full,0.962545,0.825588,True


    split prompt      f1
0       1   1000  0.8798
1       2   1000  0.8423
2       3   1000  0.8625
3       4   1000  0.8952
4       5   1000  0.8527
5       1    500  0.8870
6       2    500  0.8264
7       3    500  0.8558
8       4    500  0.8674
9       5    500  0.8694
10      1   full  0.8748
11      2   full  0.8663
12      3   full  0.8757
13      4   full  0.8943
14      5   full  0.8827
Repeated Measures ANOVA Result:
   Source  ddof1  ddof2         F    p-unc       ng2       eps
0  prompt      2      8  2.644723  0.13132  0.160557  0.931991
Results for LR-Comparison of :  short


  W = np.prod(eig) / (eig.sum() / d) ** d


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,t-test,1000 vs 500,0.8665,0.018935,0.8612,0.020058,0.66752,0.540978,0.540978,False
1,t-test,1000 vs full,0.8665,0.018935,0.87876,0.009352,-1.803222,0.145691,0.336393,False
2,t-test,500 vs full,0.8612,0.020058,0.87876,0.009352,-2.03064,0.112131,0.336393,False


Unnamed: 0,1000,500,full
1,0.8698,0.8369,0.8801
2,0.8509,0.8215,0.8698
3,0.8555,0.8258,0.8674
4,0.8993,0.8406,0.8895
5,0.8469,0.837,0.8846


Unnamed: 0,W,pval,normal
1000,0.85488,0.210443,True


Unnamed: 0,W,pval,normal
500,0.874845,0.286601,True


Unnamed: 0,W,pval,normal
full,0.927728,0.580963,True


    split prompt      f1
0       1   1000  0.8698
1       2   1000  0.8509
2       3   1000  0.8555
3       4   1000  0.8993
4       5   1000  0.8469
5       1    500  0.8369
6       2    500  0.8215
7       3    500  0.8258
8       4    500  0.8406
9       5    500  0.8370
10      1   full  0.8801
11      2   full  0.8698
12      3   full  0.8674
13      4   full  0.8895
14      5   full  0.8846
Repeated Measures ANOVA Result:
   Source  ddof1  ddof2          F     p-unc       ng2       eps
0  prompt      2      8  27.441433  0.000262  0.694209  0.527279
Results for LR-Comparison of :  long


  W = np.prod(eig) / (eig.sum() / d) ** d


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,t-test,1000 vs 500,0.86448,0.019052,0.83236,0.007362,4.122545,0.014582,0.029164,True
1,t-test,1000 vs full,0.86448,0.019052,0.87828,0.008479,-1.804238,0.145519,0.145519,False
2,t-test,500 vs full,0.83236,0.007362,0.87828,0.008479,-31.162476,6e-06,1.9e-05,True


Unnamed: 0,1000,500,full
1,0.7853,0.797,0.7774
2,0.7276,0.7043,0.7641
3,0.7583,0.7478,0.7789
4,0.8103,0.7658,0.8088
5,0.7432,0.769,0.7901


Unnamed: 0,W,pval,normal
1000,0.967708,0.860352,True


Unnamed: 0,W,pval,normal
500,0.950806,0.742944,True


Unnamed: 0,W,pval,normal
full,0.961033,0.815159,True


    split prompt      f1
0       1   1000  0.7853
1       2   1000  0.7276
2       3   1000  0.7583
3       4   1000  0.8103
4       5   1000  0.7432
5       1    500  0.7970
6       2    500  0.7043
7       3    500  0.7478
8       4    500  0.7658
9       5    500  0.7690
10      1   full  0.7774
11      2   full  0.7641
12      3   full  0.7789
13      4   full  0.8088
14      5   full  0.7901
Repeated Measures ANOVA Result:
   Source  ddof1  ddof2         F     p-unc       ng2       eps
0  prompt      2      8  2.605186  0.134493  0.159307  0.935644
Results for LR-Comparison of best Prompt per LR-Setting


  W = np.prod(eig) / (eig.sum() / d) ** d


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,t-test,1000 vs 500,0.76494,0.029608,0.75678,0.030604,0.655111,0.548158,0.548158,False
1,t-test,1000 vs full,0.76494,0.029608,0.78386,0.014951,-1.791355,0.147717,0.336193,False
2,t-test,500 vs full,0.75678,0.030604,0.78386,0.014951,-2.031162,0.112064,0.336193,False


## ACSA

In [95]:
runs = []
RESULTS_PATH = '../results/ft_llm/'
col_names = ['task', 'dataset', 'prompt', 'learning_rate', 'lora_r', 'lora_alpha', 'lora_dropout', 'split', 'lr_setting', 'epoch', 'model_config', 'path', 'f1-micro', 'f1-macro', 'accuracy']
folder_names = [folder for folder in os.listdir(os.path.join(RESULTS_PATH)) if os.path.isdir(os.path.join(RESULTS_PATH, folder)) and folder != '.ipynb_checkpoints']
runs = []

for folder_name in folder_names:
    try:
        cond_parameters = folder_name.split('_')
        filename = ''
        
        if cond_parameters[0] == 'acd':
            filename = 'metrics_asp.tsv'
        elif cond_parameters[0] == 'acsa':
            filename = 'metrics_asp_pol.tsv'
        elif cond_parameters[0] == 'e2e':
            filename = 'metrics_pol.tsv'
        elif cond_parameters[0] == 'tasd':
            filename = 'metrics_phrases.tsv'
            
        df = pd.read_csv(os.path.join(RESULTS_PATH, folder_name, filename), sep = '\t')
        df = df.set_index(df.columns[0])

        model_config = cond_parameters.copy()
        
        # Remove split column from config string
        model_config_full = model_config.copy()
        model_config.pop(7)

        # Remove epoch column from config string
        model_config.pop(-1)
        
        cond_parameters.append('_'.join(model_config))
        cond_parameters.append('_'.join(model_config_full))
        cond_parameters.append(df.loc['Micro-AVG', 'f1'])
        cond_parameters.append(df.loc['Macro-AVG', 'f1'])
        cond_parameters.append(df.loc['Micro-AVG', 'accuracy'])
        runs.append(cond_parameters)
    except:
        pass

results_all = pd.DataFrame(runs, columns = col_names)

###
# Baselines
##

# Multi-label Classifiaction
METHOD = 'bert_clf'
RESULTS_PATH = '../results/'

# col_names = ['task', 'dataset', 'lr-setting', 'split', 'learning-rate', 'batch_size', 'epochs', 'f1-micro', 'f1-macro', 'accuracy']
col_names = ['task', 'method', 'dataset', 'lr_setting', 'split', 'learning_rate', 'batch_size', 'epochs', 'f1-micro', 'f1-macro', 'accuracy']
folder_names = [folder for folder in os.listdir(os.path.join(RESULTS_PATH, METHOD)) if os.path.isdir(os.path.join(RESULTS_PATH, METHOD, folder)) and folder != '.ipynb_checkpoints']
runs = []

for folder_name in folder_names:
    try:
        df = pd.read_csv(os.path.join(RESULTS_PATH, METHOD, folder_name, 'metrics_asp_pol.tsv'), sep = '\t')
        df = df.set_index(df.columns[0])
        cond_parameters = folder_name.split('_')
        
        cond_parameters.append(df.loc['Micro-AVG', 'f1'])
        cond_parameters.append(df.loc['Macro-AVG', 'f1'])
        cond_parameters.append(df.loc['Micro-AVG', 'accuracy'])
        cond_parameters[1:1] = [METHOD]
        runs.append(cond_parameters)
    except:
        pass

# Hier-GCN
METHOD = 'hier_gcn'
RESULTS_PATH = '../results/'

col_names = ['task', 'method', 'dataset', 'lr_setting', 'split', 'learning_rate', 'batch_size', 'epochs', 'f1-micro', 'f1-macro', 'accuracy']


folder_names = [folder for folder in os.listdir(os.path.join(RESULTS_PATH, METHOD)) if os.path.isdir(os.path.join(RESULTS_PATH, METHOD, folder)) and folder != '.ipynb_checkpoints']

for folder_name in folder_names:
    cond_parameters = folder_name.split('_')
    cond_params = cond_parameters.copy()
    if cond_params[0] == 'acsa':
        with open(os.path.join(RESULTS_PATH, METHOD, folder_name, 'eval_results.txt'), 'r') as f:
            f1 = f.readlines()[3].split(' = ')[1]
                
        cond_params[1:1] = [METHOD]
        cond_params.extend([round(float(f1), 4), None, None])
        runs.append(cond_params)


results_baseline = pd.DataFrame(runs, columns = col_names)

args.results = results_all
args.results_baseline = results_baseline

### Full Dataset

In [96]:
args.lr_setting = 0
args.task = 'acsa'

stats_acsa['0'] = computePromptStatistics(args)
stats_acsa['0']

Unnamed: 0,task,method,dataset,lr_setting,split,learning_rate,batch_size,epochs,f1-micro,f1-macro,accuracy
9,acsa,bert_clf,GERestaurant,0,5,2e-05,16,3.0,0.8543,0.7975,0.7457
89,acsa,hier_gcn,GERestaurant,0,4,5e-05,8,20.0,0.8333,,
84,acsa,hier_gcn,GERestaurant,0,5,5e-05,8,20.0,0.8322,,
36,acsa,bert_clf,GERestaurant,0,4,2e-05,16,3.0,0.8296,0.8076,0.7089
7,acsa,bert_clf,GERestaurant,0,3,2e-05,16,3.0,0.8261,0.7755,0.7038
23,acsa,bert_clf,GERestaurant,0,1,2e-05,16,3.0,0.8261,0.7812,0.7038
93,acsa,hier_gcn,GERestaurant,0,2,5e-05,8,20.0,0.8249,,
75,acsa,bert_clf,GERestaurant,0,2,2e-05,16,3.0,0.8225,0.7574,0.6985
102,acsa,hier_gcn,GERestaurant,0,3,5e-05,8,20.0,0.8205,,
91,acsa,hier_gcn,GERestaurant,0,1,5e-05,8,20.0,0.8136,,


Unnamed: 0,short,long,cot,hier_gcn,bert_clf
1,0.8348,0.8439,0.8532,0.8136,0.8261
2,0.8256,0.7776,0.8123,0.8249,0.8225
3,0.8226,0.8356,0.8187,0.8205,0.8261
4,0.8659,0.865,0.8234,0.8333,0.8296
5,0.8331,0.8387,0.8194,0.8322,0.8543


Unnamed: 0,W,pval,normal
short,0.807002,0.092301,True


Unnamed: 0,W,pval,normal
hier_gcn,0.935255,0.632617,True


Unnamed: 0,W,pval,normal
bert_clf,0.729227,0.018894,False


    split    prompt      f1
0       1     short  0.8348
1       2     short  0.8256
2       3     short  0.8226
3       4     short  0.8659
4       5     short  0.8331
5       1  hier_gcn  0.8136
6       2  hier_gcn  0.8249
7       3  hier_gcn  0.8205
8       4  hier_gcn  0.8333
9       5  hier_gcn  0.8322
10      1  bert_clf  0.8261
11      2  bert_clf  0.8225
12      3  bert_clf  0.8261
13      4  bert_clf  0.8296
14      5  bert_clf  0.8543
Friedman Test Result:
          Source     W  ddof1    Q     p-unc
Friedman  Within  0.36      2  3.6  0.165299


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,wilcoxon,short vs hier_gcn,0.8364,0.015432,0.8249,0.007361,0.0,0.0625,0.1875,False
1,wilcoxon,short vs bert_clf,0.8364,0.015432,0.83172,0.011511,6.0,0.8125,0.8125,False
2,wilcoxon,hier_gcn vs bert_clf,0.8249,0.007361,0.83172,0.011511,3.0,0.3125,0.625,False


### 1000

In [97]:
args.lr_setting = 1000
args.task = 'acsa'

stats_acsa['1000'] = computePromptStatistics(args)
stats_acsa['1000']

Unnamed: 0,task,method,dataset,lr_setting,split,learning_rate,batch_size,epochs,f1-micro,f1-macro,accuracy
30,acsa,bert_clf,GERestaurant,1000,5,2e-05,16,6.0,0.868,0.8081,0.7667
65,acsa,bert_clf,GERestaurant,1000,2,2e-05,16,6.0,0.8499,0.8077,0.7389
73,acsa,bert_clf,GERestaurant,1000,4,2e-05,16,6.0,0.8444,0.8175,0.7308
17,acsa,bert_clf,GERestaurant,1000,3,2e-05,16,6.0,0.8352,0.8014,0.717
20,acsa,bert_clf,GERestaurant,1000,1,2e-05,16,6.0,0.8206,0.7892,0.6958
86,acsa,hier_gcn,GERestaurant,1000,3,5e-05,8,43.0,0.8143,,
90,acsa,hier_gcn,GERestaurant,1000,2,5e-05,8,43.0,0.7991,,
117,acsa,hier_gcn,GERestaurant,1000,5,5e-05,8,43.0,0.7982,,
88,acsa,hier_gcn,GERestaurant,1000,4,5e-05,8,43.0,0.7765,,
80,acsa,hier_gcn,GERestaurant,1000,1,5e-05,8,43.0,0.7743,,


Unnamed: 0,short,long,cot,hier_gcn,bert_clf
1,0.8314,0.7744,0.773,0.7743,0.8206
2,0.7488,0.7457,0.802,0.7991,0.8499
3,0.8365,0.7368,0.802,0.8143,0.8352
4,0.8479,0.7973,0.8299,0.7765,0.8444
5,0.7157,0.7579,0.8216,0.7982,0.868


Unnamed: 0,W,pval,normal
cot,0.935542,0.634617,True


Unnamed: 0,W,pval,normal
hier_gcn,0.901674,0.419208,True


Unnamed: 0,W,pval,normal
bert_clf,0.994647,0.993171,True


    split    prompt      f1
0       1       cot  0.7730
1       2       cot  0.8020
2       3       cot  0.8020
3       4       cot  0.8299
4       5       cot  0.8216
5       1  hier_gcn  0.7743
6       2  hier_gcn  0.7991
7       3  hier_gcn  0.8143
8       4  hier_gcn  0.7765
9       5  hier_gcn  0.7982
10      1  bert_clf  0.8206
11      2  bert_clf  0.8499
12      3  bert_clf  0.8352
13      4  bert_clf  0.8444
14      5  bert_clf  0.8680
Repeated Measures ANOVA Result:
   Source  ddof1  ddof2          F     p-unc       ng2       eps
0  prompt      2      8  16.613767  0.001418  0.620547  0.713032


  W = np.prod(eig) / (eig.sum() / d) ** d


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,t-test,cot vs hier_gcn,0.8057,0.019673,0.79248,0.01509,1.140616,0.317685,0.317685,False
1,t-test,cot vs bert_clf,0.8057,0.019673,0.84362,0.015718,-5.865206,0.004219,0.012657,True
2,t-test,hier_gcn vs bert_clf,0.79248,0.01509,0.84362,0.015718,-5.777347,0.004458,0.012657,True


### 500

In [98]:
args.lr_setting = 500
args.task = 'acsa'

stats_acsa['500'] = computePromptStatistics(args)
stats_acsa['500']

Unnamed: 0,task,method,dataset,lr_setting,split,learning_rate,batch_size,epochs,f1-micro,f1-macro,accuracy
3,acsa,bert_clf,GERestaurant,500,5,2e-05,16,13.0,0.8426,0.7523,0.728
33,acsa,bert_clf,GERestaurant,500,3,2e-05,16,13.0,0.8154,0.767,0.6883
0,acsa,bert_clf,GERestaurant,500,4,2e-05,16,13.0,0.8148,0.779,0.6875
6,acsa,bert_clf,GERestaurant,500,2,2e-05,16,13.0,0.8075,0.757,0.6771
22,acsa,bert_clf,GERestaurant,500,1,2e-05,16,13.0,0.8044,0.7525,0.6728
111,acsa,hier_gcn,GERestaurant,500,4,5e-05,8,20.0,0.7576,,
99,acsa,hier_gcn,GERestaurant,500,3,5e-05,8,20.0,0.7491,,
104,acsa,hier_gcn,GERestaurant,500,5,5e-05,8,20.0,0.7436,,
82,acsa,hier_gcn,GERestaurant,500,2,5e-05,8,20.0,0.7267,,
107,acsa,hier_gcn,GERestaurant,500,1,5e-05,8,20.0,0.7143,,


Unnamed: 0,short,long,cot,hier_gcn,bert_clf
1,0.7748,0.8249,0.8168,0.7143,0.8044
2,0.7871,0.7988,0.7859,0.7267,0.8075
3,0.7951,0.7935,0.827,0.7491,0.8154
4,0.8316,0.8387,0.8436,0.7576,0.8148
5,0.7568,0.7496,0.8182,0.7436,0.8426


Unnamed: 0,W,pval,normal
cot,0.942179,0.681379,True


Unnamed: 0,W,pval,normal
hier_gcn,0.951836,0.750298,True


Unnamed: 0,W,pval,normal
bert_clf,0.817185,0.111049,True


    split    prompt      f1
0       1       cot  0.8168
1       2       cot  0.7859
2       3       cot  0.8270
3       4       cot  0.8436
4       5       cot  0.8182
5       1  hier_gcn  0.7143
6       2  hier_gcn  0.7267
7       3  hier_gcn  0.7491
8       4  hier_gcn  0.7576
9       5  hier_gcn  0.7436
10      1  bert_clf  0.8044
11      2  bert_clf  0.8075
12      3  bert_clf  0.8154
13      4  bert_clf  0.8148
14      5  bert_clf  0.8426
Repeated Measures ANOVA Result:
   Source  ddof1  ddof2          F     p-unc       ng2       eps
0  prompt      2      8  58.040876  0.000017  0.843113  0.797698


  W = np.prod(eig) / (eig.sum() / d) ** d


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,t-test,cot vs hier_gcn,0.8183,0.018803,0.73826,0.01567,11.271831,0.000353,0.001059,True
1,t-test,cot vs bert_clf,0.8183,0.018803,0.81694,0.013504,0.130551,0.902433,0.902433,False
2,t-test,hier_gcn vs bert_clf,0.73826,0.01567,0.81694,0.013504,-10.320217,0.000497,0.001059,True


In [99]:
args.task = 'acsa'

computeLowResourceStatistics(args)

Unnamed: 0,1000,500,full
1,0.8314,0.7748,0.8348
2,0.7488,0.7871,0.8256
3,0.8365,0.7951,0.8226
4,0.8479,0.8316,0.8659
5,0.7157,0.7568,0.8331


Unnamed: 0,W,pval,normal
1000,0.833497,0.147732,True


Unnamed: 0,W,pval,normal
500,0.964037,0.835776,True


Unnamed: 0,W,pval,normal
full,0.807002,0.092301,True


    split prompt      f1
0       1   1000  0.8314
1       2   1000  0.7488
2       3   1000  0.8365
3       4   1000  0.8479
4       5   1000  0.7157
5       1    500  0.7748
6       2    500  0.7871
7       3    500  0.7951
8       4    500  0.8316
9       5    500  0.7568
10      1   full  0.8348
11      2   full  0.8256
12      3   full  0.8226
13      4   full  0.8659
14      5   full  0.8331
Repeated Measures ANOVA Result:
   Source  ddof1  ddof2         F     p-unc       ng2       eps
0  prompt      2      8  3.590351  0.077125  0.260254  0.588467
Results for LR-Comparison of :  short


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,t-test,1000 vs 500,0.79606,0.05341,0.78908,0.024876,0.346931,0.746127,0.746127,False
1,t-test,1000 vs full,0.79606,0.05341,0.8364,0.015432,-1.641874,0.175961,0.351921,False
2,t-test,500 vs full,0.78908,0.024876,0.8364,0.015432,-5.224397,0.006408,0.019225,True


Unnamed: 0,1000,500,full
1,0.7744,0.8249,0.8439
2,0.7457,0.7988,0.7776
3,0.7368,0.7935,0.8356
4,0.7973,0.8387,0.865
5,0.7579,0.7496,0.8387


Unnamed: 0,W,pval,normal
1000,0.959187,0.802308,True


Unnamed: 0,W,pval,normal
500,0.950268,0.7391,True


Unnamed: 0,W,pval,normal
full,0.850142,0.194967,True


    split prompt      f1
0       1   1000  0.7744
1       2   1000  0.7457
2       3   1000  0.7368
3       4   1000  0.7973
4       5   1000  0.7579
5       1    500  0.8249
6       2    500  0.7988
7       3    500  0.7935
8       4    500  0.8387
9       5    500  0.7496
10      1   full  0.8439
11      2   full  0.7776
12      3   full  0.8356
13      4   full  0.8650
14      5   full  0.8387
Repeated Measures ANOVA Result:
   Source  ddof1  ddof2          F     p-unc       ng2       eps
0  prompt      2      8  12.542534  0.003418  0.520195  0.706028
Results for LR-Comparison of :  long


  W = np.prod(eig) / (eig.sum() / d) ** d


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,t-test,1000 vs 500,0.76242,0.021524,0.8011,0.030645,-3.219486,0.032296,0.064593,False
1,t-test,1000 vs full,0.76242,0.021524,0.83216,0.029147,-6.36253,0.003128,0.009385,True
2,t-test,500 vs full,0.8011,0.030645,0.83216,0.029147,-1.737701,0.15726,0.15726,False


Unnamed: 0,1000,500,full
1,0.773,0.8168,0.8532
2,0.802,0.7859,0.8123
3,0.802,0.827,0.8187
4,0.8299,0.8436,0.8234
5,0.8216,0.8182,0.8194


Unnamed: 0,W,pval,normal
1000,0.935542,0.634617,True


Unnamed: 0,W,pval,normal
500,0.942179,0.681379,True


Unnamed: 0,W,pval,normal
full,0.779059,0.054104,True


    split prompt      f1
0       1   1000  0.7730
1       2   1000  0.8020
2       3   1000  0.8020
3       4   1000  0.8299
4       5   1000  0.8216
5       1    500  0.8168
6       2    500  0.7859
7       3    500  0.8270
8       4    500  0.8436
9       5    500  0.8182
10      1   full  0.8532
11      2   full  0.8123
12      3   full  0.8187
13      4   full  0.8234
14      5   full  0.8194
Repeated Measures ANOVA Result:
   Source  ddof1  ddof2         F     p-unc       ng2      eps
0  prompt      2      8  1.273475  0.331018  0.173793  0.75167
Results for LR-Comparison of :  cot


  W = np.prod(eig) / (eig.sum() / d) ** d


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,t-test,1000 vs 500,0.8057,0.019673,0.8183,0.018803,-1.199157,0.296645,0.832851,False
1,t-test,1000 vs full,0.8057,0.019673,0.8254,0.014348,-1.255559,0.277617,0.832851,False
2,t-test,500 vs full,0.8183,0.018803,0.8254,0.014348,-0.669656,0.539749,0.832851,False


Unnamed: 0,1000,500,full
1,0.6299,0.6903,0.7164
2,0.6694,0.6474,0.7029
3,0.6695,0.705,0.6987
4,0.7093,0.7295,0.7635
5,0.6972,0.6923,0.714


Unnamed: 0,W,pval,normal
1000,0.939759,0.664233,True


Unnamed: 0,W,pval,normal
500,0.947801,0.721476,True


Unnamed: 0,W,pval,normal
full,0.799844,0.080797,True


    split prompt      f1
0       1   1000  0.6299
1       2   1000  0.6694
2       3   1000  0.6695
3       4   1000  0.7093
4       5   1000  0.6972
5       1    500  0.6903
6       2    500  0.6474
7       3    500  0.7050
8       4    500  0.7295
9       5    500  0.6923
10      1   full  0.7164
11      2   full  0.7029
12      3   full  0.6987
13      4   full  0.7635
14      5   full  0.7140
Repeated Measures ANOVA Result:
   Source  ddof1  ddof2         F     p-unc       ng2      eps
0  prompt      2      8  6.394666  0.021928  0.328922  0.84893
Results for LR-Comparison of best Prompt per LR-Setting


  W = np.prod(eig) / (eig.sum() / d) ** d


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,t-test,1000 vs 500,0.67506,0.027441,0.6929,0.026696,-1.22625,0.287355,0.287355,False
1,t-test,1000 vs full,0.67506,0.027441,0.7191,0.023166,-3.608253,0.022592,0.067775,False
2,t-test,500 vs full,0.6929,0.026696,0.7191,0.023166,-2.622729,0.058631,0.117263,False


## E2E

In [122]:
runs = []
RESULTS_PATH = '../results/ft_llm/'
col_names = ['task', 'dataset', 'prompt', 'learning_rate', 'lora_r', 'lora_alpha', 'lora_dropout', 'split', 'lr_setting', 'epoch', 'model_config', 'path', 'f1-micro', 'f1-macro', 'accuracy']
folder_names = [folder for folder in os.listdir(os.path.join(RESULTS_PATH)) if os.path.isdir(os.path.join(RESULTS_PATH, folder)) and folder != '.ipynb_checkpoints']
runs = []

for folder_name in folder_names:
    try:
        cond_parameters = folder_name.split('_')
        filename = ''
        
        if cond_parameters[0] == 'acd':
            filename = 'metrics_asp.tsv'
        elif cond_parameters[0] == 'acsa':
            filename = 'metrics_asp_pol.tsv'
        elif cond_parameters[0] == 'e2e':
            filename = 'metrics_pol.tsv'
        elif cond_parameters[0] == 'tasd':
            filename = 'metrics_phrases.tsv'
            
        df = pd.read_csv(os.path.join(RESULTS_PATH, folder_name, filename), sep = '\t')
        df = df.set_index(df.columns[0])

        model_config = cond_parameters.copy()
        
        # Remove split column from config string
        model_config_full = model_config.copy()
        model_config.pop(7)

        # Remove epoch column from config string
        model_config.pop(-1)
        
        cond_parameters.append('_'.join(model_config))
        cond_parameters.append('_'.join(model_config_full))
        cond_parameters.append(df.loc['Micro-AVG', 'f1'])
        cond_parameters.append(df.loc['Macro-AVG', 'f1'])
        cond_parameters.append(df.loc['Micro-AVG', 'accuracy'])
        runs.append(cond_parameters)
    except:
        pass

results_all = pd.DataFrame(runs, columns = col_names)


###
# Baselines
##

# InstructABSA
METHOD = 'instructABSA'
RESULTS_PATH = '../results/'
col_names = ['task', 'method', 'dataset', 'lr_setting', 'split', 'learning_rate', 'batch_size', 'epochs', 'f1-micro', 'f1-macro', 'accuracy']
runs = []

filenames = [file for file in os.listdir(os.path.join(RESULTS_PATH, METHOD)) if file != '.ipynb_checkpoints']

for file in filenames:
    try:
        cond_name = file.split('.tsv')[0]
        cond_parameters = cond_name.split('_')
        
        with open(os.path.join(RESULTS_PATH, METHOD, file), 'r') as f:
            f1 = f.readlines()[-1].split('\t')[1]
        
        cond_parameters.extend([round(float(f1),4), None,None])
        cond_parameters.insert(0, 'e2e')   # Task
        cond_parameters.insert(1, METHOD)  # Method
        cond_parameters.insert(6, 8)       # Batch Size
    
        if cond_parameters[3] == 'full':
            cond_parameters[3] = '0'

        runs.append(cond_parameters)
        
    except:
        pass


# TAS-BERT

METHOD = 'tas_bert'
RESULTS_PATH = '../results/'

folder_names = [folder for folder in os.listdir(os.path.join(RESULTS_PATH, METHOD)) if folder != '.ipynb_checkpoints']
for folder_name in folder_names:
    # try:
    metrics_dict = {}

    with open(os.path.join(RESULTS_PATH, METHOD, folder_name, 'results.txt'), 'r') as file:
        lines = file.readlines()

        epoch, p, r, f1 = lines[-1].strip().split('\t')
            
    cond_parameters = folder_name.split('_')
    
    cond_parameters.extend([round(float(f1),4), None,None])
    cond_parameters.insert(0, 'e2e')   # Task
    cond_parameters.insert(1, METHOD)  # Method

    if cond_parameters[3] == 'full':
        cond_parameters[3] = '0'

    runs.append(cond_parameters)


results_baseline = pd.DataFrame(runs, columns = col_names)

args.results = results_all
args.results_baseline = results_baseline

### Full Dataset

In [121]:
args.lr_setting = 0
args.task = 'e2e'

stats_e2e['0'] = computePromptStatistics(args)
stats_e2e['0']

Unnamed: 0,task,method,dataset,lr_setting,split,learning_rate,batch_size,epochs,f1-micro,f1-macro,accuracy
34,e2e,instructABSA,GERestaurant,0,5,5e-05,8,4.0,0.7371,,
49,e2e,tas_bert,GERestaurant,0,3,2e-05,24,30.0,0.7271,,
54,e2e,tas_bert,GERestaurant,0,5,2e-05,24,30.0,0.7253,,
15,e2e,instructABSA,GERestaurant,0,4,5e-05,8,4.0,0.7217,,
43,e2e,tas_bert,GERestaurant,0,4,2e-05,24,30.0,0.7163,,
35,e2e,instructABSA,GERestaurant,0,2,5e-05,8,4.0,0.713,,
12,e2e,instructABSA,GERestaurant,0,1,5e-05,8,4.0,0.7071,,
38,e2e,instructABSA,GERestaurant,0,3,5e-05,8,4.0,0.696,,
70,e2e,tas_bert,GERestaurant,0,2,2e-05,24,30.0,0.6944,,
65,e2e,tas_bert,GERestaurant,0,1,2e-05,24,30.0,0.6896,,


Unnamed: 0,short,long,cot,instructABSA,tas_bert
1,0.7923,0.7876,0.7668,0.7071,0.6896
2,0.7865,0.75,0.7689,0.713,0.6944
3,0.8,0.7861,0.765,0.696,0.7271
4,0.8335,0.8187,0.7888,0.7217,0.7163
5,0.8178,0.8004,0.7719,0.7371,0.7253


Unnamed: 0,W,pval,normal
short,0.9325,0.613512,True


Unnamed: 0,W,pval,normal
instructABSA,0.990039,0.979831,True


Unnamed: 0,W,pval,normal
tas_bert,0.856518,0.216014,True


    split        prompt      f1
0       1         short  0.7923
1       2         short  0.7865
2       3         short  0.8000
3       4         short  0.8335
4       5         short  0.8178
5       1  instructABSA  0.7071
6       2  instructABSA  0.7130
7       3  instructABSA  0.6960
8       4  instructABSA  0.7217
9       5  instructABSA  0.7371
10      1      tas_bert  0.6896
11      2      tas_bert  0.6944
12      3      tas_bert  0.7271
13      4      tas_bert  0.7163
14      5      tas_bert  0.7253
Repeated Measures ANOVA Result:
   Source  ddof1  ddof2          F     p-unc       ng2       eps
0  prompt      2      8  91.927355  0.000003  0.887414  0.898106


  W = np.prod(eig) / (eig.sum() / d) ** d


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,t-test,short vs instructABSA,0.80602,0.017317,0.71498,0.013862,12.579026,0.00023,0.000579,True
1,t-test,short vs tas_bert,0.80602,0.017317,0.71054,0.015648,13.152164,0.000193,0.000579,True
2,t-test,instructABSA vs tas_bert,0.71498,0.013862,0.71054,0.015648,0.483115,0.654266,0.654266,False


### 1000

In [123]:
args.lr_setting = 1000
args.task = 'e2e'

stats_e2e['1000'] = computePromptStatistics(args)
stats_e2e['1000']

Unnamed: 0,task,method,dataset,lr_setting,split,learning_rate,batch_size,epochs,f1-micro,f1-macro,accuracy
23,e2e,instructABSA,GERestaurant,1000,4,5e-05,8,9.0,0.7133,,
25,e2e,instructABSA,GERestaurant,1000,5,5e-05,8,9.0,0.7079,,
10,e2e,instructABSA,GERestaurant,1000,3,5e-05,8,9.0,0.7004,,
37,e2e,instructABSA,GERestaurant,1000,1,5e-05,8,9.0,0.6889,,
28,e2e,instructABSA,GERestaurant,1000,2,5e-05,8,9.0,0.6877,,
64,e2e,tas_bert,GERestaurant,1000,4,2e-05,24,13.0,0.6725,,
51,e2e,tas_bert,GERestaurant,1000,2,2e-05,24,13.0,0.6716,,
55,e2e,tas_bert,GERestaurant,1000,3,2e-05,24,13.0,0.6708,,
63,e2e,tas_bert,GERestaurant,1000,5,2e-05,24,13.0,0.6624,,
66,e2e,tas_bert,GERestaurant,1000,1,2e-05,24,13.0,0.6555,,


Unnamed: 0,short,long,cot,instructABSA,tas_bert
1,0.7992,0.7785,0.7288,0.6889,0.6555
2,0.7455,0.7703,0.7223,0.6877,0.6716
3,0.7953,0.7758,0.7618,0.7004,0.6708
4,0.8136,0.8069,0.7614,0.7133,0.6725
5,0.7681,0.7893,0.7713,0.7079,0.6624


Unnamed: 0,W,pval,normal
short,0.937736,0.649977,True


Unnamed: 0,W,pval,normal
instructABSA,0.908972,0.461436,True


Unnamed: 0,W,pval,normal
tas_bert,0.83729,0.157568,True


    split        prompt      f1
0       1         short  0.7992
1       2         short  0.7455
2       3         short  0.7953
3       4         short  0.8136
4       5         short  0.7681
5       1  instructABSA  0.6889
6       2  instructABSA  0.6877
7       3  instructABSA  0.7004
8       4  instructABSA  0.7133
9       5  instructABSA  0.7079
10      1      tas_bert  0.6555
11      2      tas_bert  0.6716
12      3      tas_bert  0.6708
13      4      tas_bert  0.6725
14      5      tas_bert  0.6624
Repeated Measures ANOVA Result:
   Source  ddof1  ddof2          F     p-unc       ng2       eps
0  prompt      2      8  71.730861  0.000008  0.908847  0.608962


  W = np.prod(eig) / (eig.sum() / d) ** d


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,t-test,short vs instructABSA,0.78434,0.02437,0.69964,0.010132,7.853001,0.001421,0.002841,True
1,t-test,short vs tas_bert,0.78434,0.02437,0.66656,0.006607,9.12474,0.0008,0.002401,True
2,t-test,instructABSA vs tas_bert,0.69964,0.010132,0.66656,0.006607,6.522815,0.002853,0.002853,True


### 500

In [124]:
args.lr_setting = 500
args.task = 'e2e'

stats_e2e['500'] = computePromptStatistics(args)
stats_e2e['500']

Unnamed: 0,task,method,dataset,lr_setting,split,learning_rate,batch_size,epochs,f1-micro,f1-macro,accuracy
13,e2e,instructABSA,GERestaurant,500,3,5e-05,8,17.0,0.6898,,
8,e2e,instructABSA,GERestaurant,500,4,5e-05,8,17.0,0.6834,,
26,e2e,instructABSA,GERestaurant,500,1,5e-05,8,17.0,0.6818,,
30,e2e,instructABSA,GERestaurant,500,5,5e-05,8,17.0,0.6722,,
6,e2e,instructABSA,GERestaurant,500,2,5e-05,8,17.0,0.6699,,
42,e2e,tas_bert,GERestaurant,500,3,2e-05,24,24.0,0.6164,,
67,e2e,tas_bert,GERestaurant,500,4,2e-05,24,24.0,0.6131,,
73,e2e,tas_bert,GERestaurant,500,5,2e-05,24,24.0,0.6087,,
58,e2e,tas_bert,GERestaurant,500,1,2e-05,24,24.0,0.6072,,
71,e2e,tas_bert,GERestaurant,500,2,2e-05,24,24.0,0.5978,,


Unnamed: 0,short,long,cot,instructABSA,tas_bert
1,0.7458,0.7271,0.6693,0.6818,0.6072
2,0.7606,0.7129,0.7179,0.6699,0.5978
3,0.6998,0.6775,0.6729,0.6898,0.6164
4,0.7563,0.7069,0.7082,0.6834,0.6131
5,0.7301,0.7094,0.7154,0.6722,0.6087


Unnamed: 0,W,pval,normal
short,0.899971,0.409725,True


Unnamed: 0,W,pval,normal
instructABSA,0.932879,0.616134,True


Unnamed: 0,W,pval,normal
tas_bert,0.951778,0.749884,True


    split        prompt      f1
0       1         short  0.7458
1       2         short  0.7606
2       3         short  0.6998
3       4         short  0.7563
4       5         short  0.7301
5       1  instructABSA  0.6818
6       2  instructABSA  0.6699
7       3  instructABSA  0.6898
8       4  instructABSA  0.6834
9       5  instructABSA  0.6722
10      1      tas_bert  0.6072
11      2      tas_bert  0.5978
12      3      tas_bert  0.6164
13      4      tas_bert  0.6131
14      5      tas_bert  0.6087
Repeated Measures ANOVA Result:
   Source  ddof1  ddof2          F     p-unc       ng2       eps
0  prompt      2      8  69.786326  0.000009  0.935852  0.515968


  W = np.prod(eig) / (eig.sum() / d) ** d


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,t-test,short vs instructABSA,0.73852,0.022031,0.67942,0.007376,4.38968,0.011786,0.011786,True
1,t-test,short vs tas_bert,0.73852,0.022031,0.60864,0.00632,9.721971,0.000627,0.001254,True
2,t-test,instructABSA vs tas_bert,0.67942,0.007376,0.60864,0.00632,36.207564,3e-06,1e-05,True


In [125]:
args.task = 'e2e'

computeLowResourceStatistics(args)

Unnamed: 0,1000,500,full
1,0.7992,0.7458,0.7923
2,0.7455,0.7606,0.7865
3,0.7953,0.6998,0.8
4,0.8136,0.7563,0.8335
5,0.7681,0.7301,0.8178


Unnamed: 0,W,pval,normal
1000,0.937736,0.649977,True


Unnamed: 0,W,pval,normal
500,0.899971,0.409725,True


Unnamed: 0,W,pval,normal
full,0.9325,0.613512,True


    split prompt      f1
0       1   1000  0.7992
1       2   1000  0.7455
2       3   1000  0.7953
3       4   1000  0.8136
4       5   1000  0.7681
5       1    500  0.7458
6       2    500  0.7606
7       3    500  0.6998
8       4    500  0.7563
9       5    500  0.7301
10      1   full  0.7923
11      2   full  0.7865
12      3   full  0.8000
13      4   full  0.8335
14      5   full  0.8178
Repeated Measures ANOVA Result:
   Source  ddof1  ddof2          F     p-unc       ng2       eps
0  prompt      2      8  11.456719  0.004485  0.632663  0.743232
Results for LR-Comparison of :  short


  W = np.prod(eig) / (eig.sum() / d) ** d


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,t-test,1000 vs 500,0.78434,0.02437,0.73852,0.022031,2.555484,0.062938,0.125877,False
1,t-test,1000 vs full,0.78434,0.02437,0.80602,0.017317,-2.036472,0.11139,0.125877,False
2,t-test,500 vs full,0.73852,0.022031,0.80602,0.017317,-4.934124,0.00785,0.023549,True


Unnamed: 0,1000,500,full
1,0.7785,0.7271,0.7876
2,0.7703,0.7129,0.75
3,0.7758,0.6775,0.7861
4,0.8069,0.7069,0.8187
5,0.7893,0.7094,0.8004


Unnamed: 0,W,pval,normal
1000,0.908464,0.458414,True


Unnamed: 0,W,pval,normal
500,0.900947,0.415142,True


Unnamed: 0,W,pval,normal
full,0.948692,0.727844,True


    split prompt      f1
0       1   1000  0.7785
1       2   1000  0.7703
2       3   1000  0.7758
3       4   1000  0.8069
4       5   1000  0.7893
5       1    500  0.7271
6       2    500  0.7129
7       3    500  0.6775
8       4    500  0.7069
9       5    500  0.7094
10      1   full  0.7876
11      2   full  0.7500
12      3   full  0.7861
13      4   full  0.8187
14      5   full  0.8004
Repeated Measures ANOVA Result:
   Source  ddof1  ddof2          F     p-unc       ng2       eps
0  prompt      2      8  36.544787  0.000095  0.818487  0.578683
Results for LR-Comparison of :  long


  W = np.prod(eig) / (eig.sum() / d) ** d


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,t-test,1000 vs 500,0.78416,0.012943,0.70676,0.016215,7.682936,0.001544,0.004631,True
1,t-test,1000 vs full,0.78416,0.012943,0.78856,0.02255,-0.710676,0.516527,0.516527,False
2,t-test,500 vs full,0.70676,0.016215,0.78856,0.02255,-5.675453,0.004756,0.009511,True


Unnamed: 0,1000,500,full
1,0.7288,0.6693,0.7668
2,0.7223,0.7179,0.7689
3,0.7618,0.6729,0.765
4,0.7614,0.7082,0.7888
5,0.7713,0.7154,0.7719


Unnamed: 0,W,pval,normal
1000,0.848695,0.190429,True


Unnamed: 0,W,pval,normal
500,0.80553,0.089826,True


Unnamed: 0,W,pval,normal
full,0.79095,0.068237,True


    split prompt      f1
0       1   1000  0.7288
1       2   1000  0.7223
2       3   1000  0.7618
3       4   1000  0.7614
4       5   1000  0.7713
5       1    500  0.6693
6       2    500  0.7179
7       3    500  0.6729
8       4    500  0.7082
9       5    500  0.7154
10      1   full  0.7668
11      2   full  0.7689
12      3   full  0.7650
13      4   full  0.7888
14      5   full  0.7719
Repeated Measures ANOVA Result:
   Source  ddof1  ddof2         F     p-unc      ng2       eps
0  prompt      2      8  25.16706  0.000354  0.76692  0.764978
Results for LR-Comparison of :  cot


  W = np.prod(eig) / (eig.sum() / d) ** d


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,t-test,1000 vs 500,0.74912,0.019676,0.69674,0.021206,3.851289,0.018282,0.036565,True
1,t-test,1000 vs full,0.74912,0.019676,0.77228,0.008573,-2.515763,0.065653,0.065653,False
2,t-test,500 vs full,0.69674,0.021206,0.77228,0.008573,-8.082931,0.001273,0.003819,True


Unnamed: 0,1000,500,full
1,0.6655,0.5946,0.6561
2,0.5943,0.6137,0.6481
3,0.6602,0.5382,0.6667
4,0.6858,0.6081,0.7145
5,0.6235,0.5749,0.6918


Unnamed: 0,W,pval,normal
1000,0.941844,0.679005,True


Unnamed: 0,W,pval,normal
500,0.906041,0.444169,True


Unnamed: 0,W,pval,normal
full,0.93019,0.597668,True


    split prompt      f1
0       1   1000  0.6655
1       2   1000  0.5943
2       3   1000  0.6602
3       4   1000  0.6858
4       5   1000  0.6235
5       1    500  0.5946
6       2    500  0.6137
7       3    500  0.5382
8       4    500  0.6081
9       5    500  0.5749
10      1   full  0.6561
11      2   full  0.6481
12      3   full  0.6667
13      4   full  0.7145
14      5   full  0.6918
Repeated Measures ANOVA Result:
   Source  ddof1  ddof2         F     p-unc       ng2      eps
0  prompt      2      8  11.76602  0.004143  0.632863  0.77084
Results for LR-Comparison of best Prompt per LR-Setting


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,t-test,1000 vs 500,0.64586,0.032695,0.5859,0.027341,2.591037,0.060618,0.121235,False
1,t-test,1000 vs full,0.64586,0.032695,0.67544,0.024459,-2.056263,0.108913,0.121235,False
2,t-test,500 vs full,0.5859,0.027341,0.67544,0.024459,-5.01431,0.007415,0.022245,True


## E2E - without Implicit

In [31]:
runs = []
RESULTS_PATH = '../results_final/filtered/'
col_names = ['model_lang', 'dataset', 'model_shots', 'model_prompt', 'model_task', 'lr', 'lora_r', 'lora_alpha', 'lora_dropout', 'model_quant', 'split', 'lr_setting', 'model_name', 'lang', 'shots', 'prompt', 'task', 'quant', 'epoch', 'model_config', 'path', 'f1-micro', 'f1-macro', 'accuracy']
folder_names = [folder for folder in os.listdir(RESULTS_PATH) if os.path.isdir(os.path.join(RESULTS_PATH, folder)) and folder != '.ipynb_checkpoints']

for folder_name in folder_names:
    try:
        df = pd.read_csv(RESULTS_PATH + folder_name + '/metrics_pol.tsv', sep = '\t')
        df = df.set_index(df.columns[0])
        cond_name = folder_name.split('/')[-1]
        cond_parameters = cond_name.split('_')

        model_config = cond_parameters.copy()
        
        # Remove split column from config string
        model_config_full = model_config.copy()
        model_config.pop(10)

        # Remove epoch column from config string
        model_config.pop(-1)
        
        cond_parameters.append('_'.join(model_config))
        cond_parameters.append('_'.join(model_config_full))
        cond_parameters.append(df.loc['Micro-AVG', 'f1'])
        cond_parameters.append(df.loc['Macro-AVG', 'f1'])
        cond_parameters.append(df.loc['Micro-AVG', 'accuracy'])
        runs.append(cond_parameters)
    except:
        pass

results_all = pd.DataFrame(runs, columns = col_names)


###
# Baselines
##

# InstructABSA
METHOD = 'instructAbsa'
RESULTS_PATH = '../../../ABSA-Baselines/InstructABSA-Custom/Output_filtered'

col_names = ['task', 'method', 'dataset', 'lr-setting', 'split', 'learning-rate', 'batch_size', 'epochs', 'f1-micro', 'f1-macro', 'accuracy']
runs = []

file_names = [file for file in os.listdir(RESULTS_PATH) if len(file.split('.tsv')) > 1 and file != '.ipynb_checkpoints']

for file_name in file_names:
    # try:
    metrics_dict = {}

    with open(os.path.join(RESULTS_PATH, file_name), 'r') as file:
        for line in file:
            # Strip any leading/trailing whitespace and split the line by '='
            key, value = line.strip().split('\t')
            # Convert the value to a float and store it in the dictionary
            metrics_dict[key.strip()] = float(value.strip())
            
    cond_name = file_name.split('.tsv')[0]
    cond_parameters = cond_name.split('_')
    
    cond_parameters.append(metrics_dict['F1-Score'])
    cond_parameters.extend([None,None])
    cond_parameters.insert(0, 'e2e')   # Task
    cond_parameters.insert(1, METHOD)  # Method
    cond_parameters.insert(6, 8)       # Batch Size

    if cond_parameters[3] == 'full':
        cond_parameters[3] = '0'
    
    runs.append(cond_parameters)


# TAS-BERT

METHOD = 'tas-bert'
RESULTS_PATH = '../../../ABSA-Baselines/TAS-BERT-Custom/results_filtered/GERestaurant/three_joint/BIO'

col_names = ['task', 'method', 'dataset', 'lr-setting', 'split', 'learning-rate', 'batch_size', 'epochs', 'f1-micro', 'f1-macro', 'accuracy']

folder_names = [file for file in os.listdir(RESULTS_PATH) if file != '.ipynb_checkpoints']

for folder_name in folder_names:
    # try:
    metrics_dict = {}

    with open(os.path.join(RESULTS_PATH, folder_name, 'results.txt'), 'r') as file:
        lines = file.readlines()

        epoch, p, r, f1 = lines[-1].strip().split('\t')
            
    cond_parameters = folder_name.split('_')
    
    cond_parameters.append(float(f1))
    cond_parameters.extend([None,None])
    cond_parameters.insert(0, 'e2e')   # Task
    cond_parameters.insert(1, METHOD)  # Method

    if cond_parameters[3] == 'full':
        cond_parameters[3] = '0'

    runs.append(cond_parameters)


results_baseline = pd.DataFrame(runs, columns = col_names)

args.results = results_all
args.results_baseline = results_baseline

### Full Dataset

In [32]:
args.lr_setting = 0
args.task = 'e2e'

computePromptStatistics(args)

Unnamed: 0,task,method,dataset,lr-setting,split,learning-rate,batch_size,epochs,f1-micro,f1-macro,accuracy
54,e2e,tas-bert,GERestaurant,0,3,2e-05,24,25.0,0.7708,,
40,e2e,tas-bert,GERestaurant,0,5,2e-05,24,25.0,0.7427,,
51,e2e,tas-bert,GERestaurant,0,1,2e-05,24,25.0,0.7346,,
48,e2e,tas-bert,GERestaurant,0,2,2e-05,24,25.0,0.7211,,
57,e2e,tas-bert,GERestaurant,0,4,2e-05,24,25.0,0.6926,,
38,e2e,instructAbsa,GERestaurant,0,3,5e-05,8,4.0,0.623529,,
35,e2e,instructAbsa,GERestaurant,0,2,5e-05,8,4.0,0.6141,,
34,e2e,instructAbsa,GERestaurant,0,5,5e-05,8,4.0,0.601457,,
15,e2e,instructAbsa,GERestaurant,0,4,5e-05,8,4.0,0.592284,,
12,e2e,instructAbsa,GERestaurant,0,1,5e-05,8,4.0,0.570265,,


Unnamed: 0,short,long,cot,instructAbsa,tas-bert
1,0.8237,0.8284,0.8173,0.570265,0.7346
2,0.7497,0.6943,0.7918,0.6141,0.7211
3,0.8431,0.8365,0.852,0.623529,0.7708
4,0.7857,0.8154,0.7652,0.592284,0.6926
5,0.8039,0.7256,0.8055,0.601457,0.7427


Unnamed: 0,W,pval,normal
cot,0.991887,0.985867,True


Unnamed: 0,W,pval,normal
instructAbsa,0.974128,0.901028,True


Unnamed: 0,W,pval,normal
tas-bert,0.991894,0.98589,True


    split        prompt        f1
0       1           cot  0.817300
1       2           cot  0.791800
2       3           cot  0.852000
3       4           cot  0.765200
4       5           cot  0.805500
5       1  instructAbsa  0.570265
6       2  instructAbsa  0.614100
7       3  instructAbsa  0.623529
8       4  instructAbsa  0.592284
9       5  instructAbsa  0.601457
10      1      tas-bert  0.734600
11      2      tas-bert  0.721100
12      3      tas-bert  0.770800
13      4      tas-bert  0.692600
14      5      tas-bert  0.742700
Repeated Measures ANOVA Result:
   Source  ddof1  ddof2           F         p-unc       ng2       eps
0  prompt      2      8  177.919903  2.337325e-07  0.922866  0.537248


  W = np.prod(eig) / (eig.sum() / d) ** d


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,t-test,cot vs instructAbsa,0.80636,0.028669,0.600327,0.018427,14.410956,0.000135,0.00027,True
1,t-test,cot vs tas-bert,0.80636,0.028669,0.73236,0.02568,20.297983,3.5e-05,0.000104,True
2,t-test,instructAbsa vs tas-bert,0.600327,0.018427,0.73236,0.02568,-10.79296,0.000418,0.000418,True


### 1000

In [33]:
args.lr_setting = 1000
args.task = 'e2e'

computePromptStatistics(args)

Unnamed: 0,task,method,dataset,lr-setting,split,learning-rate,batch_size,epochs,f1-micro,f1-macro,accuracy
46,e2e,tas-bert,GERestaurant,1000,3,2e-05,24,25.0,0.7644,,
44,e2e,tas-bert,GERestaurant,1000,2,2e-05,24,25.0,0.7173,,
47,e2e,tas-bert,GERestaurant,1000,5,2e-05,24,25.0,0.7087,,
42,e2e,tas-bert,GERestaurant,1000,1,2e-05,24,25.0,0.7064,,
52,e2e,tas-bert,GERestaurant,1000,4,2e-05,24,25.0,0.6827,,
25,e2e,instructAbsa,GERestaurant,1000,5,5e-05,8,9.0,0.67364,,
28,e2e,instructAbsa,GERestaurant,1000,2,5e-05,8,9.0,0.656834,,
10,e2e,instructAbsa,GERestaurant,1000,3,5e-05,8,9.0,0.630435,,
37,e2e,instructAbsa,GERestaurant,1000,1,5e-05,8,9.0,0.623053,,
23,e2e,instructAbsa,GERestaurant,1000,4,5e-05,8,9.0,0.617464,,


Unnamed: 0,short,long,cot,instructAbsa,tas-bert
1,0.804,0.8152,0.8085,0.623053,0.7064
2,0.8161,0.6942,0.8241,0.656834,0.7173
3,0.8259,0.8583,0.8108,0.630435,0.7644
4,0.7888,0.7506,0.7668,0.617464,0.6827
5,0.7688,0.7923,0.7867,0.67364,0.7087


Unnamed: 0,W,pval,normal
short,0.971359,0.883902,True


Unnamed: 0,W,pval,normal
instructAbsa,0.899257,0.405792,True


Unnamed: 0,W,pval,normal
tas-bert,0.900709,0.413814,True


    split        prompt        f1
0       1         short  0.804000
1       2         short  0.816100
2       3         short  0.825900
3       4         short  0.788800
4       5         short  0.768800
5       1  instructAbsa  0.623053
6       2  instructAbsa  0.656834
7       3  instructAbsa  0.630435
8       4  instructAbsa  0.617464
9       5  instructAbsa  0.673640
10      1      tas-bert  0.706400
11      2      tas-bert  0.717300
12      3      tas-bert  0.764400
13      4      tas-bert  0.682700
14      5      tas-bert  0.708700
Repeated Measures ANOVA Result:
   Source  ddof1  ddof2          F     p-unc       ng2       eps
0  prompt      2      8  57.508952  0.000018  0.890244  0.757303


  W = np.prod(eig) / (eig.sum() / d) ** d


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,t-test,short vs instructAbsa,0.80072,0.02021,0.640285,0.021453,9.240477,0.000762,0.002287,True
1,t-test,short vs tas-bert,0.80072,0.02021,0.7159,0.026827,8.553924,0.001025,0.002287,True
2,t-test,instructAbsa vs tas-bert,0.640285,0.021453,0.7159,0.026827,-4.58211,0.010167,0.010167,True


### 500

In [34]:
args.lr_setting = 500
args.task = 'e2e'

computePromptStatistics(args)

Unnamed: 0,task,method,dataset,lr-setting,split,learning-rate,batch_size,epochs,f1-micro,f1-macro,accuracy
53,e2e,tas-bert,GERestaurant,500,3,2e-05,24,19.0,0.7168,,
49,e2e,tas-bert,GERestaurant,500,1,2e-05,24,19.0,0.6741,,
55,e2e,tas-bert,GERestaurant,500,5,2e-05,24,19.0,0.6667,,
43,e2e,tas-bert,GERestaurant,500,2,2e-05,24,19.0,0.6628,,
45,e2e,tas-bert,GERestaurant,500,4,2e-05,24,19.0,0.6344,,
13,e2e,instructAbsa,GERestaurant,500,3,5e-05,8,17.0,0.618537,,
6,e2e,instructAbsa,GERestaurant,500,2,5e-05,8,17.0,0.616132,,
8,e2e,instructAbsa,GERestaurant,500,4,5e-05,8,17.0,0.609582,,
30,e2e,instructAbsa,GERestaurant,500,5,5e-05,8,17.0,0.593588,,
26,e2e,instructAbsa,GERestaurant,500,1,5e-05,8,17.0,0.567318,,


Unnamed: 0,short,long,cot,instructAbsa,tas-bert
1,0.7968,0.7556,0.7626,0.567318,0.6741
2,0.7248,0.7984,0.7737,0.616132,0.6628
3,0.8251,0.8159,0.7947,0.618537,0.7168
4,0.7984,0.7291,0.7694,0.609582,0.6344
5,0.7431,0.7102,0.7796,0.593588,0.6667


Unnamed: 0,W,pval,normal
short,0.909897,0.466973,True


Unnamed: 0,W,pval,normal
instructAbsa,0.866267,0.251635,True


Unnamed: 0,W,pval,normal
tas-bert,0.935736,0.635974,True


    split        prompt        f1
0       1         short  0.796800
1       2         short  0.724800
2       3         short  0.825100
3       4         short  0.798400
4       5         short  0.743100
5       1  instructAbsa  0.567318
6       2  instructAbsa  0.616132
7       3  instructAbsa  0.618537
8       4  instructAbsa  0.609582
9       5  instructAbsa  0.593588
10      1      tas-bert  0.674100
11      2      tas-bert  0.662800
12      3      tas-bert  0.716800
13      4      tas-bert  0.634400
14      5      tas-bert  0.666700
Repeated Measures ANOVA Result:
   Source  ddof1  ddof2          F     p-unc       ng2       eps
0  prompt      2      8  46.536437  0.000039  0.864789  0.873377


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,t-test,short vs instructAbsa,0.77764,0.037512,0.601031,0.018972,8.237915,0.001184,0.003552,True
1,t-test,short vs tas-bert,0.77764,0.037512,0.67096,0.02658,5.93674,0.004036,0.008072,True
2,t-test,instructAbsa vs tas-bert,0.601031,0.018972,0.67096,0.02658,-4.538892,0.010506,0.010506,True


In [35]:
args.task = 'e2e'

computeLowResourceStatistics(args)

Unnamed: 0,1000,500,full
1,0.804,0.7968,0.8237
2,0.8161,0.7248,0.7497
3,0.8259,0.8251,0.8431
4,0.7888,0.7984,0.7857
5,0.7688,0.7431,0.8039


Unnamed: 0,W,pval,normal
1000,0.971359,0.883902,True


Unnamed: 0,W,pval,normal
500,0.909897,0.466973,True


Unnamed: 0,W,pval,normal
full,0.983488,0.952373,True


    split prompt      f1
0       1   1000  0.8040
1       2   1000  0.8161
2       3   1000  0.8259
3       4   1000  0.7888
4       5   1000  0.7688
5       1    500  0.7968
6       2    500  0.7248
7       3    500  0.8251
8       4    500  0.7984
9       5    500  0.7431
10      1   full  0.8237
11      2   full  0.7497
12      3   full  0.8431
13      4   full  0.7857
14      5   full  0.8039


  W = np.prod(eig) / (eig.sum() / d) ** d


Repeated Measures ANOVA Result:
   Source  ddof1  ddof2         F     p-unc       ng2       eps
0  prompt      2      8  1.400032  0.301061  0.113048  0.818557
Results for LR-Comparison of :  short


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,t-test,1000 vs 500,0.80072,0.02021,0.77764,0.037512,1.282481,0.268951,0.537901,False
1,t-test,1000 vs full,0.80072,0.02021,0.80122,0.03213,-0.028098,0.97893,0.97893,False
2,t-test,500 vs full,0.77764,0.037512,0.80122,0.03213,-2.012568,0.114463,0.34339,False


Unnamed: 0,1000,500,full
1,0.8152,0.7556,0.8284
2,0.6942,0.7984,0.6943
3,0.8583,0.8159,0.8365
4,0.7506,0.7291,0.8154
5,0.7923,0.7102,0.7256


Unnamed: 0,W,pval,normal
1000,0.988245,0.973192,True


Unnamed: 0,W,pval,normal
500,0.941347,0.675478,True


Unnamed: 0,W,pval,normal
full,0.833036,0.146574,True


    split prompt      f1
0       1   1000  0.8152
1       2   1000  0.6942
2       3   1000  0.8583
3       4   1000  0.7506
4       5   1000  0.7923
5       1    500  0.7556
6       2    500  0.7984
7       3    500  0.8159
8       4    500  0.7291
9       5    500  0.7102
10      1   full  0.8284
11      2   full  0.6943
12      3   full  0.8365
13      4   full  0.8154
14      5   full  0.7256
Repeated Measures ANOVA Result:
   Source  ddof1  ddof2         F    p-unc       ng2       eps
0  prompt      2      8  0.280667  0.76242  0.029568  0.813498
Results for LR-Comparison of :  long


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,t-test,1000 vs 500,0.78212,0.05608,0.76184,0.040092,0.620648,0.568446,1.0,False
1,t-test,1000 vs full,0.78212,0.05608,0.78004,0.058467,0.096583,0.927703,1.0,False
2,t-test,500 vs full,0.76184,0.040092,0.78004,0.058467,-0.541541,0.616888,1.0,False


Unnamed: 0,1000,500,full
1,0.8085,0.7626,0.8173
2,0.8241,0.7737,0.7918
3,0.8108,0.7947,0.852
4,0.7668,0.7694,0.7652
5,0.7867,0.7796,0.8055


Unnamed: 0,W,pval,normal
1000,0.941549,0.676906,True


Unnamed: 0,W,pval,normal
500,0.958088,0.794608,True


Unnamed: 0,W,pval,normal
full,0.991887,0.985867,True


    split prompt      f1
0       1   1000  0.8085
1       2   1000  0.8241
2       3   1000  0.8108
3       4   1000  0.7668
4       5   1000  0.7867
5       1    500  0.7626
6       2    500  0.7737
7       3    500  0.7947
8       4    500  0.7694
9       5    500  0.7796
10      1   full  0.8173
11      2   full  0.7918
12      3   full  0.8520
13      4   full  0.7652
14      5   full  0.8055
Repeated Measures ANOVA Result:
   Source  ddof1  ddof2         F    p-unc       ng2       eps
0  prompt      2      8  3.868317  0.06679  0.272581  0.975992
Results for LR-Comparison of :  cot


  W = np.prod(eig) / (eig.sum() / d) ** d


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,t-test,1000 vs 500,0.79938,0.020233,0.776,0.010873,2.214037,0.091215,0.182431,False
1,t-test,1000 vs full,0.79938,0.020233,0.80636,0.028669,-0.576626,0.595087,0.595087,False
2,t-test,500 vs full,0.776,0.010873,0.80636,0.028669,-2.621339,0.058717,0.17615,False


Unnamed: 0,1000,500,full
1,0.804,0.7968,0.8173
2,0.8161,0.7248,0.7918
3,0.8259,0.8251,0.852
4,0.7888,0.7984,0.7652
5,0.7688,0.7431,0.8055


Unnamed: 0,W,pval,normal
1000,0.971359,0.883902,True


Unnamed: 0,W,pval,normal
500,0.909897,0.466973,True


Unnamed: 0,W,pval,normal
full,0.991887,0.985867,True


    split prompt      f1
0       1   1000  0.8040
1       2   1000  0.8161
2       3   1000  0.8259
3       4   1000  0.7888
4       5   1000  0.7688
5       1    500  0.7968
6       2    500  0.7248
7       3    500  0.8251
8       4    500  0.7984
9       5    500  0.7431
10      1   full  0.8173
11      2   full  0.7918
12      3   full  0.8520
13      4   full  0.7652
14      5   full  0.8055
Repeated Measures ANOVA Result:
   Source  ddof1  ddof2         F     p-unc       ng2       eps
0  prompt      2      8  1.717831  0.239505  0.149361  0.857445
Results for LR-Comparison of best Prompt per LR-Setting


  W = np.prod(eig) / (eig.sum() / d) ** d


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,t-test,1000 vs 500,0.80072,0.02021,0.77764,0.037512,1.282481,0.268951,0.559451,False
1,t-test,1000 vs full,0.80072,0.02021,0.80636,0.028669,-0.446342,0.678447,0.678447,False
2,t-test,500 vs full,0.77764,0.037512,0.80636,0.028669,-1.592527,0.186484,0.559451,False


## TASD

In [193]:
runs = []
RESULTS_PATH = '../results/ft_llm/'
col_names = ['task', 'dataset', 'prompt', 'learning_rate', 'lora_r', 'lora_alpha', 'lora_dropout', 'split', 'lr_setting', 'epoch', 'model_config', 'path', 'f1-micro', 'f1-macro', 'accuracy']
folder_names = [folder for folder in os.listdir(os.path.join(RESULTS_PATH)) if os.path.isdir(os.path.join(RESULTS_PATH, folder)) and folder != '.ipynb_checkpoints']
runs = []

for folder_name in folder_names:
    try:
        cond_parameters = folder_name.split('_')
        filename = ''
        
        if cond_parameters[0] == 'acd':
            filename = 'metrics_asp.tsv'
        elif cond_parameters[0] == 'acsa':
            filename = 'metrics_asp_pol.tsv'
        elif cond_parameters[0] == 'e2e':
            filename = 'metrics_pol.tsv'
        elif cond_parameters[0] == 'tasd':
            filename = 'metrics_phrases.tsv'
            
        df = pd.read_csv(os.path.join(RESULTS_PATH, folder_name, filename), sep = '\t')
        df = df.set_index(df.columns[0])

        model_config = cond_parameters.copy()
        
        # Remove split column from config string
        model_config_full = model_config.copy()
        model_config.pop(7)

        # Remove epoch column from config string
        model_config.pop(-1)
        
        cond_parameters.append('_'.join(model_config))
        cond_parameters.append('_'.join(model_config_full))
        cond_parameters.append(df.loc['Micro-AVG', 'f1'])
        cond_parameters.append(df.loc['Macro-AVG', 'f1'])
        cond_parameters.append(df.loc['Micro-AVG', 'accuracy'])
        runs.append(cond_parameters)
    except:
        pass

results_all = pd.DataFrame(runs, columns = col_names)


###
# Baselines
##

# Paraphrase Generation
METHOD = 'para'
RESULTS_PATH = '../results/'
col_names = ['task', 'method', 'dataset', 'lr_setting', 'split', 'learning_rate', 'batch_size', 'epochs', 'f1-micro', 'f1-macro', 'accuracy']
folder_names = [folder for folder in os.listdir(os.path.join(RESULTS_PATH, METHOD)) if os.path.isdir(os.path.join(RESULTS_PATH, METHOD, folder)) and folder != '.ipynb_checkpoints']
runs = []

for folder_name in folder_names:
    try:
        df = pd.read_csv(os.path.join(RESULTS_PATH, METHOD, folder_name, 'metrics_phrases.tsv'), sep = '\t')
        df = df.set_index(df.columns[0])
        cond_name = folder_name.split('/')[-1]
        cond_parameters = cond_name.split('_')
        
        # Fix for the wrong output name format
        # cond_parameters[0], cond_parameters[1] = cond_parameters[1], cond_parameters[0]
        cond_parameters.append(df.loc['Micro-AVG', 'f1'])
        cond_parameters.append(df.loc['Macro-AVG', 'f1'])
        cond_parameters.append(df.loc['Micro-AVG', 'accuracy'])
        cond_parameters[1:1] = [METHOD]
        runs.append(cond_parameters)
    except:
        pass

# MVP
METHOD = 'mvp'
RESULTS_PATH = '../results/'

folder_names = [folder for folder in os.listdir(os.path.join(RESULTS_PATH, METHOD)) if os.path.isdir(os.path.join(RESULTS_PATH, METHOD, folder)) and folder != '.ipynb_checkpoints']

for folder_name in folder_names:

    with open(os.path.join(RESULTS_PATH, METHOD, folder_name, 'result.txt'), 'r') as file:
        f1 = file.readlines()[-1].split(' ')[1]
            
    cond_name = folder_name.split('/')[-1]
    cond_parameters = cond_name.split('_')

    cond_parameters.extend([round(float(f1)/100, 4), None, None])
    cond_parameters[1:1] = [METHOD]
    cond_parameters[0], cond_parameters[2] = cond_parameters[2], cond_parameters[0]
    cond_parameters[3], cond_parameters[4] = cond_parameters[4], cond_parameters[3]
    cond_parameters[5:5] = [1e-4]
    cond_parameters[6:6] = [16 if cond_parameters[3] == 'full' else 8]
    # cond_parameters[3] = 0 if cond_parameters[3] == 'full' else cond_parameters[3]
    
    runs.append(cond_parameters)


results_baseline = pd.DataFrame(runs, columns = col_names)

args.results = results_all
args.results_baseline = results_baseline

### Full Dataset

In [196]:
args.lr_setting = 0
args.task = 'tasd'

stats_tasd['0'] = computePromptStatistics(args)
stats_tasd['0']

Unnamed: 0,task,method,dataset,lr_setting,split,learning_rate,batch_size,epochs,f1-micro,f1-macro,accuracy
31,tasd,para,GERestaurant,full,4,0.0003,16,20,0.7342,0.7304,0.58
56,tasd,mvp,GERestaurant,full,3,0.0001,16,20,0.7107,,
19,tasd,para,GERestaurant,full,5,0.0003,16,20,0.7043,0.6538,0.5436
65,tasd,mvp,GERestaurant,full,4,0.0001,16,20,0.7042,,
1,tasd,para,GERestaurant,full,3,0.0003,16,20,0.7028,0.6728,0.5417
60,tasd,mvp,GERestaurant,full,1,0.0001,16,20,0.7021,,
44,tasd,mvp,GERestaurant,full,5,0.0001,16,20,0.694,,
47,tasd,mvp,GERestaurant,full,2,0.0001,16,20,0.6918,,
20,tasd,para,GERestaurant,full,2,0.0003,16,20,0.6914,0.6669,0.5284
39,tasd,para,GERestaurant,full,1,0.0003,16,20,0.6867,0.6539,0.5229


Unnamed: 0,basic,context,cot,para,mvp
1,0.7123,0.7433,0.7502,0.6867,0.7021
2,0.7362,0.7346,0.7242,0.6914,0.6918
3,0.7672,0.7663,0.7386,0.7028,0.7107
4,0.7578,0.7625,0.7365,0.7342,0.7042
5,0.7832,0.7751,0.6755,0.7043,0.694


Unnamed: 0,W,pval,normal
context,0.931518,0.606756,True


Unnamed: 0,W,pval,normal
para,0.877092,0.29634,True


Unnamed: 0,W,pval,normal
mvp,0.947505,0.719361,True


    split   prompt      f1
0       1  context  0.7433
1       2  context  0.7346
2       3  context  0.7663
3       4  context  0.7625
4       5  context  0.7751
5       1     para  0.6867
6       2     para  0.6914
7       3     para  0.7028
8       4     para  0.7342
9       5     para  0.7043
10      1      mvp  0.7021
11      2      mvp  0.6918
12      3      mvp  0.7107
13      4      mvp  0.7042
14      5      mvp  0.6940
Repeated Measures ANOVA Result:
   Source  ddof1  ddof2          F     p-unc      ng2       eps
0  prompt      2      8  34.338098  0.000118  0.78137  0.987519


  W = np.prod(eig) / (eig.sum() / d) ** d


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,t-test,context vs para,0.75636,0.015045,0.70388,0.016562,6.936473,0.002268,0.004537,True
1,t-test,context vs mvp,0.75636,0.015045,0.70056,0.006902,7.781057,0.001471,0.004413,True
2,t-test,para vs mvp,0.70388,0.016562,0.70056,0.006902,0.419832,0.696178,0.696178,False


### 1000

In [197]:
args.lr_setting = 1000
args.task = 'tasd'

stats_tasd['1000'] = computePromptStatistics(args)
stats_tasd['1000']

Unnamed: 0,task,method,dataset,lr_setting,split,learning_rate,batch_size,epochs,f1-micro,f1-macro,accuracy
4,tasd,para,GERestaurant,1000,3,0.0003,16,20,0.6759,0.6349,0.5104
16,tasd,para,GERestaurant,1000,5,0.0003,16,20,0.6756,0.6092,0.5101
64,tasd,mvp,GERestaurant,1000,5,0.0001,8,30,0.6701,,
2,tasd,para,GERestaurant,1000,2,0.0003,16,20,0.67,0.657,0.5037
22,tasd,para,GERestaurant,1000,4,0.0003,16,20,0.6646,0.6748,0.4977
61,tasd,mvp,GERestaurant,1000,4,0.0001,8,30,0.6632,,
66,tasd,mvp,GERestaurant,1000,1,0.0001,8,30,0.6619,,
52,tasd,mvp,GERestaurant,1000,3,0.0001,8,30,0.6536,,
42,tasd,mvp,GERestaurant,1000,2,0.0001,8,30,0.6484,,
3,tasd,para,GERestaurant,1000,1,0.0003,16,20,0.6431,0.6199,0.4739


Unnamed: 0,basic,context,cot,para,mvp
1,0.7067,0.7324,0.6451,0.6431,0.6619
2,0.7114,0.72,0.6805,0.67,0.6484
3,0.7405,0.7426,0.6989,0.6759,0.6536
4,0.7855,0.7792,0.7119,0.6646,0.6632
5,0.7572,0.716,0.6999,0.6756,0.6701


Unnamed: 0,W,pval,normal
basic,0.937161,0.645943,True


Unnamed: 0,W,pval,normal
para,0.816553,0.109799,True


Unnamed: 0,W,pval,normal
mvp,0.966897,0.854993,True


    split prompt      f1
0       1  basic  0.7067
1       2  basic  0.7114
2       3  basic  0.7405
3       4  basic  0.7855
4       5  basic  0.7572
5       1   para  0.6431
6       2   para  0.6700
7       3   para  0.6759
8       4   para  0.6646
9       5   para  0.6756
10      1    mvp  0.6619
11      2    mvp  0.6484
12      3    mvp  0.6536
13      4    mvp  0.6632
14      5    mvp  0.6701
Repeated Measures ANOVA Result:
   Source  ddof1  ddof2          F     p-unc       ng2       eps
0  prompt      2      8  30.006156  0.000191  0.791569  0.751512


  W = np.prod(eig) / (eig.sum() / d) ** d


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,t-test,basic vs para,0.74026,0.029301,0.66584,0.012103,5.612994,0.00495,0.010401,True
1,t-test,basic vs mvp,0.74026,0.029301,0.65944,0.007613,6.187712,0.003467,0.010401,True
2,t-test,para vs mvp,0.66584,0.012103,0.65944,0.007613,0.845959,0.445214,0.445214,False


### 500

In [198]:
args.lr_setting = 500
args.task = 'tasd'

stats_tasd['500'] = computePromptStatistics(args)
stats_tasd['500']

Unnamed: 0,task,method,dataset,lr_setting,split,learning_rate,batch_size,epochs,f1-micro,f1-macro,accuracy
41,tasd,mvp,GERestaurant,500,3,0.0001,8,50,0.6497,,
63,tasd,mvp,GERestaurant,500,5,0.0001,8,50,0.6494,,
7,tasd,para,GERestaurant,500,4,0.0003,16,86,0.6486,0.6585,0.48
28,tasd,para,GERestaurant,500,1,0.0003,16,86,0.6379,0.5965,0.4683
50,tasd,mvp,GERestaurant,500,2,0.0001,8,50,0.6357,,
45,tasd,mvp,GERestaurant,500,4,0.0001,8,50,0.6282,,
6,tasd,para,GERestaurant,500,3,0.0003,16,86,0.6233,0.566,0.4527
21,tasd,para,GERestaurant,500,5,0.0003,16,86,0.6208,0.5703,0.4502
69,tasd,mvp,GERestaurant,500,1,0.0001,8,50,0.6181,,
23,tasd,para,GERestaurant,500,2,0.0003,16,86,0.6098,0.5943,0.4387


Unnamed: 0,basic,context,cot,para,mvp
1,0.733,0.7354,0.6502,0.6379,0.6181
2,0.7087,0.7284,0.703,0.6098,0.6357
3,0.6768,0.7221,0.6869,0.6233,0.6497
4,0.722,0.7495,0.6749,0.6486,0.6282
5,0.6932,0.71,0.7015,0.6208,0.6494


Unnamed: 0,W,pval,normal
context,0.997633,0.998378,True


Unnamed: 0,W,pval,normal
para,0.964868,0.841405,True


Unnamed: 0,W,pval,normal
mvp,0.911641,0.477515,True


    split   prompt      f1
0       1  context  0.7354
1       2  context  0.7284
2       3  context  0.7221
3       4  context  0.7495
4       5  context  0.7100
5       1     para  0.6379
6       2     para  0.6098
7       3     para  0.6233
8       4     para  0.6486
9       5     para  0.6208
10      1      mvp  0.6181
11      2      mvp  0.6357
12      3      mvp  0.6497
13      4      mvp  0.6282
14      5      mvp  0.6494
Repeated Measures ANOVA Result:
   Source  ddof1  ddof2          F     p-unc       ng2       eps
0  prompt      2      8  63.028139  0.000013  0.925183  0.628415


  W = np.prod(eig) / (eig.sum() / d) ** d


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,t-test,context vs para,0.72908,0.013192,0.62808,0.01362,20.921693,3.1e-05,9.3e-05,True
1,t-test,context vs mvp,0.72908,0.013192,0.63622,0.012234,7.757898,0.001488,0.002976,True
2,t-test,para vs mvp,0.62808,0.01362,0.63622,0.012234,-0.705479,0.519429,0.519429,False


In [40]:
args.task = 'tasd'

computeLowResourceStatistics(args)

Unnamed: 0,1000,500,full
1,0.7067,0.733,0.7123
2,0.7114,0.7087,0.7362
3,0.7405,0.6768,0.7672
4,0.7855,0.722,0.7578
5,0.7572,0.6932,0.7832


Unnamed: 0,W,pval,normal
1000,0.937161,0.645943,True


Unnamed: 0,W,pval,normal
500,0.978466,0.926253,True


Unnamed: 0,W,pval,normal
full,0.973885,0.89955,True


    split prompt      f1
0       1   1000  0.7067
1       2   1000  0.7114
2       3   1000  0.7405
3       4   1000  0.7855
4       5   1000  0.7572
5       1    500  0.7330
6       2    500  0.7087
7       3    500  0.6768
8       4    500  0.7220
9       5    500  0.6932
10      1   full  0.7123
11      2   full  0.7362
12      3   full  0.7672
13      4   full  0.7578
14      5   full  0.7832
Repeated Measures ANOVA Result:
   Source  ddof1  ddof2         F     p-unc       ng2       eps
0  prompt      2      8  3.546072  0.078951  0.365519  0.698911
Results for LR-Comparison of :  short


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,t-test,1000 vs 500,0.74026,0.029301,0.70674,0.02003,1.7585,0.153485,0.306971,False
1,t-test,1000 vs full,0.74026,0.029301,0.75134,0.024745,-1.059149,0.349252,0.349252,False
2,t-test,500 vs full,0.70674,0.02003,0.75134,0.024745,-2.127099,0.100535,0.301606,False


Unnamed: 0,1000,500,full
1,0.7324,0.7354,0.7433
2,0.72,0.7284,0.7346
3,0.7426,0.7221,0.7663
4,0.7792,0.7495,0.7625
5,0.716,0.71,0.7751


Unnamed: 0,W,pval,normal
1000,0.880566,0.311884,True


Unnamed: 0,W,pval,normal
500,0.997633,0.998378,True


Unnamed: 0,W,pval,normal
full,0.931518,0.606756,True


    split prompt      f1
0       1   1000  0.7324
1       2   1000  0.7200
2       3   1000  0.7426
3       4   1000  0.7792
4       5   1000  0.7160
5       1    500  0.7354
6       2    500  0.7284
7       3    500  0.7221
8       4    500  0.7495
9       5    500  0.7100
10      1   full  0.7433
11      2   full  0.7346
12      3   full  0.7663
13      4   full  0.7625
14      5   full  0.7751


  W = np.prod(eig) / (eig.sum() / d) ** d


Repeated Measures ANOVA Result:
   Source  ddof1  ddof2         F     p-unc       ng2       eps
0  prompt      2      8  3.440272  0.083538  0.297767  0.766786
Results for LR-Comparison of :  long


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,t-test,1000 vs 500,0.73804,0.022618,0.72908,0.013192,1.256814,0.277207,0.416486,False
1,t-test,1000 vs full,0.73804,0.022618,0.75636,0.015045,-1.499011,0.208243,0.416486,False
2,t-test,500 vs full,0.72908,0.013192,0.75636,0.015045,-2.330472,0.080211,0.240632,False


Unnamed: 0,1000,500,full
1,0.6451,0.6502,0.7502
2,0.6805,0.703,0.7242
3,0.6989,0.6869,0.7386
4,0.7119,0.6749,0.7365
5,0.6999,0.7015,0.6755


Unnamed: 0,W,pval,normal
1000,0.883812,0.326942,True


Unnamed: 0,W,pval,normal
500,0.907757,0.454228,True


Unnamed: 0,W,pval,normal
full,0.829285,0.137413,True


    split prompt      f1
0       1   1000  0.6451
1       2   1000  0.6805
2       3   1000  0.6989
3       4   1000  0.7119
4       5   1000  0.6999
5       1    500  0.6502
6       2    500  0.7030
7       3    500  0.6869
8       4    500  0.6749
9       5    500  0.7015
10      1   full  0.7502
11      2   full  0.7242
12      3   full  0.7386
13      4   full  0.7365
14      5   full  0.6755
Repeated Measures ANOVA Result:
   Source  ddof1  ddof2         F     p-unc     ng2       eps
0  prompt      2      8  3.263967  0.091948  0.3976  0.672939
Results for LR-Comparison of :  cot


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,t-test,1000 vs 500,0.68726,0.023348,0.6833,0.019486,0.399063,0.710234,0.710234,False
1,t-test,1000 vs full,0.68726,0.023348,0.725,0.026089,-1.818846,0.143069,0.357654,False
2,t-test,500 vs full,0.6833,0.019486,0.725,0.026089,-1.976943,0.119218,0.357654,False


Unnamed: 0,1000,500,full
1,0.7067,0.7354,0.7433
2,0.7114,0.7284,0.7346
3,0.7405,0.7221,0.7663
4,0.7855,0.7495,0.7625
5,0.7572,0.71,0.7751


Unnamed: 0,W,pval,normal
1000,0.937161,0.645943,True


Unnamed: 0,W,pval,normal
500,0.997633,0.998378,True


Unnamed: 0,W,pval,normal
full,0.931518,0.606756,True


    split prompt      f1
0       1   1000  0.7067
1       2   1000  0.7114
2       3   1000  0.7405
3       4   1000  0.7855
4       5   1000  0.7572
5       1    500  0.7354
6       2    500  0.7284
7       3    500  0.7221
8       4    500  0.7495
9       5    500  0.7100
10      1   full  0.7433
11      2   full  0.7346
12      3   full  0.7663
13      4   full  0.7625
14      5   full  0.7751
Repeated Measures ANOVA Result:
   Source  ddof1  ddof2       F     p-unc       ng2       eps
0  prompt      2      8  2.4564  0.147326  0.230039  0.839269
Results for LR-Comparison of best Prompt per LR-Setting


  W = np.prod(eig) / (eig.sum() / d) ** d


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,t-test,1000 vs 500,0.74026,0.029301,0.72908,0.013192,0.758064,0.490621,0.490621,False
1,t-test,1000 vs full,0.74026,0.029301,0.75636,0.015045,-1.572398,0.190962,0.381923,False
2,t-test,500 vs full,0.72908,0.013192,0.75636,0.015045,-2.330472,0.080211,0.240632,False


## Create Latex

In [42]:
import pandas as pd

def extract_means_and_stds(stats):
    """ 
    Extract the mean and std values for two methods from the statistical dataframe.
    """
    # Initialize a dictionary to store extracted values
    results = {
        "acd": {},
        "acsa": {},
        "e2e": {},
        "tasd": {}
    }
    for task, dfs in stats.items():
        for lr_setting, df in dfs.items():
            if df is not None:
                results[task][lr_setting] = {}
                # Find the row in the dataframe that corresponds to the comparison between method1 and method2
                for _, row in df.iterrows():
                    comparison = row['comparison']
                    mean1, mean2 = row['mean 1'], row['mean 2']
                    std1, std2 = row['std 1'], row['std 2']
            
                    # Map the means and stds to the correct methods
                    method1, method2 = comparison.split(' vs ')
                    
                    if method1 not in results[task][lr_setting].keys():
                        results[task][lr_setting][method1] = {'mean': None, 'std': None}
                        results[task][lr_setting][method1]['mean'], results[task][lr_setting][method1]['std'] = mean1*100, std1*100
                        
                    if method2 not in results[task][lr_setting].keys():
                        results[task][lr_setting][method2] = {'mean': None, 'std': None} 
                        results[task][lr_setting][method2]['mean'], results[task][lr_setting][method2]['std'] = mean2*100, std2*100
    
    return results

def create_full_latex_row(task_results, resource_setting):
    """
    Creates a full LaTeX row for a specific resource setting across all tasks.
    
    Parameters:
    - task_results: A dictionary containing results_dicts for all tasks (e.g., {'ACD': results_dict1, 'ACSA': results_dict2, 'tasd': results_dict3}).
    - resource_setting: The resource setting (e.g., 'Full', '1000', '500').
    
    Returns:
    - A LaTeX formatted string representing a full row of the table.
    """

    rs_text = resource_setting if resource_setting != '0' else 'Full'
    latex_row = r"\multicolumn{1}{r|}{" + rs_text + "} & "

    for task, results_dict in task_results.items():
        if task in ['acd', 'acsa']:
            task_methods = ['mlcf', 'hier-gcn']
        elif task == 'e2e':
            task_methods = ['instructAbsa', 'tas-bert']
        elif task == 'tasd':
            task_methods = ['e2tp', 'para']
            
        if resource_setting in results_dict.keys():
            methods = results_dict[resource_setting]
            # Find the method with the highest mean value among short, long, cot
            highest_prompt = [prompt_style for prompt_style in methods if prompt_style in ['short', 'long', 'cot']][0]
            highest_method = max(task_methods + [highest_prompt], key=lambda x: methods[x]['mean'])
            # Initialize LaTeX row string
            if highest_prompt == highest_method:
                latex_row += (
                    r"\scalebox{0.95}{\textbf{" + f"{methods[highest_prompt]['mean']:.2f}" + "}} & "
                )
            else:
                latex_row += (
                    f"{methods[highest_prompt]['mean']:.2f}" + " & "
                )
            
            # Add the remaining methods
            for i, method in enumerate(task_methods):
                if method in methods.keys():
                    if method == highest_method:
                        latex_row += (
                            r"\multicolumn{1}{c" + f"{'|' if(i == 1 and task != 'tasd') else ''}" +
                            r"}{\scalebox{0.95}{\textbf{" + f"{methods[method]['mean']:.2f}" +
                            r"}}} & "
                        )
                    else:
                        latex_row += (
                            r"\multicolumn{1}{c" + f"{'|' if(i == 1 and task != 'tasd') else ''}" +
                            r"}{" + f"{methods[method]['mean']:.2f}" +
                            r"} & "
                        )
                else:
                    latex_row += r"\multicolumn{1}{l|}{N/A} & "
            
    # Remove the trailing '&' and replace with '\\'
    latex_row = latex_row.rstrip(" & ") + r" \\"
    
    return latex_row
    
results_dict = extract_means_and_stds({'acd':stats_acd, 'acsa':stats_acsa, 'e2e': stats_e2e, 'tasd':stats_tasd})

latex = []
latex.append(create_full_latex_row(results_dict, '0'))
latex.append(create_full_latex_row(results_dict, '1000'))
latex.append(create_full_latex_row(results_dict, '500'))

for l in latex:
    print(l)
    print("&")


\multicolumn{1}{r|}{Full} & 87.88 & \multicolumn{1}{c}{\scalebox{0.95}{\textbf{92.29}}} & \multicolumn{1}{c|}{89.71} & \scalebox{0.95}{\textbf{83.64}} & \multicolumn{1}{c}{83.17} & \multicolumn{1}{c|}{82.49} & \scalebox{0.95}{\textbf{80.60}} & \multicolumn{1}{c}{71.50} & \multicolumn{1}{c|}{71.05} & \scalebox{0.95}{\textbf{75.64}} & \multicolumn{1}{c}{69.51} & \multicolumn{1}{c}{70.39} \\
&
\multicolumn{1}{r|}{1000} & 86.65 & \multicolumn{1}{c}{\scalebox{0.95}{\textbf{91.22}}} & \multicolumn{1}{c|}{87.41} & 80.57 & \multicolumn{1}{c}{\scalebox{0.95}{\textbf{84.36}}} & \multicolumn{1}{c|}{79.25} & \scalebox{0.95}{\textbf{78.43}} & \multicolumn{1}{c}{69.96} & \multicolumn{1}{c|}{66.66} & \scalebox{0.95}{\textbf{74.03}} & \multicolumn{1}{c}{66.19} & \multicolumn{1}{c}{66.58} \\
&
\multicolumn{1}{r|}{500} & 86.12 & \multicolumn{1}{c}{\scalebox{0.95}{\textbf{90.13}}} & \multicolumn{1}{c|}{85.88} & \scalebox{0.95}{\textbf{81.83}} & \multicolumn{1}{c}{81.69} & \multicolumn{1}{c|}{76.62} & \sc

In [38]:
results_dict

{'acd': {'0': {'short': {'mean': 87.876, 'std': 0.9352133446438845},
   'long': {'mean': 87.82799999999999, 'std': 0.8478537609753235},
   'hier-gcn': {'mean': 89.71441699450195, 'std': 0.7938397466859362},
   'mlcf': {'mean': 92.294, 'std': 0.6201806188522835}},
  '1000': {'short': {'mean': 86.64999999999999, 'std': 1.8934941246277988},
   'long': {'mean': 86.44800000000001, 'std': 1.9051761073454596},
   'hier-gcn': {'mean': 87.41157913823318, 'std': 0.98652357127297},
   'mlcf': {'mean': 91.22200000000001, 'std': 0.882142845575477}},
  '500': {'short': {'mean': 86.11999999999999, 'std': 2.0057517294022196},
   'long': {'mean': 83.23599999999999, 'std': 0.7362227923665495},
   'hier-gcn': {'mean': 85.8844883418226, 'std': 1.5575183001170727},
   'mlcf': {'mean': 90.126, 'std': 0.615844136125365}}},
 'acsa': {'0': {'short': {'mean': 83.64, 'std': 1.5432303781354233},
   'long': {'mean': 83.21600000000001, 'std': 2.9146842024480124},
   'cot': {'mean': 82.53999999999999, 'std': 1.43481

## Performance Comparison of Extraction of ABSA-Tuple Elements over different ABSA Subtasks

In [151]:
# Additional Eval

runs = []
RESULTS_PATH = '../results/ft_llm/'
col_names = ['task', 'dataset', 'prompt', 'learning_rate', 'lora_r', 'lora_alpha', 'lora_dropout', 'split', 'lr_setting', 'epoch', 'model_config', 'path', 'f1-micro', 'f1-macro', 'accuracy']
folder_names = [folder for folder in os.listdir(RESULTS_PATH) if os.path.isdir(os.path.join(RESULTS_PATH, folder)) and folder != '.ipynb_checkpoints']

for folder_name in folder_names:
    try:
        cond_parameters = folder_name.split('_')
        filename = ''
        
        filename = 'metrics_asp.tsv'
        
            
        df = pd.read_csv(os.path.join(RESULTS_PATH, folder_name, filename), sep = '\t')
        df = df.set_index(df.columns[0])

        model_config = cond_parameters.copy()
        
        # Remove split column from config string
        model_config_full = model_config.copy()
        model_config.pop(7)

        # Remove epoch column from config string
        model_config.pop(-1)
        
        cond_parameters.append('_'.join(model_config))
        cond_parameters.append('_'.join(model_config_full))
        cond_parameters.append(df.loc['Micro-AVG', 'f1'])
        cond_parameters.append(df.loc['Macro-AVG', 'f1'])
        cond_parameters.append(df.loc['Micro-AVG', 'accuracy'])
        runs.append(cond_parameters)
    except:
        pass

results_all = pd.DataFrame(runs, columns = col_names)

# Define the columns as multi-index
columns = pd.MultiIndex.from_tuples([
    ('full', 'short'), ('full', 'long'), ('full', 'cot'),
    ('1000', 'short'), ('1000', 'long'), ('1000', 'cot'),
    ('500', 'short'), ('500', 'long'), ('500', 'cot')
])

# Define the row indices
index = ['acd', 'acsa', 'tasd']

# Create an empty DataFrame with the defined structure
df = pd.DataFrame(np.nan, index=index, columns=columns)

for col in df.columns:
    df[col] = 'N/A'

for task in ['acd', 'acsa', 'tasd']:
    for lr_setting in ['full', '1000', '500']:
        for a, b in results_all[np.logical_and.reduce([results_all['dataset'] == 'GERestaurant', 
                                                         results_all['task'] == task,
                                                         results_all['split'] != str(0),
                                                         results_all['lr_setting'] == lr_setting])].groupby(['model_config']):
            prompt_name = a[0].split('_')[2]
            df.at[task, (lr_setting, prompt_name)] = f"{b['f1-micro'].mean()*100:.2f}"
            
print('Aspect Extraction')
display(df)

print(f"Average difference ACSA to ACD: {(np.mean([float(i) for i in list(df.loc['acsa'])]) - np.mean([float(i) for i in list(df.loc['acd']) if i != 'N/A'])):.2f}")

print(f"Average difference TASD to ACSA: {(np.mean([float(i) for i in list(df.loc['tasd'])]) - np.mean([float(i) for i in list(df.loc['acsa'])])):.2f}")

print(f"Average difference TASD to ACD: {(np.mean([float(i) for i in list(df.loc['tasd'])]) - np.mean([float(i) for i in list(df.loc['acd'])  if i != 'N/A'])):.2f}")

Aspect Extraction


Unnamed: 0_level_0,full,full,full,1000,1000,1000,500,500,500
Unnamed: 0_level_1,short,long,cot,short,long,cot,short,long,cot
acd,87.87,87.83,,86.65,86.45,,86.12,83.24,
acsa,86.58,86.34,85.77,83.16,79.17,83.52,82.52,82.96,85.09
tasd,87.01,87.69,86.63,87.14,86.11,85.32,86.6,86.16,83.81


Average difference ACSA to ACD: -2.46
Average difference TASD to ACSA: 2.37
Average difference TASD to ACD: -0.09


In [145]:
# Additional Eval

runs = []
RESULTS_PATH = '../results/ft_llm/'
col_names = ['task', 'dataset', 'prompt', 'learning_rate', 'lora_r', 'lora_alpha', 'lora_dropout', 'split', 'lr_setting', 'epoch', 'model_config', 'path', 'f1-micro', 'f1-macro', 'accuracy']
folder_names = [folder for folder in os.listdir(RESULTS_PATH) if os.path.isdir(os.path.join(RESULTS_PATH, folder)) and folder != '.ipynb_checkpoints']

for folder_name in folder_names:
    try:
        cond_parameters = folder_name.split('_')
        filename = ''
        
        filename = 'metrics_asp_pol.tsv'
        
            
        df = pd.read_csv(os.path.join(RESULTS_PATH, folder_name, filename), sep = '\t')
        df = df.set_index(df.columns[0])

        model_config = cond_parameters.copy()
        
        # Remove split column from config string
        model_config_full = model_config.copy()
        model_config.pop(7)

        # Remove epoch column from config string
        model_config.pop(-1)
        
        cond_parameters.append('_'.join(model_config))
        cond_parameters.append('_'.join(model_config_full))
        cond_parameters.append(df.loc['Micro-AVG', 'f1'])
        cond_parameters.append(df.loc['Macro-AVG', 'f1'])
        cond_parameters.append(df.loc['Micro-AVG', 'accuracy'])
        runs.append(cond_parameters)
    except:
        pass

results_all = pd.DataFrame(runs, columns = col_names)

# Define the columns as multi-index
columns = pd.MultiIndex.from_tuples([
    ('full', 'short'), ('full', 'long'), ('full', 'cot'),
    ('1000', 'short'), ('1000', 'long'), ('1000', 'cot'),
    ('500', 'short'), ('500', 'long'), ('500', 'cot')
])

# Define the row indices
index = ['acd', 'acsa', 'tasd']

# Create an empty DataFrame with the defined structure
df = pd.DataFrame(np.nan, index=index, columns=columns)

for col in df.columns:
    df[col] = 'N/A'

for task in ['acd', 'acsa', 'tasd']:
    for lr_setting in ['full', '1000', '500']:
        for a, b in results_all[np.logical_and.reduce([results_all['dataset'] == 'GERestaurant', 
                                                         results_all['task'] == task,
                                                         results_all['split'] != str(0),
                                                         results_all['lr_setting'] == lr_setting])].groupby(['model_config']):

            prompt_name = a[0].split('_')[2]
            df.at[task, (lr_setting, prompt_name)] = f"{b['f1-micro'].mean()*100:.2f}"
            
print('Aspect + Polarity Extraction')
display(df)

f"Average difference: {(np.mean([float(i) for i in list(df.loc['tasd'])]) - np.mean([float(i) for i in list(df.loc['acsa'])])):.2f}"


Aspect + Polarity Extraction


Unnamed: 0_level_0,full,full,full,1000,1000,1000,500,500,500
Unnamed: 0_level_1,short,long,cot,short,long,cot,short,long,cot
acd,,,,,,,,,
acsa,83.64,83.22,82.54,79.61,76.24,80.57,78.91,80.11,81.83
tasd,83.75,85.1,83.46,83.89,83.27,82.25,82.34,82.96,80.19


'Average difference: 2.28'

In [150]:
# Eval for best parameter combination over all tasks and dataset sizes

RESULTS_PATH = '../results/ft_llm/'
DATASET = 'GERestaurant'

col_names = ['task', 'dataset', 'prompt', 'learning_rate', 'lora_r', 'lora_alpha', 'lora_dropout', 'split', 'lr_setting', 'epoch', 'model_config', 'path', 'f1-micro', 'f1-macro', 'accuracy']
folder_names = [folder for folder in os.listdir(os.path.join(RESULTS_PATH)) if os.path.isdir(os.path.join(RESULTS_PATH, folder)) and folder != '.ipynb_checkpoints']
runs = []

for folder_name in folder_names:
    try:
        cond_parameters = folder_name.split('_')
        filename = ''
        
        if cond_parameters[0] == 'acd':
            filename = 'metrics_asp.tsv'
        elif cond_parameters[0] == 'acsa':
            filename = 'metrics_asp_pol.tsv'
        elif cond_parameters[0] == 'e2e':
            filename = 'metrics_pol.tsv'
        elif cond_parameters[0] == 'tasd':
            filename = 'metrics_phrases.tsv'
            
        df = pd.read_csv(os.path.join(RESULTS_PATH, folder_name, filename), sep = '\t')
        df = df.set_index(df.columns[0])

        model_config = cond_parameters.copy()
        
        # Remove split column from config string
        model_config_full = model_config.copy()
        model_config.pop(7)

        # Remove epoch column from config string
        model_config.pop(-1)
        
        cond_parameters.append('_'.join(model_config))
        cond_parameters.append('_'.join(model_config_full))
        cond_parameters.append(df.loc['Micro-AVG', 'f1'])
        cond_parameters.append(df.loc['Macro-AVG', 'f1'])
        cond_parameters.append(df.loc['Micro-AVG', 'accuracy'])
        runs.append(cond_parameters)
    except:
        pass

results_all = pd.DataFrame(runs, columns = col_names)

results_sub = results_all[np.logical_and.reduce([results_all['dataset'] == DATASET, results_all['split'] == '0'])].sort_values(by = ['f1-micro'], ascending = False)
results_sub = results_sub[results_sub['lr_setting'] != 'orig']
results_sub = results_sub[['dataset', 'task', 'prompt', 'learning_rate', 'lr_setting', 'lora_r', 'lora_alpha', 'epoch', 'f1-micro', 'f1-macro']]
results_sub = results_sub.reset_index()

idx_max = results_sub.groupby(['lr_setting', 'task', 'prompt', 'learning_rate', 'lora_r', 'lora_alpha'])['f1-micro'].idxmax()
results_per_epoch = results_sub.loc[idx_max]

results_per_epoch.groupby(['learning_rate', 'lora_r', 'lora_alpha'])['f1-micro'].mean()

learning_rate  lora_r  lora_alpha
0.0003         32      32            0.818427
                       64            0.782661
               8       16            0.825533
                       8             0.830473
3e-05          32      32            0.818609
                       64            0.823300
               8       16            0.812015
                       8             0.798288
Name: f1-micro, dtype: float64