## Language

In [26]:
import pandas as pd
import os
import sys
import numpy as np
import pandas as pd
from scipy.stats import kruskal, mannwhitneyu
from sklearn.metrics import f1_score
from sklearn.utils import resample
from itertools import combinations

import random
import scikit_posthocs as sp
import scipy.stats as stats
import numpy as np

utils = os.path.abspath('../src/utils/')
sys.path.append(utils)

from preprocessing import loadDataset
from evaluation import extractAspects, convertLabels, createResults
from types import SimpleNamespace
from pingouin import kruskal
import pingouin as pg
import chardet
import codecs

# Notebook-wide setup: show every column of displayed frames and fix the RNG seed.
pd.set_option('display.max_columns', None)
random.seed(42)

# One container per task; each maps a low-resource setting key ('0', '1000',
# '500') to the pairwise-test table returned by computePromptStatistics.
stats_acd, stats_acsa, stats_e2e, stats_tasd = {}, {}, {}, {}

# Shared argument namespace; the cells below set `results`, `task` and
# `lr_setting` on it before each call.
args = SimpleNamespace(dataset='GERestaurant')

RESULTS_PATH = '../results/ft_llm/'
N_SAMPLES = 1000

# Column layout of the combined results table: the first ten entries are the
# '_'-separated fields of a run folder name, the remaining five are derived.
col_names = [
    'task', 'dataset', 'prompt', 'learning_rate', 'lora_r', 'lora_alpha',
    'lora_dropout', 'split', 'lr_setting', 'epoch',
    'model_config', 'path', 'f1-micro', 'f1-macro', 'accuracy',
]

# Every run directory below RESULTS_PATH, ignoring Jupyter's checkpoint folder.
folder_names = [
    entry
    for entry in os.listdir(RESULTS_PATH)
    if entry != '.ipynb_checkpoints' and os.path.isdir(os.path.join(RESULTS_PATH, entry))
]

def computePromptStatistics(args):
    """Compare prompt styles for one task and low-resource setting.

    For every (model_config, split) group the row with the highest 'f1-micro'
    is kept. The per-split scores of each prompt style are then compared with
    an omnibus test (repeated measures ANOVA if every prompt column passes the
    normality test, Friedman test otherwise) and with pairwise tests (paired
    t-test or Wilcoxon, respectively). Pairwise p-values are Holm-corrected.

    Intermediate tables and omnibus results are shown via ``display``/``print``.

    Parameters
    ----------
    args : namespace with the attributes
        results    -- DataFrame with (at least) the columns selected below
        dataset    -- dataset name to filter on
        task       -- task name to filter on; 'acd' compares 'basic' and
                      'context', every other task additionally 'cot'
        lr_setting -- 0 selects the rows labelled 'full'; any other value is
                      matched as its string form

    Returns
    -------
    pandas.DataFrame
        One row per prompt pair with the test used, mean/std of both prompts
        (``np.std`` is called without ``ddof``), the test statistic, the raw
        and Holm-corrected p-value and the significance flag.

    Raises
    ------
    ValueError
        If fewer than two prompt styles have a score for all splits 1-5.
    """
    lr_setting = 'full' if args.lr_setting == 0 else str(args.lr_setting)

    all_results = args.results
    selection = (
        (all_results['dataset'] == args.dataset)
        & (all_results['task'] == args.task)
        & (all_results['split'] != str(0))
        & (all_results['lr_setting'] == lr_setting)
    )
    results_sub = all_results[selection].sort_values(by=['f1-micro'], ascending=False)
    results_sub = results_sub[['dataset', 'task', 'prompt', 'learning_rate', 'lora_r', 'lora_alpha', 'lora_dropout', 'split', 'lr_setting', 'epoch', 'model_config', 'path', 'f1-micro', 'f1-macro', 'accuracy']]

    # Keep, per configuration and split, the row with the best micro F1.
    idx_max = results_sub.groupby(['model_config', 'split'])['f1-micro'].idxmax()
    results_per_epoch = results_sub.loc[idx_max]

    prompts = ['basic', 'context'] if args.task == 'acd' else ['basic', 'context', 'cot']

    f1_prompts = {}
    for prompt in prompts:
        prompt_rows = results_per_epoch[results_per_epoch['prompt'] == prompt]
        f1 = {}
        try:
            for i in range(1, 6):
                # First matching row for this split; IndexError if there is none.
                # NOTE(review): if several model configs share a prompt, this
                # takes the first one in group order, not the best -- confirm.
                f1[i] = prompt_rows.loc[prompt_rows['split'] == str(i), 'f1-micro'].iloc[0]
        except IndexError:
            # A prompt without results for all five splits is left out entirely.
            continue
        f1_prompts[prompt] = f1

    if len(f1_prompts) < 2:
        raise ValueError(
            f"Need complete results (splits 1-5) for at least two prompts to compare; "
            f"found {sorted(f1_prompts)} for task={args.task!r}, lr_setting={lr_setting!r}."
        )

    df_prompts = pd.DataFrame(f1_prompts)

    display(df_prompts)

    normality_results = {col: pg.normality(df_prompts[col]) for col in df_prompts.columns}

    for item in normality_results.values():
        display(item)

    all_normal = all(result['normal'].iloc[0] for result in normality_results.values())

    # Long format (split, prompt, f1) as required by the repeated measures ANOVA.
    df_long = (
        df_prompts.melt(var_name='prompt', value_name='f1', ignore_index=False)
        .reset_index()
        .rename(columns={'index': 'split'})
    )
    print(df_long)

    if all_normal:
        # All columns are normally distributed: repeated measures ANOVA.
        rm_anova = pg.rm_anova(dv='f1', within='prompt', subject='split', data=df_long)
        print("Repeated Measures ANOVA Result:")
        print(rm_anova)
    else:
        # At least one column is not normally distributed: Friedman test.
        friedman = pg.friedman(df_prompts)
        print("Friedman Test Result:")
        print(friedman)

    # Pairwise comparisons
    results = []
    for col1, col2 in combinations(df_prompts.columns, 2):
        if all_normal:
            # Paired t-test when every prompt column passed the normality test.
            test = 't-test'
            test_result = pg.ttest(df_prompts[col1], df_prompts[col2], paired=True, alternative='two-sided')
            statistic = test_result['T']['T-test']
        else:
            # Otherwise fall back to the Wilcoxon signed-rank test.
            test = 'wilcoxon'
            test_result = pg.wilcoxon(df_prompts[col1], df_prompts[col2], alternative='two-sided')
            statistic = test_result['W-val']['Wilcoxon']

        results.append({
            'test': test,
            'comparison': f'{col1} vs {col2}',
            'mean 1': np.mean(df_prompts[col1]),
            'std 1': np.std(df_prompts[col1]),
            'mean 2': np.mean(df_prompts[col2]),
            'std 2': np.std(df_prompts[col2]),
            'statistic': statistic,
            'p_value': test_result['p-val'].iloc[0]
        })

    # Collect the test results in a DataFrame.
    results_df = pd.DataFrame(results)

    # Holm(-Bonferroni) correction over all pairwise p-values.
    corrected_p = pg.multicomp(results_df['p_value'], method='holm', alpha=0.05)
    results_df['corrected_p_value'] = corrected_p[1]
    results_df['significant'] = corrected_p[0]

    return results_df

In [27]:
# Metrics file that is read for each task prefix of a run folder.
metrics_files = {
    'acd': 'metrics_asp.tsv',
    'acsa': 'metrics_asp_pol.tsv',
    'e2e': 'metrics_pol.tsv',
    'e2e-e': 'metrics_pol.tsv',
    'tasd': 'metrics_phrases.tsv',
}

# A folder name has to provide every column except the five derived ones
# (model_config, path, f1-micro, f1-macro, accuracy) appended below.
n_name_fields = len(col_names) - 5

runs = []
# Folders that could not be turned into a result row, kept for inspection.
skipped_folders = []

for folder_name in folder_names:
    cond_parameters = folder_name.split('_')
    filename = metrics_files.get(cond_parameters[0])

    # Unknown task prefix or unexpected number of fields: a row built from this
    # folder would not line up with col_names.
    if filename is None or len(cond_parameters) != n_name_fields:
        skipped_folders.append(folder_name)
        continue

    try:
        df = pd.read_csv(os.path.join(RESULTS_PATH, folder_name, filename), sep='\t')
        df = df.set_index(df.columns[0])
        f1_micro = df.loc['Micro-AVG', 'f1']
        f1_macro = df.loc['Macro-AVG', 'f1']
        accuracy = df.loc['Micro-AVG', 'accuracy']
    except (OSError, KeyError, IndexError, ValueError):
        # Missing, unreadable or incomplete metrics file: skip this run only.
        skipped_folders.append(folder_name)
        continue

    model_config_full = cond_parameters.copy()

    # Config string without the split field (position 7) and the trailing epoch
    # field, so all splits/epochs of one configuration share the same key.
    model_config = cond_parameters.copy()
    model_config.pop(7)
    model_config.pop(-1)

    cond_parameters.append('_'.join(model_config))
    cond_parameters.append('_'.join(model_config_full))
    cond_parameters.append(f1_micro)
    cond_parameters.append(f1_macro)
    cond_parameters.append(accuracy)
    runs.append(cond_parameters)

results_all = pd.DataFrame(runs, columns = col_names)

args.results = results_all

## ACD

### Full Dataset

In [28]:
# ACD on the full training set (lr_setting 0 is mapped to 'full' inside
# computePromptStatistics).
args.lr_setting = 0
args.task = 'acd'

# Store the pairwise-test table under key '0' and show it as the cell output.
stats_acd['0'] = computePromptStatistics(args)
stats_acd['0']

Unnamed: 0,basic,context
1,0.8748,0.8801
2,0.8663,0.8698
3,0.8757,0.8674
4,0.8943,0.8895
5,0.8827,0.8846


Unnamed: 0,W,pval,normal
basic,0.962545,0.825588,True


Unnamed: 0,W,pval,normal
context,0.927728,0.580963,True


   split   prompt      f1
0      1    basic  0.8748
1      2    basic  0.8663
2      3    basic  0.8757
3      4    basic  0.8943
4      5    basic  0.8827
5      1  context  0.8801
6      2  context  0.8698
7      3  context  0.8674
8      4  context  0.8895
9      5  context  0.8846
Repeated Measures ANOVA Result:
   Source  ddof1  ddof2         F     p-unc       ng2  eps
0  prompt      1      4  0.034202  0.862276  0.000722  1.0


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,t-test,basic vs context,0.87876,0.009352,0.87828,0.008479,0.184938,0.862276,0.862276,False


### 1000

In [29]:
# ACD, low-resource setting '1000'.
args.lr_setting = 1000
args.task = 'acd'

# Store the pairwise-test table under key '1000' and show it as the cell output.
stats_acd['1000'] = computePromptStatistics(args)
stats_acd['1000']

Unnamed: 0,basic,context
1,0.8798,0.8698
2,0.8423,0.8509
3,0.8625,0.8555
4,0.8952,0.8993
5,0.8527,0.8469


Unnamed: 0,W,pval,normal
basic,0.970018,0.87537,True


Unnamed: 0,W,pval,normal
context,0.85488,0.210443,True


   split   prompt      f1
0      1    basic  0.8798
1      2    basic  0.8423
2      3    basic  0.8625
3      4    basic  0.8952
4      5    basic  0.8527
5      1  context  0.8698
6      2  context  0.8509
7      3  context  0.8555
8      4  context  0.8993
9      5  context  0.8469
Repeated Measures ANOVA Result:
   Source  ddof1  ddof2         F     p-unc      ng2  eps
0  prompt      1      4  0.322551  0.600441  0.00282  1.0


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,t-test,basic vs context,0.8665,0.018935,0.86448,0.019052,0.567936,0.600441,0.600441,False


### 500

In [30]:
# ACD, low-resource setting '500'.
args.lr_setting = 500
args.task = 'acd'

# Store the pairwise-test table under key '500' and show it as the cell output.
stats_acd['500'] = computePromptStatistics(args)
stats_acd['500']

Unnamed: 0,basic,context
1,0.887,0.8369
2,0.8264,0.8215
3,0.8558,0.8258
4,0.8674,0.8406
5,0.8694,0.837


Unnamed: 0,W,pval,normal
basic,0.943288,0.689265,True


Unnamed: 0,W,pval,normal
context,0.874845,0.286601,True


   split   prompt      f1
0      1    basic  0.8870
1      2    basic  0.8264
2      3    basic  0.8558
3      4    basic  0.8674
4      5    basic  0.8694
5      1  context  0.8369
6      2  context  0.8215
7      3  context  0.8258
8      4  context  0.8406
9      5  context  0.8370
Repeated Measures ANOVA Result:
   Source  ddof1  ddof2          F     p-unc       ng2  eps
0  prompt      1      4  15.944637  0.016223  0.476711  1.0


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,t-test,basic vs context,0.8612,0.020058,0.83236,0.007362,3.993074,0.016223,0.016223,True


## ACSA

### Full Dataset

In [31]:
# ACSA on the full training set (lr_setting 0 is mapped to 'full' inside
# computePromptStatistics).
args.lr_setting = 0
args.task = 'acsa'

# Store the pairwise-test table under key '0' and show it as the cell output.
stats_acsa['0'] = computePromptStatistics(args)
stats_acsa['0']

Unnamed: 0,basic,context,cot
1,0.8348,0.8439,0.8532
2,0.8256,0.7776,0.8123
3,0.8226,0.8356,0.8187
4,0.8659,0.865,0.8234
5,0.8331,0.8387,0.8194


Unnamed: 0,W,pval,normal
basic,0.807002,0.092301,True


Unnamed: 0,W,pval,normal
context,0.850142,0.194967,True


Unnamed: 0,W,pval,normal
cot,0.779059,0.054104,True


    split   prompt      f1
0       1    basic  0.8348
1       2    basic  0.8256
2       3    basic  0.8226
3       4    basic  0.8659
4       5    basic  0.8331
5       1  context  0.8439
6       2  context  0.7776
7       3  context  0.8356
8       4  context  0.8650
9       5  context  0.8387
10      1      cot  0.8532
11      2      cot  0.8123
12      3      cot  0.8187
13      4      cot  0.8234
14      5      cot  0.8194
Repeated Measures ANOVA Result:
   Source  ddof1  ddof2         F     p-unc       ng2       eps
0  prompt      2      8  0.469407  0.641564  0.045427  0.896168


  W = np.prod(eig) / (eig.sum() / d) ** d


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,t-test,basic vs context,0.8364,0.015432,0.83216,0.029147,0.379382,0.723685,1.0,False
1,t-test,basic vs cot,0.8364,0.015432,0.8254,0.014348,1.1228,0.324358,0.973073,False
2,t-test,context vs cot,0.83216,0.029147,0.8254,0.014348,0.514642,0.633934,1.0,False


### 1000

In [32]:
# ACSA, low-resource setting '1000'.
args.lr_setting = 1000
args.task = 'acsa'

# Store the pairwise-test table under key '1000' and show it as the cell output.
stats_acsa['1000'] = computePromptStatistics(args)
stats_acsa['1000']

Unnamed: 0,basic,context,cot
1,0.8314,0.7744,0.773
2,0.7488,0.7457,0.802
3,0.8365,0.7368,0.802
4,0.8479,0.7973,0.8299
5,0.7157,0.7579,0.8216


Unnamed: 0,W,pval,normal
basic,0.833497,0.147732,True


Unnamed: 0,W,pval,normal
context,0.959187,0.802308,True


Unnamed: 0,W,pval,normal
cot,0.935542,0.634617,True


    split   prompt      f1
0       1    basic  0.8314
1       2    basic  0.7488
2       3    basic  0.8365
3       4    basic  0.8479
4       5    basic  0.7157
5       1  context  0.7744
6       2  context  0.7457
7       3  context  0.7368
8       4  context  0.7973
9       5  context  0.7579
10      1      cot  0.7730
11      2      cot  0.8020
12      3      cot  0.8020
13      4      cot  0.8299
14      5      cot  0.8216
Repeated Measures ANOVA Result:
   Source  ddof1  ddof2         F     p-unc       ng2       eps
0  prompt      2      8  1.845857  0.219203  0.218051  0.613688


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,t-test,basic vs context,0.79606,0.05341,0.76242,0.021524,1.38017,0.239653,0.479306,False
1,t-test,basic vs cot,0.79606,0.05341,0.8057,0.019673,-0.316917,0.767158,0.767158,False
2,t-test,context vs cot,0.76242,0.021524,0.8057,0.019673,-3.433759,0.026444,0.079333,False


### 500

In [33]:
# ACSA, low-resource setting '500'.
args.lr_setting = 500
args.task = 'acsa'

# Store the pairwise-test table under key '500' and show it as the cell output.
stats_acsa['500'] = computePromptStatistics(args)
stats_acsa['500']

Unnamed: 0,basic,context,cot
1,0.7748,0.8249,0.8168
2,0.7871,0.7988,0.7859
3,0.7951,0.7935,0.827
4,0.8316,0.8387,0.8436
5,0.7568,0.7496,0.8182


Unnamed: 0,W,pval,normal
basic,0.964037,0.835776,True


Unnamed: 0,W,pval,normal
context,0.950268,0.7391,True


Unnamed: 0,W,pval,normal
cot,0.942179,0.681379,True


    split   prompt      f1
0       1    basic  0.7748
1       2    basic  0.7871
2       3    basic  0.7951
3       4    basic  0.8316
4       5    basic  0.7568
5       1  context  0.8249
6       2  context  0.7988
7       3  context  0.7935
8       4  context  0.8387
9       5  context  0.7496
10      1      cot  0.8168
11      2      cot  0.7859
12      3      cot  0.8270
13      4      cot  0.8436
14      5      cot  0.8182
Repeated Measures ANOVA Result:
   Source  ddof1  ddof2         F     p-unc       ng2       eps
0  prompt      2      8  2.856066  0.115862  0.184125  0.780724


  W = np.prod(eig) / (eig.sum() / d) ** d


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,t-test,basic vs context,0.78908,0.024876,0.8011,0.030645,-1.193289,0.298694,0.597387,False
1,t-test,basic vs cot,0.78908,0.024876,0.8183,0.018803,-2.652001,0.056863,0.170589,False
2,t-test,context vs cot,0.8011,0.030645,0.8183,0.018803,-1.133514,0.32033,0.597387,False


## E2E

In [34]:
# E2E on the full training set (lr_setting 0 is mapped to 'full' inside
# computePromptStatistics).
args.lr_setting = 0
args.task = 'e2e'

# Store the pairwise-test table under key '0' and show it as the cell output.
stats_e2e['0'] = computePromptStatistics(args)
stats_e2e['0']

Unnamed: 0,basic,context,cot
1,0.7923,0.7876,0.7668
2,0.7865,0.75,0.7689
3,0.8,0.7861,0.765
4,0.8335,0.8187,0.7888
5,0.8178,0.8004,0.7719


Unnamed: 0,W,pval,normal
basic,0.9325,0.613512,True


Unnamed: 0,W,pval,normal
context,0.948692,0.727844,True


Unnamed: 0,W,pval,normal
cot,0.79095,0.068237,True


    split   prompt      f1
0       1    basic  0.7923
1       2    basic  0.7865
2       3    basic  0.8000
3       4    basic  0.8335
4       5    basic  0.8178
5       1  context  0.7876
6       2  context  0.7500
7       3  context  0.7861
8       4  context  0.8187
9       5  context  0.8004
10      1      cot  0.7668
11      2      cot  0.7689
12      3      cot  0.7650
13      4      cot  0.7888
14      5      cot  0.7719
Repeated Measures ANOVA Result:
   Source  ddof1  ddof2          F     p-unc       ng2       eps
0  prompt      2      8  12.382448  0.003554  0.392355  0.635156


  W = np.prod(eig) / (eig.sum() / d) ** d


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,t-test,basic vs context,0.80602,0.017317,0.78856,0.02255,3.344447,0.028717,0.057433,False
1,t-test,basic vs cot,0.80602,0.017317,0.77228,0.008573,6.17049,0.003503,0.010508,True
2,t-test,context vs cot,0.78856,0.02255,0.77228,0.008573,1.811055,0.14437,0.14437,False


In [35]:
# E2E, low-resource setting '1000'.
args.lr_setting = 1000
args.task = 'e2e'

# Store the pairwise-test table under key '1000' and show it as the cell output.
stats_e2e['1000'] = computePromptStatistics(args)
stats_e2e['1000']

Unnamed: 0,basic,context,cot
1,0.7992,0.7785,0.7288
2,0.7455,0.7703,0.7223
3,0.7953,0.7758,0.7618
4,0.8136,0.8069,0.7614
5,0.7681,0.7893,0.7713


Unnamed: 0,W,pval,normal
basic,0.937736,0.649977,True


Unnamed: 0,W,pval,normal
context,0.908464,0.458414,True


Unnamed: 0,W,pval,normal
cot,0.848695,0.190429,True


    split   prompt      f1
0       1    basic  0.7992
1       2    basic  0.7455
2       3    basic  0.7953
3       4    basic  0.8136
4       5    basic  0.7681
5       1  context  0.7785
6       2  context  0.7703
7       3  context  0.7758
8       4  context  0.8069
9       5  context  0.7893
10      1      cot  0.7288
11      2      cot  0.7223
12      3      cot  0.7618
13      4      cot  0.7614
14      5      cot  0.7713
Repeated Measures ANOVA Result:
   Source  ddof1  ddof2         F     p-unc       ng2       eps
0  prompt      2      8  7.844823  0.013005  0.417371  0.776503


  W = np.prod(eig) / (eig.sum() / d) ** d


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,t-test,basic vs context,0.78434,0.02437,0.78416,0.012943,0.018381,0.986215,0.986215,False
1,t-test,basic vs cot,0.78434,0.02437,0.74912,0.019676,2.806625,0.048483,0.096966,False
2,t-test,context vs cot,0.78416,0.012943,0.74912,0.019676,4.47664,0.011018,0.033055,True


In [36]:
# E2E, low-resource setting '500'.
args.lr_setting = 500
args.task = 'e2e'

# Store the pairwise-test table under key '500' and show it as the cell output.
stats_e2e['500'] = computePromptStatistics(args)
stats_e2e['500']

Unnamed: 0,basic,context,cot
1,0.7458,0.7271,0.6693
2,0.7606,0.7129,0.7179
3,0.6998,0.6775,0.6729
4,0.7563,0.7069,0.7082
5,0.7301,0.7094,0.7154


Unnamed: 0,W,pval,normal
basic,0.899971,0.409725,True


Unnamed: 0,W,pval,normal
context,0.900947,0.415142,True


Unnamed: 0,W,pval,normal
cot,0.80553,0.089826,True


    split   prompt      f1
0       1    basic  0.7458
1       2    basic  0.7606
2       3    basic  0.6998
3       4    basic  0.7563
4       5    basic  0.7301
5       1  context  0.7271
6       2  context  0.7129
7       3  context  0.6775
8       4  context  0.7069
9       5  context  0.7094
10      1      cot  0.6693
11      2      cot  0.7179
12      3      cot  0.6729
13      4      cot  0.7082
14      5      cot  0.7154
Repeated Measures ANOVA Result:
   Source  ddof1  ddof2         F     p-unc       ng2       eps
0  prompt      2      8  9.403763  0.007931  0.442679  0.754633


  W = np.prod(eig) / (eig.sum() / d) ** d


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,t-test,basic vs context,0.73852,0.022031,0.70676,0.016215,4.613979,0.009926,0.029778,True
1,t-test,basic vs cot,0.73852,0.022031,0.69674,0.021206,3.982654,0.016365,0.03273,True
2,t-test,context vs cot,0.70676,0.016215,0.69674,0.021206,0.82888,0.453768,0.453768,False


### E2E - Explicit only

In [37]:
# E2E restricted to the 'e2e-e' runs, full training set.
args.lr_setting = 0
args.task = 'e2e-e'

# The returned table is only shown as the cell output; it is not stored in any
# stats_* dict.
computePromptStatistics(args)

Unnamed: 0,basic,context,cot
1,0.8237,0.8284,0.8173
2,0.7497,0.6943,0.7918
3,0.8431,0.8365,0.852
4,0.7857,0.8154,0.7652
5,0.8039,0.7256,0.8055


Unnamed: 0,W,pval,normal
basic,0.983488,0.952373,True


Unnamed: 0,W,pval,normal
context,0.833036,0.146574,True


Unnamed: 0,W,pval,normal
cot,0.991887,0.985867,True


    split   prompt      f1
0       1    basic  0.8237
1       2    basic  0.7497
2       3    basic  0.8431
3       4    basic  0.7857
4       5    basic  0.8039
5       1  context  0.8284
6       2  context  0.6943
7       3  context  0.8365
8       4  context  0.8154
9       5  context  0.7256
10      1      cot  0.8173
11      2      cot  0.7918
12      3      cot  0.8520
13      4      cot  0.7652
14      5      cot  0.8055
Repeated Measures ANOVA Result:
   Source  ddof1  ddof2         F     p-unc      ng2       eps
0  prompt      2      8  0.919558  0.437055  0.06875  0.553783


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,t-test,basic vs context,0.80122,0.03213,0.78004,0.058467,1.065356,0.34675,1.0,False
1,t-test,basic vs cot,0.80122,0.03213,0.80636,0.028669,-0.491951,0.64853,1.0,False
2,t-test,context vs cot,0.78004,0.058467,0.80636,0.028669,-0.951289,0.395326,1.0,False


In [38]:
# NOTE(review): this cell sits under the "E2E - Explicit only" heading but runs
# task 'e2e' again and overwrites stats_e2e['1000'] with the same analysis as
# the earlier E2E/1000 cell -- confirm whether 'e2e-e' was intended here.
args.lr_setting = 1000
args.task = 'e2e'

stats_e2e['1000'] = computePromptStatistics(args)
stats_e2e['1000']

Unnamed: 0,basic,context,cot
1,0.7992,0.7785,0.7288
2,0.7455,0.7703,0.7223
3,0.7953,0.7758,0.7618
4,0.8136,0.8069,0.7614
5,0.7681,0.7893,0.7713


Unnamed: 0,W,pval,normal
basic,0.937736,0.649977,True


Unnamed: 0,W,pval,normal
context,0.908464,0.458414,True


Unnamed: 0,W,pval,normal
cot,0.848695,0.190429,True


    split   prompt      f1
0       1    basic  0.7992
1       2    basic  0.7455
2       3    basic  0.7953
3       4    basic  0.8136
4       5    basic  0.7681
5       1  context  0.7785
6       2  context  0.7703
7       3  context  0.7758
8       4  context  0.8069
9       5  context  0.7893
10      1      cot  0.7288
11      2      cot  0.7223
12      3      cot  0.7618
13      4      cot  0.7614
14      5      cot  0.7713
Repeated Measures ANOVA Result:
   Source  ddof1  ddof2         F     p-unc       ng2       eps
0  prompt      2      8  7.844823  0.013005  0.417371  0.776503


  W = np.prod(eig) / (eig.sum() / d) ** d


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,t-test,basic vs context,0.78434,0.02437,0.78416,0.012943,0.018381,0.986215,0.986215,False
1,t-test,basic vs cot,0.78434,0.02437,0.74912,0.019676,2.806625,0.048483,0.096966,False
2,t-test,context vs cot,0.78416,0.012943,0.74912,0.019676,4.47664,0.011018,0.033055,True


In [39]:
# NOTE(review): this cell sits under the "E2E - Explicit only" heading but runs
# task 'e2e' again and overwrites stats_e2e['500'] with the same analysis as
# the earlier E2E/500 cell -- confirm whether 'e2e-e' was intended here.
args.lr_setting = 500
args.task = 'e2e'

stats_e2e['500'] = computePromptStatistics(args)
stats_e2e['500']

Unnamed: 0,basic,context,cot
1,0.7458,0.7271,0.6693
2,0.7606,0.7129,0.7179
3,0.6998,0.6775,0.6729
4,0.7563,0.7069,0.7082
5,0.7301,0.7094,0.7154


Unnamed: 0,W,pval,normal
basic,0.899971,0.409725,True


Unnamed: 0,W,pval,normal
context,0.900947,0.415142,True


Unnamed: 0,W,pval,normal
cot,0.80553,0.089826,True


    split   prompt      f1
0       1    basic  0.7458
1       2    basic  0.7606
2       3    basic  0.6998
3       4    basic  0.7563
4       5    basic  0.7301
5       1  context  0.7271
6       2  context  0.7129
7       3  context  0.6775
8       4  context  0.7069
9       5  context  0.7094
10      1      cot  0.6693
11      2      cot  0.7179
12      3      cot  0.6729
13      4      cot  0.7082
14      5      cot  0.7154
Repeated Measures ANOVA Result:
   Source  ddof1  ddof2         F     p-unc       ng2       eps
0  prompt      2      8  9.403763  0.007931  0.442679  0.754633


  W = np.prod(eig) / (eig.sum() / d) ** d


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,t-test,basic vs context,0.73852,0.022031,0.70676,0.016215,4.613979,0.009926,0.029778,True
1,t-test,basic vs cot,0.73852,0.022031,0.69674,0.021206,3.982654,0.016365,0.03273,True
2,t-test,context vs cot,0.70676,0.016215,0.69674,0.021206,0.82888,0.453768,0.453768,False


## TASD

### Full Dataset

In [40]:
# TASD on the full training set (lr_setting 0 is mapped to 'full' inside
# computePromptStatistics).
args.lr_setting = 0
args.task = 'tasd'

# Store the pairwise-test table under key '0' and show it as the cell output.
stats_tasd['0'] = computePromptStatistics(args)
stats_tasd['0']

Unnamed: 0,basic,context,cot
1,0.7123,0.7433,0.7502
2,0.7362,0.7346,0.7242
3,0.7672,0.7663,0.7386
4,0.7578,0.7625,0.7365
5,0.7832,0.7751,0.6755


Unnamed: 0,W,pval,normal
basic,0.973885,0.89955,True


Unnamed: 0,W,pval,normal
context,0.931518,0.606756,True


Unnamed: 0,W,pval,normal
cot,0.829285,0.137413,True


    split   prompt      f1
0       1    basic  0.7123
1       2    basic  0.7362
2       3    basic  0.7672
3       4    basic  0.7578
4       5    basic  0.7832
5       1  context  0.7433
6       2  context  0.7346
7       3  context  0.7663
8       4  context  0.7625
9       5  context  0.7751
10      1      cot  0.7502
11      2      cot  0.7242
12      3      cot  0.7386
13      4      cot  0.7365
14      5      cot  0.6755
Repeated Measures ANOVA Result:
   Source  ddof1  ddof2         F     p-unc       ng2       eps
0  prompt      2      8  1.839446  0.220168  0.271939  0.529002


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,t-test,basic vs context,0.75134,0.024745,0.75636,0.015045,-0.73773,0.501616,0.647776,False
1,t-test,basic vs cot,0.75134,0.024745,0.725,0.026089,1.124042,0.323888,0.647776,False
2,t-test,context vs cot,0.75636,0.015045,0.725,0.026089,1.726053,0.159417,0.478251,False


### 1000

In [41]:
# TASD, low-resource setting '1000'.
args.lr_setting = 1000
args.task = 'tasd'

# Store the pairwise-test table under key '1000' and show it as the cell output.
stats_tasd['1000'] = computePromptStatistics(args)
stats_tasd['1000']

Unnamed: 0,basic,context,cot
1,0.7067,0.7324,0.6451
2,0.7114,0.72,0.6805
3,0.7405,0.7426,0.6989
4,0.7855,0.7792,0.7119
5,0.7572,0.716,0.6999


Unnamed: 0,W,pval,normal
basic,0.937161,0.645943,True


Unnamed: 0,W,pval,normal
context,0.880566,0.311884,True


Unnamed: 0,W,pval,normal
cot,0.883812,0.326942,True


    split   prompt      f1
0       1    basic  0.7067
1       2    basic  0.7114
2       3    basic  0.7405
3       4    basic  0.7855
4       5    basic  0.7572
5       1  context  0.7324
6       2  context  0.7200
7       3  context  0.7426
8       4  context  0.7792
9       5  context  0.7160
10      1      cot  0.6451
11      2      cot  0.6805
12      3      cot  0.6989
13      4      cot  0.7119
14      5      cot  0.6999
Repeated Measures ANOVA Result:
   Source  ddof1  ddof2          F     p-unc       ng2       eps
0  prompt      2      8  16.407606  0.001476  0.484141  0.798637


  W = np.prod(eig) / (eig.sum() / d) ** d


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,t-test,basic vs context,0.74026,0.029301,0.73804,0.022618,0.200524,0.850854,0.850854,False
1,t-test,basic vs cot,0.74026,0.029301,0.68726,0.023348,7.034909,0.002152,0.006455,True
2,t-test,context vs cot,0.73804,0.022618,0.68726,0.023348,4.155399,0.014198,0.028396,True


### 500

In [42]:
# TASD, low-resource setting '500'.
args.lr_setting = 500
args.task = 'tasd'

# Store the pairwise-test table under key '500' and show it as the cell output.
stats_tasd['500'] = computePromptStatistics(args)
stats_tasd['500']

Unnamed: 0,basic,context,cot
1,0.733,0.7354,0.6502
2,0.7087,0.7284,0.703
3,0.6768,0.7221,0.6869
4,0.722,0.7495,0.6749
5,0.6932,0.71,0.7015


Unnamed: 0,W,pval,normal
basic,0.978466,0.926253,True


Unnamed: 0,W,pval,normal
context,0.997633,0.998378,True


Unnamed: 0,W,pval,normal
cot,0.907757,0.454228,True


    split   prompt      f1
0       1    basic  0.7330
1       2    basic  0.7087
2       3    basic  0.6768
3       4    basic  0.7220
4       5    basic  0.6932
5       1  context  0.7354
6       2  context  0.7284
7       3  context  0.7221
8       4  context  0.7495
9       5  context  0.7100
10      1      cot  0.6502
11      2      cot  0.7030
12      3      cot  0.6869
13      4      cot  0.6749
14      5      cot  0.7015
Repeated Measures ANOVA Result:
   Source  ddof1  ddof2         F     p-unc       ng2       eps
0  prompt      2      8  5.319825  0.033932  0.523259  0.599647


  W = np.prod(eig) / (eig.sum() / d) ** d


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,t-test,basic vs context,0.70674,0.02003,0.72908,0.013192,-3.176791,0.033638,0.100915,False
1,t-test,basic vs cot,0.70674,0.02003,0.6833,0.019486,1.29718,0.264329,0.264329,False
2,t-test,context vs cot,0.72908,0.013192,0.6833,0.019486,3.121586,0.035472,0.100915,False


In [43]:
import pandas as pd

def extract_means_and_stds(stats):
    """Collect per-prompt mean and std (scaled by 100) from pairwise-test tables.

    Parameters:
    - stats: ``{task: {lr_setting: DataFrame or None}}``. Each DataFrame needs the
      columns 'comparison' (formatted as ``"<prompt 1> vs <prompt 2>"``),
      'mean 1', 'std 1', 'mean 2' and 'std 2'. ``None`` entries are skipped.

    Returns:
    - ``{task: {lr_setting: {prompt: {'mean': float, 'std': float}}}}``. The keys
      'acd', 'acsa', 'e2e' and 'tasd' are always present; any other task found
      in ``stats`` is added as well.
    """
    results = {
        "acd": {},
        "acsa": {},
        "e2e": {},
        "tasd": {}
    }
    for task, dfs in stats.items():
        # setdefault keeps the four standard tasks and accepts additional ones.
        task_results = results.setdefault(task, {})
        for lr_setting, df in dfs.items():
            if df is None:
                continue

            per_prompt = {}
            task_results[lr_setting] = per_prompt

            for _, row in df.iterrows():
                method1, method2 = row['comparison'].split(' vs ')

                # A prompt appears in several comparisons; its first occurrence
                # is kept and later ones are ignored.
                if method1 not in per_prompt:
                    per_prompt[method1] = {'mean': row['mean 1'] * 100, 'std': row['std 1'] * 100}

                if method2 not in per_prompt:
                    per_prompt[method2] = {'mean': row['mean 2'] * 100, 'std': row['std 2'] * 100}

    return results

def _format_latex_value(prompt_stats, bold):
    """Render a mean with its std as an italic subscript; the mean is bold if requested."""
    mean_text = f"{prompt_stats['mean']:.2f}"
    if bold:
        mean_text = r"\textbf{" + mean_text + "}"
    return mean_text + r"\textsubscript{ \textit{" + f"{prompt_stats['std']:.2f}" + "}}"


def create_full_latex_row(task_results, resource_setting):
    """
    Creates a full LaTeX row for a specific resource setting across all tasks.

    Parameters:
    - task_results: A dictionary mapping each task name (e.g., 'acd', 'acsa', 'e2e', 'tasd')
      to ``{resource_setting: {prompt: {'mean': ..., 'std': ...}}}``. Task 'acd' is rendered
      with the prompts 'basic' and 'context', every other task additionally with 'cot'.
    - resource_setting: The resource setting key (e.g., '0', '1000', '500'); '0' is labelled 'Full'.

    Returns:
    - A LaTeX formatted string representing a full row of the table. The highest mean of each
      task is set in bold, a prompt without results becomes an 'N/A' cell, and a task that has
      no entry for the resource setting contributes no cells.
    """
    rs_text = resource_setting if resource_setting != '0' else 'Full'
    cells = [r"\multicolumn{1}{r|}{" + rs_text + "}"]
    na_cell = r"\multicolumn{1}{l|}{N/A}"

    for task, results_dict in task_results.items():
        if resource_setting not in results_dict:
            continue

        prompt_styles = ['basic', 'context'] if task == 'acd' else ['basic', 'context', 'cot']
        prompts = results_dict[resource_setting]

        # Best prompt among those that actually have results, so a missing
        # prompt yields an N/A cell instead of a KeyError.
        available = [style for style in prompt_styles if style in prompts]
        highest_prompt = max(available, key=lambda style: prompts[style]['mean']) if available else None

        # The first prompt is a plain cell, the others are wrapped in \multicolumn.
        first_style = prompt_styles[0]
        if first_style in prompts:
            cells.append(_format_latex_value(prompts[first_style], first_style == highest_prompt))
        else:
            cells.append(na_cell)

        for i, prompt_style in enumerate(prompt_styles[1:]):
            if prompt_style not in prompts:
                cells.append(na_cell)
                continue

            # A vertical rule closes each task group except the last task ('tasd').
            bar = '|' if ((i == 1 and task != 'tasd') or (i == 0 and task == 'acd')) else ''
            value = _format_latex_value(prompts[prompt_style], prompt_style == highest_prompt)
            cells.append(r"\multicolumn{1}{l" + bar + "}{" + value + "}")

    # Joining the cells avoids having to strip a trailing separator afterwards.
    return " & ".join(cells) + r" \\"
    
# Gather mean/std per task, setting and prompt from the stored test tables.
results_dict = extract_means_and_stds({
    'acd': stats_acd,
    'acsa': stats_acsa,
    'e2e': stats_e2e,
    'tasd': stats_tasd,
})

# One table row per resource setting, in the order full / 1000 / 500.
latex = [create_full_latex_row(results_dict, setting) for setting in ('0', '1000', '500')]

# Print every row followed by a line containing only "&".
for latex_row in latex:
    print(latex_row)
    print("&")


\multicolumn{1}{r|}{Full} & \textbf{87.88}\textsubscript{ \textit{0.94}} & \multicolumn{1}{l|}{87.83\textsubscript{ \textit{0.85}}} & \textbf{83.64}\textsubscript{ \textit{1.54}} & \multicolumn{1}{l}{83.22\textsubscript{ \textit{2.91}}} & \multicolumn{1}{l|}{82.54\textsubscript{ \textit{1.43}}} & \textbf{80.60}\textsubscript{ \textit{1.73}} & \multicolumn{1}{l}{78.86\textsubscript{ \textit{2.25}}} & \multicolumn{1}{l|}{77.23\textsubscript{ \textit{0.86}}} & 75.13\textsubscript{ \textit{2.47}} & \multicolumn{1}{l}{\textbf{75.64}\textsubscript{ \textit{1.50}}} & \multicolumn{1}{l}{72.50\textsubscript{ \textit{2.61}}} \\
&
\multicolumn{1}{r|}{1000} & \textbf{86.65}\textsubscript{ \textit{1.89}} & \multicolumn{1}{l|}{86.45\textsubscript{ \textit{1.91}}} & 79.61\textsubscript{ \textit{5.34}} & \multicolumn{1}{l}{76.24\textsubscript{ \textit{2.15}}} & \multicolumn{1}{l|}{\textbf{80.57}\textsubscript{ \textit{1.97}}} & \textbf{78.43}\textsubscript{ \textit{2.44}} & \multicolumn{1}{l}{78.42\te