## Language

In [26]:
import pandas as pd
import os
import sys
import numpy as np
import pandas as pd
from scipy.stats import kruskal, mannwhitneyu
from sklearn.metrics import f1_score
from sklearn.utils import resample
from itertools import combinations

import random
import scikit_posthocs as sp
import scipy.stats as stats
import numpy as np

utils = os.path.abspath('../src/utils/')
sys.path.append(utils)

from preprocessing import loadDataset
from evaluation import extractAspects, convertLabels, createResults
from types import SimpleNamespace
from pingouin import kruskal
import pingouin as pg
import chardet
import codecs

# Notebook-wide display and reproducibility settings.
pd.set_option('display.max_columns', None)
random.seed(42)

N_SAMPLES = 1000

# Shared run configuration; later cells attach `results`, `task` and
# `lr_setting` to this namespace before calling computePromptStatistics.
args = SimpleNamespace(dataset='rest-16')

# One dict per task, filled by the analysis cells below and keyed by the
# low-resource setting as a string ('0', '1000', '500').
stats_acd, stats_acsa, stats_e2e, stats_tasd = {}, {}, {}, {}

def computePromptStatistics(args):
    """Compare prompt styles on per-split micro-F1 for one dataset / task / resource setting.

    For every (model_config, split) group the row with the highest 'f1-micro'
    is kept. The micro-F1 of splits 1-5 is then collected per prompt style and
    the prompt styles are compared with an omnibus test followed by pairwise
    tests with Holm correction.

    Parameters:
    - args: namespace with the attributes
        * results: DataFrame with (at least) the columns selected below,
        * dataset, task: values matched against the 'dataset' / 'task' columns,
        * lr_setting: 0 selects the rows stored as 'full'; any other value is
          matched as its string form.

    Returns:
    - DataFrame with one row per prompt pair: 'test', 'comparison', 'mean 1',
      'std 1', 'mean 2', 'std 2', 'statistic', 'p_value', 'corrected_p_value'
      and 'significant'.

    Raises:
    - ValueError: if fewer than two prompt styles have a result for every
      split, since no pairwise comparison is possible then.
    """
    splits = range(1, 6)
    lr_setting = 'full' if args.lr_setting == 0 else str(args.lr_setting)

    results = args.results
    selected = (
        (results['dataset'] == args.dataset)
        & (results['task'] == args.task)
        & (results['split'] != str(0))
        & (results['lr_setting'] == lr_setting)
    )
    results_sub = results[selected].sort_values(by=['f1-micro'], ascending=False)
    results_sub = results_sub[['dataset', 'task', 'prompt', 'learning_rate', 'lora_r', 'lora_alpha', 'lora_dropout', 'split', 'lr_setting', 'epoch', 'model_config', 'path', 'f1-micro', 'f1-macro', 'accuracy']]

    # Keep only the best-scoring row of every (model_config, split) group.
    idx_max = results_sub.groupby(['model_config', 'split'])['f1-micro'].idxmax()
    best_rows = results_sub.loc[idx_max]

    # No 'cot' prompt is evaluated for the ACD task.
    prompts = ['basic', 'context'] if args.task == 'acd' else ['basic', 'context', 'cot']

    f1_prompts = {}
    for prompt in prompts:
        prompt_rows = best_rows[best_rows['prompt'] == prompt]
        missing = [split for split in splits if not (prompt_rows['split'] == str(split)).any()]
        if missing:
            # A prompt without a result for every split cannot take part in the
            # paired tests; leave it out, but say so instead of failing silently.
            print(f"Skipping prompt '{prompt}': no results for split(s) {missing}")
            continue
        f1_prompts[prompt] = {
            split: prompt_rows.loc[prompt_rows['split'] == str(split), 'f1-micro'].iloc[0]
            for split in splits
        }

    if len(f1_prompts) < 2:
        raise ValueError(
            f"Need complete results for at least two prompts to compare, found {list(f1_prompts)} "
            f"(dataset={args.dataset!r}, task={args.task!r}, lr_setting={lr_setting!r})"
        )

    df_prompts = pd.DataFrame(f1_prompts)

    display(df_prompts)

    normality_results = {col: pg.normality(df_prompts[col]) for col in df_prompts.columns}

    for item in normality_results.values():
        display(item)

    all_normal = all(result['normal'].iloc[0] for result in normality_results.values())

    # Long format (one row per split/prompt pair), as required by rm_anova.
    df_long = df_prompts.melt(var_name='prompt', value_name='f1', ignore_index=False).reset_index().rename(columns={'index': 'split'})
    print(df_long)

    if all_normal:
        # All columns pass the normality test: use repeated-measures ANOVA.
        rm_anova = pg.rm_anova(dv='f1', within='prompt', subject='split', data=df_long)
        print("Repeated Measures ANOVA Result:")
        print(rm_anova)
    else:
        # At least one column is not normally distributed: use the Friedman test.
        friedman = pg.friedman(df_prompts)
        print("Friedman Test Result:")
        print(friedman)

    # Pairwise comparisons
    results = []
    for col1, col2 in combinations(df_prompts.columns, 2):
        if all_normal:
            # Paired t-test when every column is normally distributed.
            test = 't-test'
            test_result = pg.ttest(df_prompts[col1], df_prompts[col2], paired=True, alternative='two-sided')
            statistic = test_result['T'].iloc[0]
        else:
            # Otherwise the Wilcoxon signed-rank test.
            test = 'wilcoxon'
            test_result = pg.wilcoxon(df_prompts[col1], df_prompts[col2], alternative='two-sided')
            statistic = test_result['W-val'].iloc[0]

        results.append({
            'test': test,
            'comparison': f'{col1} vs {col2}',
            # np.std is called with its default ddof, i.e. the population standard deviation.
            'mean 1': np.mean(df_prompts[col1]),
            'std 1': np.std(df_prompts[col1]),
            'mean 2': np.mean(df_prompts[col2]),
            'std 2': np.std(df_prompts[col2]),
            'statistic': statistic,
            'p_value': test_result['p-val'].iloc[0]
        })

    # Collect the pairwise test results in one DataFrame.
    results_df = pd.DataFrame(results)

    # Holm-Bonferroni correction across all pairwise comparisons.
    reject, corrected_p = pg.multicomp(results_df['p_value'], method='holm', alpha=0.05)
    results_df['corrected_p_value'] = corrected_p
    results_df['significant'] = reject

    return results_df

In [27]:
RESULTS_PATH = '../results/ft_llm/'
col_names = ['task', 'dataset', 'prompt', 'learning_rate', 'lora_r', 'lora_alpha', 'lora_dropout', 'split', 'lr_setting', 'epoch', 'model_config', 'path', 'f1-micro', 'f1-macro', 'accuracy']

# Metrics file to read for each task; the task is the first '_'-separated
# field of a run folder name.
METRICS_FILES = {
    'acd': 'metrics_asp.tsv',
    'acsa': 'metrics_asp_pol.tsv',
    'e2e': 'metrics_pol.tsv',
    'e2e-e': 'metrics_pol.tsv',
    'tasd': 'metrics_phrases.tsv',
}

folder_names = [folder for folder in os.listdir(RESULTS_PATH) if os.path.isdir(os.path.join(RESULTS_PATH, folder)) and folder != '.ipynb_checkpoints']
runs = []
skipped_runs = []

for folder_name in folder_names:
    try:
        cond_parameters = folder_name.split('_')
        # Unknown task prefixes raise KeyError here and are reported as skipped.
        filename = METRICS_FILES[cond_parameters[0]]

        df = pd.read_csv(os.path.join(RESULTS_PATH, folder_name, filename), sep = '\t')
        df = df.set_index(df.columns[0])

        # 'path' keeps every field of the folder name; 'model_config' drops the
        # split (position 7) and the epoch (last field) so that runs which only
        # differ in split/epoch share one config string.
        model_config_full = cond_parameters.copy()
        model_config = cond_parameters.copy()
        model_config.pop(7)
        model_config.pop(-1)

        run = cond_parameters + [
            '_'.join(model_config),
            '_'.join(model_config_full),
            df.loc['Micro-AVG', 'f1'],
            df.loc['Macro-AVG', 'f1'],
            df.loc['Micro-AVG', 'accuracy'],
        ]
        if len(run) != len(col_names):
            raise ValueError(f'expected {len(col_names)} fields, got {len(run)}')
    except (OSError, KeyError, IndexError, ValueError) as err:
        # Best effort: an incomplete or malformed run folder must not abort the
        # whole load, but it is recorded so the omission is visible.
        skipped_runs.append((folder_name, err))
        continue
    runs.append(run)

if skipped_runs:
    print(f'Skipped {len(skipped_runs)} of {len(folder_names)} result folders:')
    for skipped_folder, err in skipped_runs:
        print(f'  {skipped_folder}: {type(err).__name__}: {err}')

results_all = pd.DataFrame(runs, columns = col_names)

args.results = results_all

## ACD

### Full Dataset

In [28]:
# ACD on the full training set (lr_setting 0 is translated to 'full' by computePromptStatistics).
args.task, args.lr_setting = 'acd', 0

stats_acd['0'] = computePromptStatistics(args)
stats_acd['0']

Unnamed: 0,basic,context
1,0.8299,0.8497
2,0.8694,0.8407
3,0.8243,0.8493
4,0.8509,0.8606
5,0.8459,0.8603


Unnamed: 0,W,pval,normal
basic,0.956999,0.786948,True


Unnamed: 0,W,pval,normal
context,0.891068,0.362504,True


   split   prompt      f1
0      1    basic  0.8299
1      2    basic  0.8694
2      3    basic  0.8243
3      4    basic  0.8509
4      5    basic  0.8459
5      1  context  0.8497
6      2  context  0.8407
7      3  context  0.8493
8      4  context  0.8606
9      5  context  0.8603
Repeated Measures ANOVA Result:
   Source  ddof1  ddof2         F     p-unc       ng2  eps
0  prompt      1      4  0.710749  0.446657  0.093526  1.0


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,t-test,basic vs context,0.84408,0.01602,0.85212,0.007524,-0.843059,0.446657,0.446657,False


### 1000

In [29]:
# ACD, low-resource setting 1000.
args.task, args.lr_setting = 'acd', 1000

stats_acd['1000'] = computePromptStatistics(args)
stats_acd['1000']

Unnamed: 0,basic,context
1,0.8578,0.6897
2,0.8453,0.8212
3,0.8031,0.8113
4,0.7668,0.8498
5,0.8,0.8152


Unnamed: 0,W,pval,normal
basic,0.937715,0.649833,True


Unnamed: 0,W,pval,normal
context,0.767566,0.042937,False


   split   prompt      f1
0      1    basic  0.8578
1      2    basic  0.8453
2      3    basic  0.8031
3      4    basic  0.7668
4      5    basic  0.8000
5      1  context  0.6897
6      2  context  0.8212
7      3  context  0.8113
8      4  context  0.8498
9      5  context  0.8152
Friedman Test Result:
          Source     W  ddof1    Q     p-unc
Friedman  Within  0.04      1  0.2  0.654721


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,wilcoxon,basic vs context,0.8146,0.032982,0.79744,0.055537,7.0,1.0,1.0,False


### 500

In [30]:
# ACD, low-resource setting 500.
args.task, args.lr_setting = 'acd', 500

stats_acd['500'] = computePromptStatistics(args)
stats_acd['500']

Unnamed: 0,basic,context
1,0.7657,0.7965
2,0.8287,0.784
3,0.8143,0.7809
4,0.8493,0.761
5,0.7995,0.8134


Unnamed: 0,W,pval,normal
basic,0.986424,0.965739,True


Unnamed: 0,W,pval,normal
context,0.986798,0.967326,True


   split   prompt      f1
0      1    basic  0.7657
1      2    basic  0.8287
2      3    basic  0.8143
3      4    basic  0.8493
4      5    basic  0.7995
5      1  context  0.7965
6      2  context  0.7840
7      3  context  0.7809
8      4  context  0.7610
9      5  context  0.8134
Repeated Measures ANOVA Result:
   Source  ddof1  ddof2         F     p-unc       ng2  eps
0  prompt      1      4  1.303458  0.317287  0.212661  1.0


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,t-test,basic vs context,0.8115,0.028193,0.78716,0.017374,1.141691,0.317287,0.317287,False


## ACSA

### Full Dataset

In [31]:
# ACSA on the full training set (lr_setting 0 is translated to 'full' by computePromptStatistics).
args.task, args.lr_setting = 'acsa', 0

stats_acsa['0'] = computePromptStatistics(args)
stats_acsa['0']

Unnamed: 0,basic,context,cot
1,0.8038,0.8407,0.7755
2,0.8266,0.8242,0.8108
3,0.7688,0.7827,0.7809
4,0.8351,0.8237,0.8065
5,0.8015,0.7638,0.7965


Unnamed: 0,W,pval,normal
basic,0.939782,0.664398,True


Unnamed: 0,W,pval,normal
context,0.896665,0.391727,True


Unnamed: 0,W,pval,normal
cot,0.916515,0.507713,True


    split   prompt      f1
0       1    basic  0.8038
1       2    basic  0.8266
2       3    basic  0.7688
3       4    basic  0.8351
4       5    basic  0.8015
5       1  context  0.8407
6       2  context  0.8242
7       3  context  0.7827
8       4  context  0.8237
9       5  context  0.7638
10      1      cot  0.7755
11      2      cot  0.8108
12      3      cot  0.7809
13      4      cot  0.8065
14      5      cot  0.7965
Repeated Measures ANOVA Result:
   Source  ddof1  ddof2         F     p-unc       ng2       eps
0  prompt      2      8  0.736704  0.508552  0.067854  0.666385


  W = np.prod(eig) / (eig.sum() / d) ** d


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,t-test,basic vs context,0.80716,0.023123,0.80702,0.028869,0.011224,0.991582,0.991582,False
1,t-test,basic vs cot,0.80716,0.023123,0.79404,0.013846,1.708638,0.1627,0.4881,False
2,t-test,context vs cot,0.80702,0.028869,0.79404,0.013846,0.824516,0.455974,0.911948,False


### 1000

In [32]:
# ACSA, low-resource setting 1000.
args.task, args.lr_setting = 'acsa', 1000

stats_acsa['1000'] = computePromptStatistics(args)
stats_acsa['1000']

Unnamed: 0,basic,context,cot
1,0.8005,0.7934,0.799
2,0.8162,0.8026,0.8162
3,0.8015,0.799,0.7729
4,0.8009,0.8029,0.8126
5,0.747,0.8111,0.7955


Unnamed: 0,W,pval,normal
basic,0.751384,0.030653,False


Unnamed: 0,W,pval,normal
context,0.96814,0.863187,True


Unnamed: 0,W,pval,normal
cot,0.922351,0.545216,True


    split   prompt      f1
0       1    basic  0.8005
1       2    basic  0.8162
2       3    basic  0.8015
3       4    basic  0.8009
4       5    basic  0.7470
5       1  context  0.7934
6       2  context  0.8026
7       3  context  0.7990
8       4  context  0.8029
9       5  context  0.8111
10      1      cot  0.7990
11      2      cot  0.8162
12      3      cot  0.7729
13      4      cot  0.8126
14      5      cot  0.7955
Friedman Test Result:
          Source         W  ddof1         Q    p-unc
Friedman  Within  0.031579      2  0.315789  0.85394


  temp = _wilcoxon_iv(x, y, zero_method, correction, alternative, method, axis)


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,wilcoxon,basic vs context,0.79322,0.023853,0.8018,0.005775,6.0,0.8125,1.0,False
1,wilcoxon,basic vs cot,0.79322,0.023853,0.79924,0.015323,4.0,0.855132,1.0,False
2,wilcoxon,context vs cot,0.8018,0.005775,0.79924,0.015323,6.0,0.8125,1.0,False


### 500

In [33]:
# ACSA, low-resource setting 500.
args.task, args.lr_setting = 'acsa', 500

stats_acsa['500'] = computePromptStatistics(args)
stats_acsa['500']

Unnamed: 0,basic,context,cot
1,0.7713,0.7619,0.731
2,0.7537,0.7784,0.747
3,0.7572,0.7551,0.7782
4,0.8155,0.759,0.7588
5,0.7837,0.7807,0.7474


Unnamed: 0,W,pval,normal
basic,0.904294,0.434072,True


Unnamed: 0,W,pval,normal
context,0.857079,0.217951,True


Unnamed: 0,W,pval,normal
cot,0.960821,0.813691,True


    split   prompt      f1
0       1    basic  0.7713
1       2    basic  0.7537
2       3    basic  0.7572
3       4    basic  0.8155
4       5    basic  0.7837
5       1  context  0.7619
6       2  context  0.7784
7       3  context  0.7551
8       4  context  0.7590
9       5  context  0.7807
10      1      cot  0.7310
11      2      cot  0.7470
12      3      cot  0.7782
13      4      cot  0.7588
14      5      cot  0.7474
Repeated Measures ANOVA Result:
   Source  ddof1  ddof2         F    p-unc       ng2       eps
0  prompt      2      8  1.760295  0.23252  0.252547  0.947598


  W = np.prod(eig) / (eig.sum() / d) ** d


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,t-test,basic vs context,0.77628,0.022325,0.76702,0.010481,0.702466,0.521116,0.530819,False
1,t-test,basic vs cot,0.77628,0.022325,0.75248,0.015612,1.724285,0.159747,0.479241,False
2,t-test,context vs cot,0.76702,0.010481,0.75248,0.015612,1.293722,0.265409,0.530819,False


## E2E

In [34]:
# E2E on the full training set (lr_setting 0 is translated to 'full' by computePromptStatistics).
args.task, args.lr_setting = 'e2e', 0

stats_e2e['0'] = computePromptStatistics(args)
stats_e2e['0']

Unnamed: 0,basic,context,cot
1,0.7434,0.7594,0.7633
2,0.7986,0.8103,0.7761
3,0.8067,0.785,0.7807
4,0.837,0.8235,0.8103
5,0.774,0.8227,0.7779


Unnamed: 0,W,pval,normal
basic,0.988693,0.974917,True


Unnamed: 0,W,pval,normal
context,0.87736,0.297517,True


Unnamed: 0,W,pval,normal
cot,0.86961,0.26485,True


    split   prompt      f1
0       1    basic  0.7434
1       2    basic  0.7986
2       3    basic  0.8067
3       4    basic  0.8370
4       5    basic  0.7740
5       1  context  0.7594
6       2  context  0.8103
7       3  context  0.7850
8       4  context  0.8235
9       5  context  0.8227
10      1      cot  0.7633
11      2      cot  0.7761
12      3      cot  0.7807
13      4      cot  0.8103
14      5      cot  0.7779
Repeated Measures ANOVA Result:
   Source  ddof1  ddof2         F     p-unc       ng2       eps
0  prompt      2      8  1.583135  0.263468  0.085397  0.852423


  W = np.prod(eig) / (eig.sum() / d) ** d


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,t-test,basic vs context,0.79194,0.03153,0.80018,0.024685,-0.664189,0.542899,0.67372,False
1,t-test,basic vs cot,0.79194,0.03153,0.78166,0.015513,1.0903,0.33686,0.67372,False
2,t-test,context vs cot,0.80018,0.024685,0.78166,0.015513,2.02694,0.112604,0.337813,False


In [35]:
# E2E, low-resource setting 1000.
args.task, args.lr_setting = 'e2e', 1000

stats_e2e['1000'] = computePromptStatistics(args)
stats_e2e['1000']

Unnamed: 0,basic,context,cot
1,0.7372,0.7485,0.7055
2,0.7985,0.7527,0.7305
3,0.7769,0.7442,0.7175
4,0.7531,0.7875,0.7612
5,0.7919,0.7951,0.7126


Unnamed: 0,W,pval,normal
basic,0.933575,0.620945,True


Unnamed: 0,W,pval,normal
context,0.824998,0.127529,True


Unnamed: 0,W,pval,normal
cot,0.885572,0.335326,True


    split   prompt      f1
0       1    basic  0.7372
1       2    basic  0.7985
2       3    basic  0.7769
3       4    basic  0.7531
4       5    basic  0.7919
5       1  context  0.7485
6       2  context  0.7527
7       3  context  0.7442
8       4  context  0.7875
9       5  context  0.7951
10      1      cot  0.7055
11      2      cot  0.7305
12      3      cot  0.7175
13      4      cot  0.7612
14      5      cot  0.7126
Repeated Measures ANOVA Result:
   Source  ddof1  ddof2         F     p-unc       ng2       eps
0  prompt      2      8  6.433357  0.021604  0.476976  0.879006


  W = np.prod(eig) / (eig.sum() / d) ** d


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,t-test,basic vs context,0.77152,0.023192,0.7656,0.021292,0.403132,0.707469,0.707469,False
1,t-test,basic vs cot,0.77152,0.023192,0.72546,0.019646,2.941545,0.042324,0.084648,False
2,t-test,context vs cot,0.7656,0.021292,0.72546,0.019646,3.592695,0.022907,0.06872,False


In [36]:
# E2E, low-resource setting 500.
args.task, args.lr_setting = 'e2e', 500

stats_e2e['500'] = computePromptStatistics(args)
stats_e2e['500']

Unnamed: 0,basic,context,cot
1,0.6486,0.6512,0.6322
2,0.7085,0.7546,0.7139
3,0.7415,0.7419,0.5722
4,0.6145,0.5988,0.6933
5,0.7631,0.7359,0.691


Unnamed: 0,W,pval,normal
basic,0.940534,0.669716,True


Unnamed: 0,W,pval,normal
context,0.846621,0.184073,True


Unnamed: 0,W,pval,normal
cot,0.881799,0.317541,True


    split   prompt      f1
0       1    basic  0.6486
1       2    basic  0.7085
2       3    basic  0.7415
3       4    basic  0.6145
4       5    basic  0.7631
5       1  context  0.6512
6       2  context  0.7546
7       3  context  0.7419
8       4  context  0.5988
9       5  context  0.7359
10      1      cot  0.6322
11      2      cot  0.7139
12      3      cot  0.5722
13      4      cot  0.6933
14      5      cot  0.6910
Repeated Measures ANOVA Result:
   Source  ddof1  ddof2         F     p-unc       ng2      eps
0  prompt      2      8  0.687341  0.530315  0.080405  0.56805


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,t-test,basic vs context,0.69524,0.055884,0.69648,0.06097,-0.099427,0.925583,1.0,False
1,t-test,basic vs cot,0.69524,0.055884,0.66052,0.051876,0.837892,0.449238,1.0,False
2,t-test,context vs cot,0.69648,0.06097,0.66052,0.051876,0.855994,0.44025,1.0,False


## E2E - Explicit only

In [37]:
# Task 'e2e-e' on the full training set; the comparison table is only
# displayed here and is not stored in any of the stats_* dicts.
args.task, args.lr_setting = 'e2e-e', 0

computePromptStatistics(args)

Unnamed: 0,basic,context,cot
1,0.8093,0.7757,0.8147
2,0.7654,0.7583,0.7953
3,0.7813,0.7819,0.7974
4,0.8328,0.7728,0.8099
5,0.8046,0.755,0.7763


Unnamed: 0,W,pval,normal
basic,0.976582,0.915563,True


Unnamed: 0,W,pval,normal
context,0.909621,0.465314,True


Unnamed: 0,W,pval,normal
cot,0.941126,0.673908,True


    split   prompt      f1
0       1    basic  0.8093
1       2    basic  0.7654
2       3    basic  0.7813
3       4    basic  0.8328
4       5    basic  0.8046
5       1  context  0.7757
6       2  context  0.7583
7       3  context  0.7819
8       4  context  0.7728
9       5  context  0.7550
10      1      cot  0.8147
11      2      cot  0.7953
12      3      cot  0.7974
13      4      cot  0.8099
14      5      cot  0.7763
Repeated Measures ANOVA Result:
   Source  ddof1  ddof2        F     p-unc       ng2       eps
0  prompt      2      8  6.26392  0.023067  0.418698  0.634063


  W = np.prod(eig) / (eig.sum() / d) ** d


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,t-test,basic vs context,0.79868,0.023331,0.76874,0.010353,2.549645,0.063329,0.126658,False
1,t-test,basic vs cot,0.79868,0.023331,0.79872,0.01339,-0.003572,0.997321,0.997321,False
2,t-test,context vs cot,0.76874,0.010353,0.79872,0.01339,-6.20857,0.003424,0.010273,True


In [38]:
# NOTE(review): this cell sits under the "E2E - Explicit only" heading but runs
# task 'e2e' and overwrites stats_e2e['1000'], repeating the earlier E2E 1000
# cell with identical output. Possibly 'e2e-e' was intended here - confirm, or
# remove the duplicate cell.
args.lr_setting = 1000
args.task = 'e2e'

stats_e2e['1000'] = computePromptStatistics(args)
stats_e2e['1000']

Unnamed: 0,basic,context,cot
1,0.7372,0.7485,0.7055
2,0.7985,0.7527,0.7305
3,0.7769,0.7442,0.7175
4,0.7531,0.7875,0.7612
5,0.7919,0.7951,0.7126


Unnamed: 0,W,pval,normal
basic,0.933575,0.620945,True


Unnamed: 0,W,pval,normal
context,0.824998,0.127529,True


Unnamed: 0,W,pval,normal
cot,0.885572,0.335326,True


    split   prompt      f1
0       1    basic  0.7372
1       2    basic  0.7985
2       3    basic  0.7769
3       4    basic  0.7531
4       5    basic  0.7919
5       1  context  0.7485
6       2  context  0.7527
7       3  context  0.7442
8       4  context  0.7875
9       5  context  0.7951
10      1      cot  0.7055
11      2      cot  0.7305
12      3      cot  0.7175
13      4      cot  0.7612
14      5      cot  0.7126
Repeated Measures ANOVA Result:
   Source  ddof1  ddof2         F     p-unc       ng2       eps
0  prompt      2      8  6.433357  0.021604  0.476976  0.879006


  W = np.prod(eig) / (eig.sum() / d) ** d


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,t-test,basic vs context,0.77152,0.023192,0.7656,0.021292,0.403132,0.707469,0.707469,False
1,t-test,basic vs cot,0.77152,0.023192,0.72546,0.019646,2.941545,0.042324,0.084648,False
2,t-test,context vs cot,0.7656,0.021292,0.72546,0.019646,3.592695,0.022907,0.06872,False


In [39]:
# NOTE(review): this cell sits under the "E2E - Explicit only" heading but runs
# task 'e2e' and overwrites stats_e2e['500'], repeating the earlier E2E 500
# cell with identical output. Possibly 'e2e-e' was intended here - confirm, or
# remove the duplicate cell.
args.lr_setting = 500
args.task = 'e2e'

stats_e2e['500'] = computePromptStatistics(args)
stats_e2e['500']

Unnamed: 0,basic,context,cot
1,0.6486,0.6512,0.6322
2,0.7085,0.7546,0.7139
3,0.7415,0.7419,0.5722
4,0.6145,0.5988,0.6933
5,0.7631,0.7359,0.691


Unnamed: 0,W,pval,normal
basic,0.940534,0.669716,True


Unnamed: 0,W,pval,normal
context,0.846621,0.184073,True


Unnamed: 0,W,pval,normal
cot,0.881799,0.317541,True


    split   prompt      f1
0       1    basic  0.6486
1       2    basic  0.7085
2       3    basic  0.7415
3       4    basic  0.6145
4       5    basic  0.7631
5       1  context  0.6512
6       2  context  0.7546
7       3  context  0.7419
8       4  context  0.5988
9       5  context  0.7359
10      1      cot  0.6322
11      2      cot  0.7139
12      3      cot  0.5722
13      4      cot  0.6933
14      5      cot  0.6910
Repeated Measures ANOVA Result:
   Source  ddof1  ddof2         F     p-unc       ng2      eps
0  prompt      2      8  0.687341  0.530315  0.080405  0.56805


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,t-test,basic vs context,0.69524,0.055884,0.69648,0.06097,-0.099427,0.925583,1.0,False
1,t-test,basic vs cot,0.69524,0.055884,0.66052,0.051876,0.837892,0.449238,1.0,False
2,t-test,context vs cot,0.69648,0.06097,0.66052,0.051876,0.855994,0.44025,1.0,False


## TASD

### Full Dataset

In [41]:
# TASD on the full training set (lr_setting 0 is translated to 'full' by computePromptStatistics).
args.task, args.lr_setting = 'tasd', 0

stats_tasd['0'] = computePromptStatistics(args)
stats_tasd['0']

Unnamed: 0,basic,context,cot
1,0.7333,0.7649,0.7035
2,0.7608,0.7132,0.6676
3,0.7503,0.7452,0.7001
4,0.7858,0.7696,0.7481
5,0.7523,0.7561,0.7077


Unnamed: 0,W,pval,normal
basic,0.952113,0.75227,True


Unnamed: 0,W,pval,normal
context,0.883094,0.323566,True


Unnamed: 0,W,pval,normal
cot,0.931031,0.603416,True


    split   prompt      f1
0       1    basic  0.7333
1       2    basic  0.7608
2       3    basic  0.7503
3       4    basic  0.7858
4       5    basic  0.7523
5       1  context  0.7649
6       2  context  0.7132
7       3  context  0.7452
8       4  context  0.7696
9       5  context  0.7561
10      1      cot  0.7035
11      2      cot  0.6676
12      3      cot  0.7001
13      4      cot  0.7481
14      5      cot  0.7077
Repeated Measures ANOVA Result:
   Source  ddof1  ddof2          F     p-unc       ng2       eps
0  prompt      2      8  13.974289  0.002453  0.532137  0.692846


  W = np.prod(eig) / (eig.sum() / d) ** d


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,t-test,basic vs context,0.7565,0.017154,0.7498,0.020102,0.518247,0.631633,0.631633,False
1,t-test,basic vs cot,0.7565,0.017154,0.7054,0.025648,4.618132,0.009895,0.01979,True
2,t-test,context vs cot,0.7498,0.020102,0.7054,0.025648,6.88666,0.00233,0.006991,True


### 1000

In [42]:
# TASD, low-resource setting 1000.
args.task, args.lr_setting = 'tasd', 1000

stats_tasd['1000'] = computePromptStatistics(args)
stats_tasd['1000']

Unnamed: 0,basic,context,cot
1,0.702,0.7148,0.678
2,0.722,0.7295,0.7266
3,0.7677,0.7282,0.6761
4,0.7441,0.7821,0.6978
5,0.7217,0.6927,0.7037


Unnamed: 0,W,pval,normal
basic,0.953457,0.761849,True


Unnamed: 0,W,pval,normal
context,0.913809,0.490817,True


Unnamed: 0,W,pval,normal
cot,0.920545,0.533459,True


    split   prompt      f1
0       1    basic  0.7020
1       2    basic  0.7220
2       3    basic  0.7677
3       4    basic  0.7441
4       5    basic  0.7217
5       1  context  0.7148
6       2  context  0.7295
7       3  context  0.7282
8       4  context  0.7821
9       5  context  0.6927
10      1      cot  0.6780
11      2      cot  0.7266
12      3      cot  0.6761
13      4      cot  0.6978
14      5      cot  0.7037
Repeated Measures ANOVA Result:
   Source  ddof1  ddof2         F     p-unc       ng2       eps
0  prompt      2      8  3.056099  0.103272  0.310938  0.957234


  W = np.prod(eig) / (eig.sum() / d) ** d


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,t-test,basic vs context,0.7315,0.022475,0.72946,0.029458,0.143453,0.892869,0.892869,False
1,t-test,basic vs cot,0.7315,0.022475,0.69644,0.018536,2.151813,0.097783,0.293348,False
2,t-test,context vs cot,0.72946,0.029458,0.69644,0.018536,1.930211,0.125785,0.293348,False


### 500

In [43]:
# TASD, low-resource setting 500.
args.task, args.lr_setting = 'tasd', 500

stats_tasd['500'] = computePromptStatistics(args)
stats_tasd['500']

Unnamed: 0,basic,context,cot
1,0.6947,0.6674,0.6002
2,0.7293,0.7377,0.614
3,0.6923,0.6979,0.6154
4,0.7307,0.7565,0.6863
5,0.7323,0.6623,0.6182


Unnamed: 0,W,pval,normal
basic,0.743326,0.025785,False


Unnamed: 0,W,pval,normal
context,0.903737,0.430882,True


Unnamed: 0,W,pval,normal
cot,0.734861,0.021421,False


    split   prompt      f1
0       1    basic  0.6947
1       2    basic  0.7293
2       3    basic  0.6923
3       4    basic  0.7307
4       5    basic  0.7323
5       1  context  0.6674
6       2  context  0.7377
7       3  context  0.6979
8       4  context  0.7565
9       5  context  0.6623
10      1      cot  0.6002
11      2      cot  0.6140
12      3      cot  0.6154
13      4      cot  0.6863
14      5      cot  0.6182
Friedman Test Result:
          Source     W  ddof1    Q     p-unc
Friedman  Within  0.76      2  7.6  0.022371


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,wilcoxon,basic vs context,0.71586,0.018297,0.70436,0.037435,6.0,0.8125,0.8125,False
1,wilcoxon,basic vs cot,0.71586,0.018297,0.62682,0.030383,0.0,0.0625,0.1875,False
2,wilcoxon,context vs cot,0.70436,0.037435,0.62682,0.030383,0.0,0.0625,0.1875,False


In [44]:
import pandas as pd

def extract_means_and_stds(stats):
    """
    Collect the mean and std of every prompt from the pairwise comparison tables.

    Parameters:
    - stats: dict mapping a task name to a dict {resource setting: DataFrame or None}.
      Each DataFrame needs the columns 'comparison' (formatted '<a> vs <b>'),
      'mean 1', 'std 1', 'mean 2' and 'std 2'. Settings whose value is None are skipped.

    Returns:
    - Nested dict results[task][setting][prompt] = {'mean': ..., 'std': ...} with
      both values multiplied by 100. The keys 'acd', 'acsa', 'e2e' and 'tasd' are
      always present; any other task found in `stats` is added as well.
    """
    results = {
        "acd": {},
        "acsa": {},
        "e2e": {},
        "tasd": {}
    }
    for task, dfs in stats.items():
        # setdefault keeps the four standard tasks and admits additional ones.
        task_results = results.setdefault(task, {})
        for lr_setting, df in dfs.items():
            if df is None:
                continue
            per_prompt = task_results[lr_setting] = {}
            for _, row in df.iterrows():
                method1, method2 = row['comparison'].split(' vs ')
                # A prompt appears in several comparisons; its first occurrence
                # determines the stored values.
                if method1 not in per_prompt:
                    per_prompt[method1] = {'mean': row['mean 1'] * 100, 'std': row['std 1'] * 100}
                if method2 not in per_prompt:
                    per_prompt[method2] = {'mean': row['mean 2'] * 100, 'std': row['std 2'] * 100}

    return results

def _latex_mean_std(entry, bold):
    """Format a {'mean', 'std'} entry as the mean (optionally bold) with the std as an italic subscript."""
    mean_text = f"{entry['mean']:.2f}"
    if bold:
        mean_text = r"\textbf{" + mean_text + "}"
    return mean_text + r"\textsubscript{ \textit{" + f"{entry['std']:.2f}" + "}}"


def create_full_latex_row(task_results, resource_setting):
    """
    Creates a full LaTeX row for a specific resource setting across all tasks.

    Parameters:
    - task_results: A dictionary mapping a task name ('acd', 'acsa', 'e2e', 'tasd') to
      {resource setting: {prompt style: {'mean': ..., 'std': ...}}}.
    - resource_setting: The resource setting key ('0', '1000', '500'); '0' is printed as 'Full'.

    Returns:
    - A LaTeX formatted string representing a full row of the table. The prompt style
      with the highest mean of each task is set in bold; a prompt style without an
      entry is rendered as 'N/A'. Tasks that have no entry for `resource_setting`
      contribute no cells at all.
    """

    rs_text = resource_setting if resource_setting != '0' else 'Full'
    latex_row = r"\multicolumn{1}{r|}{" + rs_text + "} & "

    for task, results_dict in task_results.items():
        if resource_setting not in results_dict:
            continue

        prompt_styles = ['basic', 'context'] if task == 'acd' else ['basic', 'context', 'cot']
        prompts = results_dict[resource_setting]

        # Rank only the prompt styles that actually have results, so a missing
        # style ends up as an N/A cell instead of raising a KeyError.
        available = [style for style in prompt_styles if style in prompts]
        highest_prompt = max(available, key=lambda style: prompts[style]['mean'], default=None)

        for position, prompt_style in enumerate(prompt_styles):
            if prompt_style not in prompts:
                latex_row += r"\multicolumn{1}{l|}{N/A} & "
                continue

            value = _latex_mean_std(prompts[prompt_style], prompt_style == highest_prompt)
            if position == 0:
                # The first prompt style of a task is emitted as a plain cell.
                latex_row += value + " & "
            else:
                # A vertical rule closes the task's column group: after the third
                # style for every task except 'tasd', after the second for 'acd'.
                closes_group = (position == 2 and task != 'tasd') or (position == 1 and task == 'acd')
                latex_row += (
                    r"\multicolumn{1}{l" + ('|' if closes_group else '') +
                    "}{" + value + "} & "
                )

    # Replace the trailing cell separator with the row terminator. The suffix is
    # removed explicitly because str.rstrip would strip a set of characters.
    separator = " & "
    if latex_row.endswith(separator):
        latex_row = latex_row[:-len(separator)]

    return latex_row + r" \\"
    
# Gather the per-task comparison tables and emit one LaTeX table row per
# resource setting, each followed by a line containing only "&".
results_dict = extract_means_and_stds({
    'acd': stats_acd,
    'acsa': stats_acsa,
    'e2e': stats_e2e,
    'tasd': stats_tasd,
})

latex = [create_full_latex_row(results_dict, setting) for setting in ('0', '1000', '500')]

for latex_line in latex:
    print(latex_line)
    print("&")


\multicolumn{1}{r|}{Full} & 84.41\textsubscript{ \textit{1.60}} & \multicolumn{1}{l|}{\textbf{85.21}\textsubscript{ \textit{0.75}}} & \textbf{80.72}\textsubscript{ \textit{2.31}} & \multicolumn{1}{l}{80.70\textsubscript{ \textit{2.89}}} & \multicolumn{1}{l|}{79.40\textsubscript{ \textit{1.38}}} & 79.19\textsubscript{ \textit{3.15}} & \multicolumn{1}{l}{\textbf{80.02}\textsubscript{ \textit{2.47}}} & \multicolumn{1}{l|}{78.17\textsubscript{ \textit{1.55}}} & \textbf{75.65}\textsubscript{ \textit{1.72}} & \multicolumn{1}{l}{74.98\textsubscript{ \textit{2.01}}} & \multicolumn{1}{l}{70.54\textsubscript{ \textit{2.56}}} \\
&
\multicolumn{1}{r|}{1000} & \textbf{81.46}\textsubscript{ \textit{3.30}} & \multicolumn{1}{l|}{79.74\textsubscript{ \textit{5.55}}} & 79.32\textsubscript{ \textit{2.39}} & \multicolumn{1}{l}{\textbf{80.18}\textsubscript{ \textit{0.58}}} & \multicolumn{1}{l|}{79.92\textsubscript{ \textit{1.53}}} & \textbf{77.15}\textsubscript{ \textit{2.32}} & \multicolumn{1}{l}{76.56\te