In [7]:
import os, sys
import numpy as np
import pandas as pd
pd.set_option('display.max_colwidth', -1)

In [8]:
# take the noted finetuned eval data
df_wwmlm_1e5 = pd.DataFrame({'task_name':['absa-airy', 'emotion-twitter', 'doc-sentiment-prosa', 'keyword-extraction-prosa', 'ner-grit', 'pos-idn', 'term-extraction-airy', 'entailment-ui', 'pos-prosa', 'ner-prosa'],
                              'F1':[0.7085, 0.7344, 0.8892, 0.6221, 0.7385, 0.9478, 0.8911, 0.7711, 0.964415, 0.789788]})
df_wwmlm_1e5['exp_name'] = 'babert_model_bpe_wwmlm_b8_step1_gamma0.8_lr1e-5_early10'
df_wwmlm_1e5['stats'] = 'mean'

df_wwmlm_625e6 = pd.DataFrame({'task_name':['absa-airy', 'emotion-twitter', 'doc-sentiment-prosa', 'keyword-extraction-prosa', 'ner-grit', 'pos-idn', 'term-extraction-airy', 'entailment-ui', 'pos-prosa', 'ner-prosa'],
                              'F1':[0.6781, 0.7233, 0.8602, 0.6447, 0.7483, 0.9602, 0.9039, 0.8219, 0.959804, 0.800779]})
df_wwmlm_625e6['exp_name'] = 'babert_model_bpe_wwmlm_b8_step1_gamma0.8_lr6.25e-6_early10'
df_wwmlm_625e6['stats'] = 'mean'

xlmr_base = pd.DataFrame({'task_name':['absa-airy', 'emotion-twitter', 'doc-sentiment-prosa', 'keyword-extraction-prosa', 'ner-grit', 'ner-prosa', 'pos-prosa', 'pos-idn', 'term-extraction-airy', 'entailment-ui'],
                            'F1':[0.7126, 0.7251, 0.9229, 0.6206, 0.8011, 0.7967, 0.9667, 0.9672, 0.8984, 0.8278]})
xlmr_base['exp_name'] = 'xlmr_base'
xlmr_base['stats'] = 'mean'

babert_bpe_mlm = pd.DataFrame({'task_name':['absa-airy', 'emotion-twitter', 'doc-sentiment-prosa', 'keyword-extraction-prosa', 'ner-grit', 'ner-prosa', 'pos-prosa', 'pos-idn', 'term-extraction-airy', 'entailment-ui'],
                                'F1':[0.7287, 0.7559, 0.8981, 0.6558, 0.7582, 0.8180, 0.9692, 0.9624, 0.9020, 0.8325]})
babert_bpe_mlm['exp_name'] = 'babert_bpe_mlm'
babert_bpe_mlm['stats'] = 'mean'

xlm_mlm_large = pd.DataFrame({'task_name':['absa-airy', 'emotion-twitter', 'doc-sentiment-prosa', 'keyword-extraction-prosa', 'ner-grit', 'ner-prosa', 'pos-prosa', 'pos-idn', 'term-extraction-airy', 'entailment-ui'],
                                'F1':[0.5849, 0.6546, 0.8567, 0.5703, 0.8138, 0.8169, 0.9677, 0.9685, 0, 0.6500]})
xlm_mlm_large['exp_name'] = 'xlm_mlm_large'
xlm_mlm_large['stats'] = 'mean'

xlmr_large = pd.DataFrame({'task_name':['absa-airy', 'emotion-twitter', 'doc-sentiment-prosa', 'keyword-extraction-prosa', 'ner-grit', 'ner-prosa', 'pos-prosa', 'pos-idn', 'term-extraction-airy', 'entailment-ui'],
                                'F1':[0.7500, 0.7677, 0.9335, 0.6092, 0.7939, 0, 0.0794, 0.9689, 0.9087, 0.8728]})
xlmr_large['exp_name'] = 'xlmr_large'
xlmr_large['stats'] = 'mean'

# take the recently finetuned eval data
base_path = './save'
dfs = []
for task_path in os.listdir(base_path):
    for exp_path in os.listdir(f'{base_path}/{task_path}'):
        if os.path.exists(f'{base_path}/{task_path}/{exp_path}/evaluation_result.csv'):
            df = pd.read_csv(f'{base_path}/{task_path}/{exp_path}/evaluation_result.csv')
            df.columns = ['stats'] + df.columns[1:].tolist()
            df['task_name'] = task_path
            df['exp_name'] = exp_path
            dfs.append(df.loc[df['stats'] == 'mean',:])
combined_df = pd.concat(dfs + [df_wwmlm_1e5, df_wwmlm_625e6, xlmr_base, babert_bpe_mlm, xlm_mlm_large, xlmr_large], sort=False)
combined_df.reset_index(drop=True, inplace=True)

for metric in ['ACC', 'F1', 'REC', 'PRE']:
    combined_df[metric] = combined_df[metric].apply(lambda x: x / 100 if x > 1 else x)

# Each Task's Standings vs NoN Large Models

In [9]:
analyzed_df = combined_df[~combined_df['exp_name'].isin(['xlmr_large', 'xlm_mlm_large'])]
maxes = analyzed_df.groupby('task_name')['F1'].idxmax()
combined_df_maxes = analyzed_df.loc[maxes][['exp_name', 'task_name', 'F1']]
undefeated_list = ['doc-sentiment-prosa', 'pos-idn', 'ner-grit']
combined_df_maxes[combined_df_maxes['task_name'].isin(undefeated_list)]

Unnamed: 0,exp_name,task_name,F1
213,xlmr_base,doc-sentiment-prosa,0.9229
215,xlmr_base,ner-grit,0.8011
218,xlmr_base,pos-idn,0.9672


In [10]:
combined_df_maxes[~combined_df_maxes['task_name'].isin(undefeated_list)]

Unnamed: 0,exp_name,task_name,F1
121,babert_model_word_mlm_b8_step1_gamma0.8_lr1e-5_early10,absa-airy,0.731188
222,babert_bpe_mlm,emotion-twitter,0.7559
230,babert_bpe_mlm,entailment-ui,0.8325
224,babert_bpe_mlm,keyword-extraction-prosa,0.6558
163,babert_model_bpe_wwmlm_128_256_ckpt_1000000_b8_step1_gamma0.8_lr1e-5_early5,ner-prosa,0.831198
227,babert_bpe_mlm,pos-prosa,0.9692
21,babert_model_bpe_wwmlm_ckpt_2000000_b8_step1_gamma0.8_lr1e-5_early10,qa-factoid-itb,0.442857
57,babert_model_bpe_wwmlm_128_256_ckpt_1000000_b8_step1_gamma0.8_lr1e-5_early5,term-extraction-airy,0.905271


If versus large model, we haven't yet defeated:

- __absa-airy__: __75__
- __doc-sentiment-prosa__: __93.35 or 92.29__
- __emotion-twitter__: __76.77__ (BASEnya kita udah menang)
- __entailment-ui__: __87.28__ (BASEnya kita udah menang) -> cepet finetuningnya
- __ner-grit__: __81.38__
- __pos-idn__: __96.89 or 96.72__
- __term-extraction-airy__: __90.87__ (BASEnya kita 90.20)

# IndoNLU Eval

In [11]:
indonlu_eval = combined_df[~combined_df['task_name'].isin(['qa-factoid-itb'])]
indonlu_eval_grouped = indonlu_eval.groupby('exp_name').agg(['mean', 'count'])
indonlu_eval_grouped[indonlu_eval_grouped[('F1','count')]==10][['F1']].reset_index().sort_values(by=('F1','mean'), ascending=False).reset_index(drop=True)

Unnamed: 0_level_0,exp_name,F1,F1
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,count
0,babert_bpe_mlm,0.82808,10
1,xlmr_base,0.82391,10
2,babert_model_bpe_wwmlm_128_256_ckpt_1000000_b8_step1_gamma0.8_lr1e-5_early10,0.818826,10
3,babert_model_bpe_wwmlm_128_256_ckpt_1000000_b8_step1_gamma0.8_lr1e-5_early5,0.816919,10
4,babert_model_bpe_wwmlm_128_256_ckpt_1000000_b8_step1_gamma0.8_lr6.25e-6_early10,0.813624,10
5,babert_model_bpe_wwmlm_ckpt_2000000_b8_step1_gamma0.8_lr1e-5_early10,0.81192,10
6,babert_model_bpe_wwmlm_ckpt_2000000_b8_step1_gamma0.8_lr1e-5_early5,0.811793,10
7,babert_model_bpe_wwmlm_b8_step1_gamma0.8_lr6.25e-6_early10,0.810118,10
8,babert_model_bpe_wwmlm_b8_step1_gamma0.8_lr1e-5_early10,0.80569,10
9,babert_model_word_mlm_b8_step1_gamma0.8_lr1e-5_early5,0.796764,10


In [12]:
indonlu_eval = combined_df[~combined_df['task_name'].isin(['qa-factoid-itb', 'term-extraction-airy', 'pos-prosa'])]
indonlu_eval_grouped = indonlu_eval.groupby('exp_name').agg(['mean', 'count'])
indonlu_eval_grouped[indonlu_eval_grouped[('F1','count')]==8][['F1']].reset_index().sort_values(by=('F1','mean'), ascending=False)

Unnamed: 0_level_0,exp_name,F1,F1
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,count
0,babert_bpe_mlm,0.8012,8
21,xlmr_base,0.79675,8
4,babert_model_bpe_wwmlm_128_256_ckpt_1000000_b8_step1_gamma0.8_lr1e-5_early10,0.789828,8
5,babert_model_bpe_wwmlm_128_256_ckpt_1000000_b8_step1_gamma0.8_lr1e-5_early5,0.787445,8
6,babert_model_bpe_wwmlm_128_256_ckpt_1000000_b8_step1_gamma0.8_lr6.25e-6_early10,0.784566,8
9,babert_model_bpe_wwmlm_ckpt_2000000_b8_step1_gamma0.8_lr1e-5_early10,0.781867,8
10,babert_model_bpe_wwmlm_ckpt_2000000_b8_step1_gamma0.8_lr1e-5_early5,0.781867,8
8,babert_model_bpe_wwmlm_b8_step1_gamma0.8_lr6.25e-6_early10,0.779685,8
7,babert_model_bpe_wwmlm_b8_step1_gamma0.8_lr1e-5_early10,0.775173,8
19,babert_model_word_mlm_b8_step1_gamma0.8_lr1e-5_early5,0.766666,8


In [None]:
sorted_model_list = [
    'scratch',
    'word2vec_',
    'fasttext_',
    'fasttext-cc-id',
    'fasttext-cc-id-no-oov',
    'babert-opensubtitle',
    'babert-opensubtitle',
    'bert-base-multilingual-uncased',
    'xlm-roberta-base'
]

In [None]:
def exp_to_index(exp_name):
    for i, model in enumerate(sorted_model_list):
        if model in exp_name:
            return i
combined_df['model_index'] = combined_df['exp_name'].apply(lambda x: exp_to_index(x))

In [None]:
combined_df = combined_df.sort_values(['task_name', 'model_index'])

In [None]:
combined_df.to_csv('aggregated_result.csv', index=False)