In [1]:
import pandas as pd

In [2]:
def compute_weighted_averages(df, split_col='Split'):
    # Compute the total size
    total_size = df['Size'].sum()
    
    # Calculate the weighted averages for each metric
    weighted_averages = {}
    for col in df.columns[1:-1]:  # Exclude 'Split' and 'Size' columns
        weighted_averages[col] = (df[col] * df['Size']).sum() / total_size
    
    # Create a new row for the 'total' split with the weighted averages
    total_row = pd.DataFrame({split_col: ['total'], **weighted_averages, 'Size': [total_size]})
    
    # Append the total row to the dataframe using concat
    df_total = pd.concat([df, total_row], ignore_index=True)
    
    return df_total

## Spelling Correction Task

In [3]:
path = 'eval_results/character_tasks/spelling_correction_contextual/{}/{}.csv'

# ByT5 DF
byt5 = pd.read_csv(path.format('T5', 't5_spelling_correction_contextual_seed83'))
byt5 = compute_weighted_averages(byt5)

# MrT5 DF
mrt5 = pd.read_csv(path.format('MrT5', 'mrt5_spelling_correction_contextual_seed79'))
mrt5 = compute_weighted_averages(mrt5)

# BPT5 DF (50%)
bpt5_50 = pd.read_csv(path.format('BPT5', 'bpt5_spelling_correction_contextual_prior50%_seed960'))
bpt5_50 = compute_weighted_averages(bpt5_50)

# BPT5 DF (70%)
bpt5_70 = pd.read_csv(path.format('BPT5', 'bpt5_spelling_correction_contextual_prior70%_seed960'))
bpt5_70 = compute_weighted_averages(bpt5_70)

In [4]:
byt5

Unnamed: 0,Split,Eval Sequence Accuracy,Eval Percent Deleted Tokens,Eval Runtime,Size
0,test_context_dependent,37.627714,0.0,2.38637,4163
1,test_context_independent,76.452153,0.0,2.370906,4685
2,total,58.185184,0.0,2.378182,8848


In [5]:
mrt5

Unnamed: 0,Split,Eval Sequence Accuracy,Eval Percent Deleted Tokens,Eval Runtime,Size
0,test_context_dependent,35.049489,78.768826,1.457101,4163
1,test_context_independent,74.755513,78.975504,1.392135,4685
2,total,56.073757,78.878262,1.422702,8848


In [6]:
bpt5_50

Unnamed: 0,Split,Eval Sequence Accuracy,Eval Percent Deleted Tokens,Eval Runtime,Size
0,test_context_dependent,34.921775,50.24333,1.840285,4163
1,test_context_independent,73.295156,50.200927,1.782003,4685
2,total,55.240411,50.220878,1.809424,8848


In [7]:
bpt5_70

Unnamed: 0,Split,Eval Sequence Accuracy,Eval Percent Deleted Tokens,Eval Runtime,Size
0,test_context_dependent,33.205619,70.193018,2.462491,4163
1,test_context_independent,71.290037,70.252788,1.531087,4685
2,total,53.37125,70.224666,1.969314,8848


In [8]:
(mrt5['Eval Runtime'] - byt5['Eval Runtime']) / byt5['Eval Runtime']

0   -0.389407
1   -0.412826
2   -0.401769
Name: Eval Runtime, dtype: float64

In [9]:
(bpt5['Eval Runtime'] - byt5['Eval Runtime']) / byt5['Eval Runtime']

NameError: name 'bpt5' is not defined

## Word Search Task

In [None]:
path = 'eval_results/character_tasks/word_search/{}/{}.csv'

# ByT5 DF
byt5 = pd.read_csv(path.format('T5', 't5_word_search_seed82'))
byt5 = compute_weighted_averages(byt5)

# MrT5 DF
mrt5 = pd.read_csv(path.format('MrT5', 'mrt5_word_search_seed839'))
mrt5 = compute_weighted_averages(mrt5)

# BPT5 DF (30%)
bpt5_30 = pd.read_csv(path.format('BPT5', 'bpt5_word_search_prior30%_seed45'))
bpt5_30 = compute_weighted_averages(bpt5_30)

# BPT5 DF (30%)
bpt5_70 = pd.read_csv(path.format('BPT5', 'bpt5_word_search_prior70%_seed45'))
bpt5_70 = compute_weighted_averages(bpt5_70)

In [None]:
byt5

Unnamed: 0,Split,Eval Sequence Accuracy,Eval Percent Deleted Tokens,Eval Runtime,Size
0,test_oov,78.052326,0.0,5.046972,1367
1,test_paraphrase,83.665835,0.0,4.72987,6404
2,test_overlap,77.580409,0.0,4.762386,5466
3,test_paraphrase_overlap,58.455882,0.0,4.776801,4068
4,total,75.373972,0.0,4.776222,17305


In [None]:
mrt5

Unnamed: 0,Split,Eval Sequence Accuracy,Eval Percent Deleted Tokens,Eval Runtime,Size
0,test_oov,79.993771,72.515522,2.562451,1367
1,test_paraphrase,82.060474,72.270892,2.190388,6404
2,test_overlap,76.217105,76.14381,2.214819,5466
3,test_paraphrase_overlap,57.598039,73.95191,2.250237,4068
4,total,74.30097,73.908694,2.241565,17305


In [None]:
bpt5_30

Unnamed: 0,Split,Eval Sequence Accuracy,Eval Percent Deleted Tokens,Eval Runtime,Size
0,test_oov,59.696844,20.751872,3.618925,1367
1,test_paraphrase,19.155237,21.50471,3.339697,6404
2,test_overlap,74.093567,22.469333,3.4957,5466
3,test_paraphrase_overlap,14.754902,21.533969,3.640229,4068
4,total,38.676342,21.756806,3.481678,17305


In [None]:
bpt5_70

Unnamed: 0,Split,Eval Sequence Accuracy,Eval Percent Deleted Tokens,Eval Runtime,Size
0,test_oov,38.818522,78.365977,2.064622,1367
1,test_paraphrase,3.678304,78.315442,1.747434,6404
2,test_overlap,54.707602,75.835703,1.934376,5466
3,test_paraphrase_overlap,3.137255,78.19987,1.892742,4068
4,total,22.44524,77.509009,1.865697,17305


In [None]:
(mrt5['Eval Runtime'] - byt5['Eval Runtime']) / byt5['Eval Runtime']

0   -0.492280
1   -0.536903
2   -0.534935
3   -0.528924
4   -0.530682
Name: Eval Runtime, dtype: float64

## XNLI

In [None]:
xnli_size = 5010

In [None]:
path = 'eval_results/xnli/{}/{}.csv'

# ByT5 DF
byt5 = pd.read_csv(path.format('T5', 't5_xnli_seed98'))
byt5 = byt5.drop(columns='Language Code')
byt5 = compute_weighted_averages(byt5, split_col='Language')

# MrT5 DF
mrt5 = pd.read_csv(path.format('MrT5', 'mrt5_xnli_seed73'))
mrt5 = mrt5.drop(columns='Language Code')
mrt5 = compute_weighted_averages(mrt5, split_col='Language')

# BPT5
bpt5 = pd.read_csv(path.format('BPT5', 'bpt5_xnli_prior50%_seed73'))
bpt5 = bpt5.drop(columns='Language Code')
bpt5 = compute_weighted_averages(bpt5, split_col='Language')

In [None]:
byt5

Unnamed: 0,Language,Eval Sequence Accuracy,Eval Percent Deleted Tokens,Eval Runtime,Size
0,English,76.47293,0.0,8.902678,5010
1,French,53.901274,0.0,10.701394,5010
2,Spanish,55.991242,0.0,10.107771,5010
3,German,45.680732,0.0,10.416175,5010
4,Greek,52.01035,0.0,19.023875,5010
5,Bulgarian,56.309713,0.0,17.53938,5010
6,Russian,55.354299,0.0,17.965874,5010
7,Turkish,44.86465,0.0,9.710111,5010
8,Arabic,51.890924,0.0,14.064874,5010
9,Vietnamese,48.626592,0.0,12.564745,5010


In [None]:
mrt5

Unnamed: 0,Language,Eval Sequence Accuracy,Eval Percent Deleted Tokens,Eval Runtime,Size
0,English,78.841561,52.561442,9.1014,5010
1,French,51.890924,47.57979,11.636156,5010
2,Spanish,53.960987,50.670281,10.911014,5010
3,German,45.93949,47.605549,11.356815,5010
4,Greek,52.746815,54.794956,21.697811,5010
5,Bulgarian,53.622611,47.816622,20.095207,5010
6,Russian,55.971338,47.947454,20.719825,5010
7,Turkish,42.914013,46.717308,10.777101,5010
8,Arabic,47.133758,43.781649,16.027241,5010
9,Vietnamese,47.511943,46.50611,14.103683,5010


In [None]:
bpt5

Unnamed: 0,Language,Eval Sequence Accuracy,Eval Percent Deleted Tokens,Eval Runtime,Size
0,English,71.476911,52.022313,4.565122,5010
1,French,47.173567,52.890644,5.474971,5010
2,Spanish,51.174363,53.161599,5.311914,5010
3,German,41.878981,52.797613,5.618026,5010
4,Greek,43.789809,55.633176,9.559969,5010
5,Bulgarian,46.118631,47.812433,9.817064,5010
6,Russian,43.610669,48.568253,10.008446,5010
7,Turkish,40.684713,51.864318,5.444286,5010
8,Arabic,42.257166,56.649906,6.973733,5010
9,Vietnamese,46.078822,51.92577,6.703267,5010


In [None]:
(mrt5['Eval Runtime'] - byt5['Eval Runtime']) / byt5['Eval Runtime'] * -100

0     -2.232160
1     -8.734952
2     -7.946784
3     -9.030566
4    -14.055683
5    -14.571937
6    -15.328788
7    -10.988437
8    -13.952260
9    -12.248069
10   -12.834763
11    -6.762990
12   -12.676581
13    -9.130555
14   -13.773388
15   -11.853187
Name: Eval Runtime, dtype: float64