In [1]:
import pandas as pd

In [2]:
def compute_weighted_averages(df, split_col='Split'):
    """Return a copy of ``df`` with an extra 'total' row of size-weighted means.

    Each metric column is averaged across rows using the 'Size' column as
    the weight: ``sum(metric * Size) / sum(Size)``.

    Metric columns are chosen by name and dtype rather than by position:
    every numeric column other than ``split_col`` and 'Size' is treated as a
    metric. The column order of ``df`` therefore does not matter, and
    non-numeric extra columns are left empty (NaN) in the total row.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per split. Must contain a numeric 'Size' column.
    split_col : str, default 'Split'
        Name of the column holding the split label; the new row gets the
        label 'total' in this column.

    Returns
    -------
    pandas.DataFrame
        A new frame (``df`` is not modified) with a fresh 0..n index, whose
        last row has ``split_col == 'total'``, the weighted average of each
        metric, and the summed 'Size'.

    Raises
    ------
    KeyError
        If ``df`` has no 'Size' column.
    """
    sizes = df['Size']
    total_size = sizes.sum()

    # Select metrics by name/dtype instead of slicing df.columns[1:-1], which
    # silently assumed the split column is first and 'Size' is last.
    metric_cols = [
        col for col in df.columns
        if col not in (split_col, 'Size')
        and pd.api.types.is_numeric_dtype(df[col])
    ]

    weighted_averages = {
        col: [(df[col] * sizes).sum() / total_size] for col in metric_cols
    }

    # Single-row frame for the aggregate; concat aligns it to df's columns by name.
    total_row = pd.DataFrame(
        {split_col: ['total'], **weighted_averages, 'Size': [total_size]}
    )

    return pd.concat([df, total_row], ignore_index=True)

## Spelling Correction Task

In [3]:
# Template for the contextual spelling-correction result CSVs: {model dir}/{run name}.csv
path = 'eval_results/character_tasks/spelling_correction_contextual/{}/{}.csv'

# ByT5 results, extended with a size-weighted 'total' row
byt5 = compute_weighted_averages(
    pd.read_csv(path.format('T5', 't5_spelling_correction_contextual_seed83'))
)

# MrT5 results, extended the same way
mrt5 = compute_weighted_averages(
    pd.read_csv(path.format('MrT5', 'mrt5_spelling_correction_contextual_seed79'))
)

In [4]:
# Show the ByT5 spelling-correction results, including the appended 'total' row.
byt5

Unnamed: 0,Split,Eval Sequence Accuracy,Eval Percent Deleted Tokens,Eval Runtime,Size
0,test_context_dependent,37.627714,0.0,3.289182,4163
1,test_context_independent,76.452153,0.0,3.203664,4685
2,total,58.185184,0.0,3.2439,8848


In [5]:
# Show the MrT5 spelling-correction results, including the appended 'total' row.
mrt5

Unnamed: 0,Split,Eval Sequence Accuracy,Eval Percent Deleted Tokens,Eval Runtime,Size
0,test_context_dependent,35.049489,78.768826,2.195329,4163
1,test_context_independent,74.755513,78.975504,2.093651,4685
2,total,56.073757,78.878262,2.141491,8848


In [6]:
# Relative change in eval runtime going from ByT5 to MrT5, row by row
# (negative values mean MrT5 evaluated faster).
mrt5['Eval Runtime'].sub(byt5['Eval Runtime']).div(byt5['Eval Runtime'])

0   -0.332561
1   -0.346482
2   -0.339841
Name: Eval Runtime, dtype: float64

## Word Search Task

In [7]:
# Template for the word-search result CSVs: {model dir}/{run name}.csv
path = 'eval_results/character_tasks/word_search/{}/{}.csv'

# ByT5 results, extended with a size-weighted 'total' row
byt5 = compute_weighted_averages(
    pd.read_csv(path.format('T5', 't5_word_search_seed82'))
)

# MrT5 results, extended the same way
mrt5 = compute_weighted_averages(
    pd.read_csv(path.format('MrT5', 'mrt5_word_search_seed839'))
)

In [8]:
# Show the ByT5 word-search results, including the appended 'total' row.
byt5

Unnamed: 0,Split,Eval Sequence Accuracy,Eval Percent Deleted Tokens,Eval Runtime,Size
0,test_oov,78.052326,0.0,5.046972,1367
1,test_paraphrase,83.665835,0.0,4.72987,6404
2,test_overlap,77.580409,0.0,4.762386,5466
3,test_paraphrase_overlap,58.455882,0.0,4.776801,4068
4,total,75.373972,0.0,4.776222,17305


In [9]:
# Show the MrT5 word-search results, including the appended 'total' row.
mrt5

Unnamed: 0,Split,Eval Sequence Accuracy,Eval Percent Deleted Tokens,Eval Runtime,Size
0,test_oov,79.993771,72.515522,2.515341,1367
1,test_paraphrase,82.060474,72.270892,2.168822,6404
2,test_overlap,76.217105,76.14381,2.185787,5466
3,test_paraphrase_overlap,57.598039,73.95191,2.193259,4068
4,total,74.30097,73.908694,2.207298,17305


In [10]:
# Relative change in eval runtime going from ByT5 to MrT5, row by row
# (negative values mean MrT5 evaluated faster).
mrt5['Eval Runtime'].sub(byt5['Eval Runtime']).div(byt5['Eval Runtime'])

0   -0.501614
1   -0.541463
2   -0.541031
3   -0.540852
4   -0.537857
Name: Eval Runtime, dtype: float64

## XNLI

In [11]:
# Equals the per-language 'Size' value shown in the XNLI tables below.
# NOTE(review): not referenced by any cell visible here — confirm it is still needed.
xnli_size = 5010

In [12]:
# Template for the XNLI result CSVs: {model dir}/{run name}.csv
path = 'eval_results/xnli/{}/{}.csv'

# ByT5 results: drop the language-code column, then append a size-weighted
# 'total' row labelled in the 'Language' column.
byt5 = compute_weighted_averages(
    pd.read_csv(path.format('T5', 't5_xnli_seed98')).drop(columns='Language Code'),
    split_col='Language',
)

# MrT5 results, processed the same way
mrt5 = compute_weighted_averages(
    pd.read_csv(path.format('MrT5', 'mrt5_xnli_seed73')).drop(columns='Language Code'),
    split_col='Language',
)

In [13]:
# Show the ByT5 XNLI results table (one row per language).
byt5

Unnamed: 0,Language,Eval Sequence Accuracy,Eval Percent Deleted Tokens,Eval Runtime,Size
0,English,76.47293,0.0,8.902678,5010
1,French,53.901274,0.0,10.701394,5010
2,Spanish,55.991242,0.0,10.107771,5010
3,German,45.680732,0.0,10.416175,5010
4,Greek,52.01035,0.0,19.023875,5010
5,Bulgarian,56.309713,0.0,17.53938,5010
6,Russian,55.354299,0.0,17.965874,5010
7,Turkish,44.86465,0.0,9.710111,5010
8,Arabic,51.890924,0.0,14.064874,5010
9,Vietnamese,48.626592,0.0,12.564745,5010


In [14]:
# Show the MrT5 XNLI results table (one row per language).
mrt5

Unnamed: 0,Language,Eval Sequence Accuracy,Eval Percent Deleted Tokens,Eval Runtime,Size
0,English,78.881369,52.561442,5.415893,5010
1,French,52.249204,47.57979,6.676275,5010
2,Spanish,53.841561,50.670281,6.088465,5010
3,German,46.198248,47.605549,6.573175,5010
4,Greek,52.328822,54.794956,9.758024,5010
5,Bulgarian,53.164809,47.816622,9.684142,5010
6,Russian,55.851911,47.947454,9.918791,5010
7,Turkish,43.312102,46.717308,6.023513,5010
8,Arabic,47.292994,43.781649,8.277949,5010
9,Vietnamese,47.392516,46.50611,7.633973,5010


In [15]:
# Percent reduction in eval runtime of MrT5 relative to ByT5, row by row
# (the -100 factor flips the sign, so positive values mean MrT5 was faster).
mrt5['Eval Runtime'].sub(byt5['Eval Runtime']).div(byt5['Eval Runtime']).mul(-100)

0     39.165571
1     37.613035
2     39.764514
3     36.894540
4     48.706432
5     44.786292
6     44.790934
7     37.966587
8     41.144523
9     39.242912
10    53.637700
11    26.784580
12    44.219706
13    38.106537
14    37.594283
15    42.544025
Name: Eval Runtime, dtype: float64