In [1]:
import pandas as pd

In [2]:
def compute_weighted_averages(df, split_col='Split', size_col='Size'):
    """Append a 'total' row holding the size-weighted mean of each metric column.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per split. Must contain ``size_col``. Every other numeric
        column (except ``split_col``) is treated as a metric to average.
    split_col : str, default 'Split'
        Name of the label column; the appended row carries the value
        ``'total'`` in it.
    size_col : str, default 'Size'
        Name of the column used as the per-row weight.

    Returns
    -------
    pandas.DataFrame
        A new frame made of the rows of ``df`` followed by one 'total' row,
        with the index renumbered from 0. ``df`` itself is not modified.
    """
    # Sum of the weights; also stored in the size column of the total row.
    total_size = df[size_col].sum()

    # Select metrics by name and dtype rather than by position. A positional
    # slice (columns[1:-1]) silently drops a metric or tries to average the
    # label column whenever the label is not first or the size is not last.
    metric_cols = [
        col for col in df.columns
        if col not in (split_col, size_col)
        and pd.api.types.is_numeric_dtype(df[col])
    ]

    # Weighted mean of each metric: sum(metric * size) / sum(size).
    weighted_averages = {
        col: (df[col] * df[size_col]).sum() / total_size
        for col in metric_cols
    }

    # Single-row frame for the 'total' split; concat aligns it by column name.
    total_row = pd.DataFrame({split_col: ['total'], **weighted_averages, size_col: [total_size]})

    return pd.concat([df, total_row], ignore_index=True)

def get_percent_decrease(byt5_df, other_df):
    """Percent reduction in 'Eval Runtime' of ``other_df`` relative to ``byt5_df``.

    Computed element-wise as ``(byt5 - other) / byt5 * 100`` and rounded to
    two decimals; a positive value means ``other_df`` has the lower runtime.
    """
    baseline_runtime = byt5_df['Eval Runtime']
    runtime_saved = baseline_runtime - other_df['Eval Runtime']
    percent_saved = runtime_saved / baseline_runtime * 100
    return round(percent_saved, 2)

## Spelling Correction Task

In [None]:
# Template for the spelling-correction result files: {model directory}/{run name}.csv
path = 'eval_results/character_tasks/spelling_correction_contextual/{}/{}.csv'

# ByT5: read the per-split results and append the size-weighted 'total' row
byt5 = compute_weighted_averages(
    pd.read_csv(path.format('T5', 't5_spelling_correction_contextual_seed45'))
)

# MrT5
mrt5 = compute_weighted_averages(
    pd.read_csv(path.format('MrT5', 'mrt5_spelling_correction_contextual_50%_seed392'))
)

# BPT5
bpt5 = compute_weighted_averages(
    pd.read_csv(path.format('BPT5', 'bpt5_spelling_correction_contextual_50%_seed941'))
)

# CanineT5 (named `cpt5` in this section of the notebook)
cpt5 = compute_weighted_averages(
    pd.read_csv(path.format('CanineT5', 'caninet5_spelling_correction_contextual_50%_seed473'))
)


In [4]:
# ByT5 results on the spelling-correction task, rounded to 2 decimals for display.
byt5.round(2)

Unnamed: 0,Split,Eval Sequence Accuracy,Eval Percent Deleted Tokens,Eval Runtime,Size
0,test_context_dependent,49.41,0.0,3.86,4163
1,test_context_independent,82.11,0.0,3.9,4685
2,total,66.73,0.0,3.88,8848


In [5]:
# MrT5 results on the spelling-correction task, rounded to 2 decimals for display.
mrt5.round(2)

Unnamed: 0,Split,Eval Sequence Accuracy,Eval Percent Deleted Tokens,Eval Runtime,Size
0,test_context_dependent,44.2,50.85,2.98,4163
1,test_context_independent,80.04,50.26,3.04,4685
2,total,63.18,50.54,3.01,8848


In [6]:
# BPT5 results on the spelling-correction task, rounded to 2 decimals for display.
bpt5.round(2)

Unnamed: 0,Split,Eval Sequence Accuracy,Eval Percent Deleted Tokens,Eval Runtime,Size
0,test_context_dependent,36.51,49.34,3.2,4163
1,test_context_independent,71.08,49.25,3.28,4685
2,total,54.81,49.3,3.24,8848


In [7]:
# CanineT5 (`cpt5`) results on the spelling-correction task, rounded to 2 decimals for display.
cpt5.round(2)

Unnamed: 0,Split,Eval Sequence Accuracy,Eval Percent Deleted Tokens,Eval Runtime,Size
0,test_context_dependent,37.4,50.0,2.94,4163
1,test_context_independent,74.96,50.0,3.02,4685
2,total,57.29,50.0,2.98,8848


In [8]:
# Row-wise percent reduction in 'Eval Runtime' of MrT5 relative to ByT5 (positive = lower runtime than ByT5).
get_percent_decrease(byt5, mrt5)

0    22.59
1    22.05
2    22.30
Name: Eval Runtime, dtype: float64

In [9]:
# Row-wise percent reduction in 'Eval Runtime' of BPT5 relative to ByT5 (positive = lower runtime than ByT5).
get_percent_decrease(byt5, bpt5)

0    16.99
1    16.04
2    16.48
Name: Eval Runtime, dtype: float64

In [10]:
# Row-wise percent reduction in 'Eval Runtime' of CanineT5 (`cpt5`) relative to ByT5 (positive = lower runtime than ByT5).
get_percent_decrease(byt5, cpt5)

0    23.72
1    22.62
2    23.13
Name: Eval Runtime, dtype: float64

## Word Search Task

In [None]:
# Template for the word-search result files: {model directory}/{run name}.csv
path = 'eval_results/character_tasks/word_search/{}/{}.csv'

# ByT5: read the per-split results and append the size-weighted 'total' row
byt5 = compute_weighted_averages(
    pd.read_csv(path.format('T5', 't5_word_search_seed462')),
    split_col='Split',
)

# MrT5
mrt5 = compute_weighted_averages(
    pd.read_csv(path.format('MrT5', 'mrt5_word_search_70%_seed54')),
    split_col='Split',
)

# BPT5
bpt5 = compute_weighted_averages(
    pd.read_csv(path.format('BPT5', 'bpt5_word_search_70%_seed13')),
    split_col='Split',
)

# CanineT5
caninet5 = compute_weighted_averages(
    pd.read_csv(path.format('CanineT5', 'caninet5_word_search_75%_seed857')),
    split_col='Split',
)

In [12]:
# ByT5 results on the word-search task, rounded to 2 decimals for display.
byt5.round(2)

Unnamed: 0,Split,Eval Sequence Accuracy,Eval Percent Deleted Tokens,Eval Runtime,Size
0,test_oov,78.49,0.0,6.56,1367
1,test_paraphrase,85.92,0.0,6.76,6404
2,test_overlap,77.31,0.0,6.77,5466
3,test_paraphrase_overlap,60.37,0.0,6.76,4068
4,total,76.61,0.0,6.75,17305


In [13]:
# MrT5 results on the word-search task, rounded to 2 decimals for display.
mrt5.round(2)

Unnamed: 0,Split,Eval Sequence Accuracy,Eval Percent Deleted Tokens,Eval Runtime,Size
0,test_oov,73.96,71.77,2.63,1367
1,test_paraphrase,81.51,71.88,2.75,6404
2,test_overlap,72.72,77.69,2.76,5466
3,test_paraphrase_overlap,55.48,75.4,2.77,4068
4,total,72.02,74.53,2.75,17305


In [14]:
# BPT5 results on the word-search task, rounded to 2 decimals for display.
bpt5.round(2)

Unnamed: 0,Split,Eval Sequence Accuracy,Eval Percent Deleted Tokens,Eval Runtime,Size
0,test_oov,78.42,69.95,3.62,1367
1,test_paraphrase,81.84,69.72,3.75,6404
2,test_overlap,77.41,69.8,3.8,5466
3,test_paraphrase_overlap,57.01,69.78,3.8,4068
4,total,74.33,69.78,3.77,17305


In [15]:
# CanineT5 results on the word-search task, rounded to 2 decimals for display.
caninet5.round(2)

Unnamed: 0,Split,Eval Sequence Accuracy,Eval Percent Deleted Tokens,Eval Runtime,Size
0,test_oov,71.25,75.0,3.02,1367
1,test_paraphrase,72.3,75.0,3.14,6404
2,test_overlap,74.86,75.0,3.18,5466
3,test_paraphrase_overlap,51.89,75.0,3.18,4068
4,total,68.23,75.0,3.15,17305


In [16]:
# Row-wise percent reduction in 'Eval Runtime' of MrT5 relative to ByT5 (positive = lower runtime than ByT5).
get_percent_decrease(byt5, mrt5)

0    59.90
1    59.39
2    59.19
3    59.06
4    59.29
Name: Eval Runtime, dtype: float64

In [17]:
# Row-wise percent reduction in 'Eval Runtime' of BPT5 relative to ByT5 (positive = lower runtime than ByT5).
get_percent_decrease(byt5, bpt5)

0    44.79
1    44.51
2    43.92
3    43.82
4    44.18
Name: Eval Runtime, dtype: float64

In [18]:
# Row-wise percent reduction in 'Eval Runtime' of CanineT5 relative to ByT5 (positive = lower runtime than ByT5).
get_percent_decrease(byt5, caninet5)

0    53.98
1    53.52
2    53.06
3    53.00
4    53.29
Name: Eval Runtime, dtype: float64

## XNLI

In [19]:
# Template for the XNLI result files: {model directory}/{run name}.csv
path = 'eval_results/xnli/{}/{}.csv'

# ByT5: read the per-language results, drop the 'Language Code' column,
# then append the size-weighted 'total' row
byt5 = compute_weighted_averages(
    pd.read_csv(path.format('T5', 't5_xnli_seed57')).drop(columns='Language Code'),
    split_col='Language',
)

# MrT5
mrt5 = compute_weighted_averages(
    pd.read_csv(path.format('MrT5', 'mrt5_xnli_50%_seed14')).drop(columns='Language Code'),
    split_col='Language',
)

# BPT5
bpt5 = compute_weighted_averages(
    pd.read_csv(path.format('BPT5', 'bpt5_xnli_50%_seed88')).drop(columns='Language Code'),
    split_col='Language',
)

# CanineT5
caninet5 = compute_weighted_averages(
    pd.read_csv(path.format('CanineT5', 'caninet5_xnli_50%_seed11')).drop(columns='Language Code'),
    split_col='Language',
)

In [20]:
# ByT5 results on XNLI, rounded to 2 decimals for display.
byt5.round(2)

Unnamed: 0,Language,Eval Sequence Accuracy,Eval Percent Deleted Tokens,Eval Runtime,Size
0,English,80.3,0.0,9.01,5010
1,French,73.93,0.0,10.76,5010
2,Spanish,74.85,0.0,10.15,5010
3,German,69.58,0.0,10.44,5010
4,Greek,64.73,0.0,18.81,5010
5,Bulgarian,67.47,0.0,17.37,5010
6,Russian,64.21,0.0,17.8,5010
7,Turkish,61.68,0.0,9.77,5010
8,Arabic,62.97,0.0,14.0,5010
9,Vietnamese,67.37,0.0,12.54,5010


In [21]:
# MrT5 results on XNLI, rounded to 2 decimals for display.
mrt5.round(2)

Unnamed: 0,Language,Eval Sequence Accuracy,Eval Percent Deleted Tokens,Eval Runtime,Size
0,English,80.2,50.22,5.62,5010
1,French,73.03,50.21,6.67,5010
2,Spanish,74.25,51.88,6.15,5010
3,German,69.7,47.04,6.77,5010
4,Greek,65.03,63.65,9.32,5010
5,Bulgarian,68.14,63.57,8.63,5010
6,Russian,66.25,64.32,8.72,5010
7,Turkish,63.11,48.48,6.22,5010
8,Arabic,63.61,57.98,7.82,5010
9,Vietnamese,66.57,51.35,7.64,5010


In [22]:
# BPT5 results on XNLI, rounded to 2 decimals for display.
bpt5.round(2)

Unnamed: 0,Language,Eval Sequence Accuracy,Eval Percent Deleted Tokens,Eval Runtime,Size
0,English,78.9,43.38,7.33,5010
1,French,69.02,48.5,8.21,5010
2,Spanish,69.78,48.11,7.79,5010
3,German,63.61,47.97,8.01,5010
4,Greek,57.33,65.51,10.75,5010
5,Bulgarian,61.88,60.49,10.78,5010
6,Russian,60.48,61.27,10.85,5010
7,Turkish,58.88,43.46,8.13,5010
8,Arabic,57.29,54.38,9.57,5010
9,Vietnamese,58.64,53.83,8.75,5010


In [23]:
# CanineT5 results on XNLI, rounded to 2 decimals for display.
caninet5.round(2)

Unnamed: 0,Language,Eval Sequence Accuracy,Eval Percent Deleted Tokens,Eval Runtime,Size
0,English,73.53,50.1,5.9,5010
1,French,55.19,50.08,7.02,5010
2,Spanish,59.78,50.09,6.63,5010
3,German,50.66,50.08,6.82,5010
4,Greek,46.53,50.05,11.79,5010
5,Bulgarian,52.79,50.05,10.96,5010
6,Russian,51.46,50.06,11.2,5010
7,Turkish,47.31,50.08,6.4,5010
8,Arabic,49.34,50.06,8.96,5010
9,Vietnamese,47.92,50.07,8.1,5010


In [24]:
# Row-wise percent reduction in 'Eval Runtime' of MrT5 relative to ByT5 (positive = lower runtime than ByT5).
get_percent_decrease(byt5, mrt5)

0     37.64
1     38.04
2     39.44
3     35.12
4     50.48
5     50.30
6     50.99
7     36.37
8     44.15
9     39.10
10    52.45
11    38.89
12    52.43
13    35.23
14    43.01
15    45.13
Name: Eval Runtime, dtype: float64

In [25]:
# Row-wise percent reduction in 'Eval Runtime' of BPT5 relative to ByT5 (positive = lower runtime than ByT5).
get_percent_decrease(byt5, bpt5)

0     18.63
1     23.69
2     23.29
3     23.28
4     42.85
5     37.96
6     39.03
7     16.85
8     31.63
9     30.20
10    50.57
11    13.67
12    40.14
13    18.16
14    32.39
15    33.22
Name: Eval Runtime, dtype: float64

In [26]:
# Row-wise percent reduction in 'Eval Runtime' of CanineT5 relative to ByT5 (positive = lower runtime than ByT5).
get_percent_decrease(byt5, caninet5)

0     34.50
1     34.75
2     34.63
3     34.66
4     37.32
5     36.89
6     37.07
7     34.49
8     36.01
9     35.41
10    38.23
11    34.44
12    38.17
13    34.23
14    36.44
15    36.32
Name: Eval Runtime, dtype: float64

## TyDiQA-GoldP

In [27]:
# Template for the TyDiQA result files: {model directory}/{run name}.csv
path = 'eval_results/tydiqa/{}/{}.csv'

# ByT5: read the per-language results, drop the 'Language Code' column,
# then append the size-weighted 'total' row
byt5 = compute_weighted_averages(
    pd.read_csv(path.format('T5', 't5_tydiqa_seed25')).drop(columns='Language Code'),
    split_col='Language',
)

# MrT5
mrt5 = compute_weighted_averages(
    pd.read_csv(path.format('MrT5', 'mrt5_tydiqa_50%_seed73')).drop(columns='Language Code'),
    split_col='Language',
)

# BPT5
bpt5 = compute_weighted_averages(
    pd.read_csv(path.format('BPT5', 'bpt5_tydiqa_50%_seed960')).drop(columns='Language Code'),
    split_col='Language',
)

# CanineT5
caninet5 = compute_weighted_averages(
    pd.read_csv(path.format('CanineT5', 'caninet5_tydiqa_50%_seed25')).drop(columns='Language Code'),
    split_col='Language',
)

In [28]:
# ByT5 results on TyDiQA-GoldP, rounded to 2 decimals for display.
byt5.round(2) # ByT5

Unnamed: 0,Language,Eval Exact Match,Eval F1,Eval Percent Deleted Tokens,Eval Runtime,Size
0,Russian,65.02,75.64,0.0,45.87,812
1,Arabic,69.27,81.84,0.0,39.13,921
2,Bengali,55.75,67.22,0.0,77.87,113
3,Telugu,77.88,85.41,0.0,57.22,669
4,Finnish,69.18,78.92,0.0,26.31,782
5,Swahili,77.96,85.28,0.0,19.0,499
6,Korean,58.7,66.26,0.0,31.37,276
7,Indonesian,75.58,84.23,0.0,28.76,565
8,English,63.64,73.5,0.0,31.05,440
9,total,69.9,79.58,0.0,37.22,5077


In [29]:
# MrT5 results on TyDiQA-GoldP, rounded to 2 decimals for display.
mrt5.round(2) # MrT5

Unnamed: 0,Language,Eval Exact Match,Eval F1,Eval Percent Deleted Tokens,Eval Runtime,Size
0,Russian,60.59,71.41,54.7,28.44,812
1,Arabic,68.62,80.98,56.09,24.9,921
2,Bengali,56.64,66.08,66.68,38.88,113
3,Telugu,77.43,84.87,65.33,29.96,669
4,Finnish,67.9,76.97,31.19,22.28,782
5,Swahili,76.55,82.77,40.68,17.04,499
6,Korean,57.61,65.66,32.77,25.28,276
7,Indonesian,73.1,83.19,39.27,22.16,565
8,English,62.5,70.93,40.48,23.07,440
9,total,68.27,77.73,47.48,24.83,5077


In [None]:
# BPT5 results on TyDiQA-GoldP, rounded to 2 decimals for display.
bpt5.round(2) # BPT5

Unnamed: 0,Language,Eval Exact Match,Eval F1,Eval Percent Deleted Tokens,Eval Runtime,Size
0,Russian,31.9,43.27,30.6,36.87,812
1,Arabic,40.17,59.72,25.37,33.29,921
2,Bengali,23.01,31.41,43.21,57.42,113
3,Telugu,29.9,42.4,36.34,44.84,669
4,Finnish,45.65,58.88,13.88,25.97,782
5,Swahili,63.33,71.95,5.39,20.84,499
6,Korean,31.88,40.07,17.0,29.42,276
7,Indonesian,50.09,65.11,16.61,27.04,565
8,English,36.82,51.2,21.48,28.45,440
9,total,40.59,54.04,22.55,32.24,5077


In [None]:
# CanineT5 results on TyDiQA-GoldP, rounded to 2 decimals for display.
caninet5.round(2) # CanineT5

Unnamed: 0,Language,Eval Exact Match,Eval F1,Eval Percent Deleted Tokens,Eval Runtime,Size
0,Russian,49.63,60.66,50.03,29.6,812
1,Arabic,60.69,75.44,50.04,26.12,921
2,Bengali,38.94,51.24,50.01,47.51,113
3,Telugu,59.04,68.78,50.03,35.7,669
4,Finnish,54.22,65.76,50.05,19.71,782
5,Swahili,60.92,67.89,50.09,16.92,499
6,Korean,38.04,44.67,50.04,22.3,276
7,Indonesian,61.06,72.1,50.06,20.81,565
8,English,48.41,57.79,50.04,21.48,440
9,total,54.99,65.85,50.05,25.32,5077


In [32]:
# Row-wise percent reduction in 'Eval Runtime' of MrT5 relative to ByT5 (positive = lower runtime than ByT5).
get_percent_decrease(byt5, mrt5)

0    38.00
1    36.36
2    50.06
3    47.65
4    15.31
5    10.33
6    19.43
7    22.93
8    25.69
9    33.31
Name: Eval Runtime, dtype: float64

In [33]:
# Row-wise percent reduction in 'Eval Runtime' of BPT5 relative to ByT5 (negative = higher runtime than ByT5).
get_percent_decrease(byt5, bpt5)

0    19.62
1    14.94
2    26.26
3    21.63
4     1.30
5    -9.68
6     6.22
7     5.98
8     8.36
9    13.38
Name: Eval Runtime, dtype: float64

In [34]:
# Row-wise percent reduction in 'Eval Runtime' of CanineT5 relative to ByT5 (positive = lower runtime than ByT5).
get_percent_decrease(byt5, caninet5)

0    35.46
1    33.27
2    38.99
3    37.60
4    25.09
5    10.93
6    28.91
7    27.64
8    30.80
9    31.97
Name: Eval Runtime, dtype: float64