In [1]:
import pandas as pd
from IPython.display import display, HTML

pd.set_option('display.float_format', '{:.2f}'.format)

In [2]:
def display_stats(dataframe, dataframe2=None, order_by="translator_name", heading=None, compare_to_column=None):
    df = dataframe.copy()
    df.rename(columns={"cosine_similarity_vs_target": "similarity_to_old_translation"}, inplace=True)
    df['quality_vs_tb'] = df['cosine_similarity_vs_source'] - df['cosine_similarity_original_translation']
    
    if dataframe2 is not None:
        df2 = dataframe2.copy()
        df2.rename(columns={"cosine_similarity_vs_target": "similarity_to_old_translation"}, inplace=True)
        df2['quality_vs_tb'] = df2['cosine_similarity_vs_source'] - df2['cosine_similarity_original_translation']
    
    def make_quantile(q):
        return lambda x: x.quantile(q)
    
    percentiles = [0.01, 0.1, 0.5, 0.9, 0.99]
    agg_funcs = ['mean', 'min', 'max'] + [make_quantile(q) for q in percentiles]
    agg_names = ['mean', 'min', 'max'] + [f"{int(q*100)}%" for q in percentiles]
    
    quality_stats = df.groupby('translator_name')['quality_vs_tb'].agg(agg_funcs).reset_index().set_axis(
        ['translator_name'] + agg_names, axis=1
    )
    
    similarity_stats = df.groupby('translator_name')['similarity_to_old_translation'].agg(agg_funcs).reset_index().set_axis(
        ['translator_name'] + agg_names, axis=1
    )
    
    if dataframe2 is not None and compare_to_column:
        raise ValueError('try again. please only choose comparison vs column or dataframe')
        
    if heading:
        display(HTML(f"<h3>{heading}</h3>"))
        
    if dataframe2 is not None:
        quality_stats2 = df2.groupby('translator_name')['quality_vs_tb'].agg(agg_funcs).reset_index().set_axis(
            ['translator_name'] + agg_names, axis=1
        )
        similarity_stats2 = df2.groupby('translator_name')['similarity_to_old_translation'].agg(agg_funcs).reset_index().set_axis(
            ['translator_name'] + agg_names, axis=1
        )
        
        quality_diff = quality_stats.merge(quality_stats2, on='translator_name', suffixes=('', '_2'))
        for col in agg_names:
            quality_diff[col] = quality_diff[col] - quality_diff[col + '_2']
        quality_diff = quality_diff[['translator_name'] + agg_names]
        
        similarity_diff = similarity_stats.merge(similarity_stats2, on='translator_name', suffixes=('', '_2'))
        for col in agg_names:
            similarity_diff[col] = similarity_diff[col] - similarity_diff[col + '_2']
        similarity_diff = similarity_diff[['translator_name'] + agg_names]
        
        display(HTML("<h4>Difference in Quality Versus Translation Bureau</h4>"))
        display(quality_diff.sort_values(order_by, ascending=False))
        
        display(HTML("<h4>Difference in Similarity to Translation Bureau Translation</h4>"))
        display(similarity_diff.sort_values(order_by, ascending=False))
        
    if compare_to_column:
        if compare_to_column not in df['translator_name'].values:
            raise ValueError(f"compare_to_column '{compare_to_column}' not found in translator_name column")
    
        comparison_df = df[df['translator_name'] == compare_to_column][['source', 'quality_vs_tb', 'similarity_to_old_translation']].copy()
        comparison_df = comparison_df.rename(columns={
            'quality_vs_tb': 'quality_vs_tb_comparison',
            'similarity_to_old_translation': 'similarity_to_old_translation_comparison'
        })
        
        df_with_comparison = df.merge(comparison_df, on='source', how='left')
        
        df_with_comparison['quality_diff'] = df_with_comparison['quality_vs_tb'] - df_with_comparison['quality_vs_tb_comparison']
        df_with_comparison['similarity_diff'] = df_with_comparison['similarity_to_old_translation'] - df_with_comparison['similarity_to_old_translation_comparison']
        
        quality_diff = df_with_comparison.groupby('translator_name')['quality_diff'].agg(agg_funcs).reset_index().set_axis(
            ['translator_name'] + agg_names, axis=1
        )
        
        similarity_diff = df_with_comparison.groupby('translator_name')['similarity_diff'].agg(agg_funcs).reset_index().set_axis(
            ['translator_name'] + agg_names, axis=1
        )
        
        display(HTML(f"<h4>Difference in Quality Versus {compare_to_column}</h4>"))
        display(quality_diff.sort_values(order_by, ascending=False))
        
        display(HTML(f"<h4>Difference in Similarity to Translation Bureau Translation Versus {compare_to_column}</h4>"))
        display(similarity_diff.sort_values(order_by, ascending=False))
        
    else:
        display(HTML("<h4>Quality of Translations Versus Translation Bureau</h4>"))
        display(quality_stats.sort_values(order_by, ascending=False))
        
        display(HTML("<h4>Similarity to Translation Bureau Translation</h4>"))
        display(similarity_stats.sort_values(order_by, ascending=False))


In [3]:
df_test_no_rules = pd.read_csv("translation_results/translation_comparison_test_no_rules_20250905-1430.csv")
df_test_rules = pd.read_csv("translation_results/translation_comparison_test_rules_20250906-0529.csv")
df_test_rules_v2 = pd.read_csv("translation_results/translation_comparison_test_rules_v2_20250906-2158.csv")
df_train_no_rules = pd.read_csv("translation_results/translation_comparison_train_no_rules_20250905-2228.csv")
df_train_rules = pd.read_csv("translation_results/translation_comparison_train_rules_20250906-1419.csv")
df_train_rules_v2 = pd.read_csv("translation_results/translation_comparison_train_rules_v2_20250907-0649.csv")

df_test_rules_error_data = pd.read_json("translation_results/translation_errors_test_rules_20250906-0529.json").T
df_test_rules_v2_error_data = pd.read_json("translation_results/translation_errors_test_rules_v2_20250906-2158.json").T
df_train_rules_error_data = pd.read_json("translation_results/translation_errors_train_rules_20250906-1419.json").T
df_train_rules_v2_error_data = pd.read_json("translation_results/translation_errors_train_rules_v2_20250907-0649.json").T

test_rules_error_list = df_test_rules_error_data.index.to_list()
test_rules_v2_error_list = df_test_rules_v2_error_data.index.to_list()
train_rules_error_list = df_train_rules_error_data.index.to_list()
train_rules_v2_error_list = df_train_rules_v2_error_data.index.to_list()

for df, error_list in [
    (df_test_no_rules, list()), 
    (df_test_rules, test_rules_error_list), 
    (df_test_rules_v2, test_rules_v2_error_list), 
    (df_train_no_rules, list()), 
    (df_train_rules, train_rules_error_list), 
    (df_train_rules_v2, train_rules_v2_error_list)
]:
    df['error'] = False
    df.iloc[
        [x for sample in error_list for x in range((sample - 1) * 7, sample * 7)], 
        df.columns.get_loc('error')
    ] = True


In [4]:
for x in [test_rules_error_list, test_rules_v2_error_list, train_rules_error_list, train_rules_v2_error_list]:
    print(len(x))

914
937
572
542


 # compare quality under different conditions

In [5]:
display_stats(df_test_no_rules, heading="\nTest Data - no find and replace\n")
display_stats(df_test_rules, heading="\nTest Data - with preferential translations\n")
display_stats(df_test_rules_v2, heading="\nTest Data - with preferential translations - finetuned for find and replace\n")

Unnamed: 0,translator_name,mean,min,max,1%,10%,50%,90%,99%
6,opus_mt_finetuned,0.04,-0.47,0.64,-0.07,-0.02,0.03,0.13,0.33
5,opus_mt_base,0.05,-0.27,0.63,-0.07,-0.02,0.03,0.15,0.33
4,nllb_3b_base_researchonly,0.05,-0.97,0.65,-0.16,-0.02,0.03,0.15,0.33
3,mbart50_mmt_finetuned,0.05,-0.47,0.63,-0.08,-0.02,0.03,0.14,0.32
2,mbart50_mmt_base,0.05,-0.66,0.67,-0.08,-0.02,0.03,0.15,0.34
1,m2m100_418m_finetuned,0.05,-0.66,0.64,-0.08,-0.02,0.03,0.13,0.32
0,m2m100_418m_base,0.06,-0.76,0.67,-0.11,-0.02,0.04,0.16,0.34


Unnamed: 0,translator_name,mean,min,max,1%,10%,50%,90%,99%
6,opus_mt_finetuned,0.9,-0.02,1.0,0.54,0.8,0.93,0.98,1.0
5,opus_mt_base,0.9,0.16,1.0,0.59,0.79,0.92,0.98,1.0
4,nllb_3b_base_researchonly,0.89,-0.08,1.0,0.49,0.77,0.91,0.97,1.0
3,mbart50_mmt_finetuned,0.9,-0.02,1.0,0.53,0.8,0.92,0.98,1.0
2,mbart50_mmt_base,0.89,0.19,1.0,0.58,0.79,0.91,0.97,1.0
1,m2m100_418m_finetuned,0.89,-0.02,1.0,0.53,0.79,0.92,0.98,1.0
0,m2m100_418m_base,0.87,0.09,1.0,0.51,0.76,0.89,0.96,0.99


Unnamed: 0,translator_name,mean,min,max,1%,10%,50%,90%,99%
6,opus_mt_finetuned,0.05,-0.56,0.69,-0.07,-0.02,0.03,0.14,0.34
5,opus_mt_base,0.05,-0.28,0.68,-0.07,-0.02,0.03,0.15,0.36
4,nllb_3b_base_researchonly,0.05,-0.86,0.63,-0.13,-0.02,0.04,0.15,0.35
3,mbart50_mmt_finetuned,0.05,-0.7,0.72,-0.08,-0.02,0.03,0.14,0.34
2,mbart50_mmt_base,0.05,-0.31,0.73,-0.08,-0.02,0.03,0.15,0.36
1,m2m100_418m_finetuned,0.05,-0.78,0.71,-0.08,-0.02,0.03,0.14,0.34
0,m2m100_418m_base,0.06,-0.87,0.71,-0.1,-0.01,0.04,0.16,0.36


Unnamed: 0,translator_name,mean,min,max,1%,10%,50%,90%,99%
6,opus_mt_finetuned,0.9,0.03,1.0,0.53,0.8,0.93,0.98,1.0
5,opus_mt_base,0.9,0.24,1.0,0.56,0.79,0.92,0.97,1.0
4,nllb_3b_base_researchonly,0.89,-0.06,1.0,0.52,0.78,0.91,0.97,1.0
3,mbart50_mmt_finetuned,0.9,0.02,1.0,0.53,0.79,0.92,0.98,1.0
2,mbart50_mmt_base,0.89,0.29,1.0,0.55,0.78,0.91,0.97,0.99
1,m2m100_418m_finetuned,0.89,0.02,1.0,0.51,0.79,0.92,0.98,1.0
0,m2m100_418m_base,0.87,0.03,1.0,0.51,0.76,0.89,0.96,0.99


Unnamed: 0,translator_name,mean,min,max,1%,10%,50%,90%,99%
6,opus_mt_finetuned,0.05,-0.56,0.67,-0.07,-0.02,0.03,0.13,0.34
5,opus_mt_base,0.05,-0.35,0.65,-0.07,-0.02,0.03,0.15,0.36
4,nllb_3b_base_researchonly,0.05,-0.93,0.67,-0.13,-0.02,0.04,0.15,0.35
3,mbart50_mmt_finetuned,0.05,-0.56,0.64,-0.07,-0.02,0.03,0.14,0.34
2,mbart50_mmt_base,0.05,-0.55,0.67,-0.08,-0.02,0.04,0.15,0.36
1,m2m100_418m_finetuned,0.05,-0.62,0.68,-0.08,-0.02,0.03,0.14,0.35
0,m2m100_418m_base,0.06,-0.82,0.67,-0.1,-0.02,0.04,0.16,0.37


Unnamed: 0,translator_name,mean,min,max,1%,10%,50%,90%,99%
6,opus_mt_finetuned,0.9,0.01,1.0,0.53,0.79,0.93,0.98,1.0
5,opus_mt_base,0.9,0.29,1.0,0.57,0.79,0.92,0.97,1.0
4,nllb_3b_base_researchonly,0.89,-0.08,1.0,0.49,0.78,0.91,0.97,1.0
3,mbart50_mmt_finetuned,0.9,0.01,1.0,0.52,0.79,0.92,0.98,1.0
2,mbart50_mmt_base,0.89,0.19,1.0,0.57,0.79,0.91,0.97,0.99
1,m2m100_418m_finetuned,0.89,0.01,1.0,0.52,0.79,0.92,0.97,1.0
0,m2m100_418m_base,0.87,0.03,1.0,0.49,0.76,0.89,0.96,0.99


In [6]:
display_stats(df_train_no_rules, heading="\nTraining Data - no find and replace\n")
display_stats(df_train_rules, heading="\nTraining Data - with preferential translations\n")
display_stats(df_train_rules_v2, heading="\nTraining Data - with preferential translations - finetuned for find and replace\n")

Unnamed: 0,translator_name,mean,min,max,1%,10%,50%,90%,99%
6,opus_mt_finetuned,0.02,-0.51,0.28,-0.04,-0.01,0.01,0.07,0.15
5,opus_mt_base,0.03,-0.3,0.28,-0.05,-0.01,0.02,0.08,0.16
4,nllb_3b_base_researchonly,0.03,-0.8,0.33,-0.1,-0.01,0.02,0.08,0.16
3,mbart50_mmt_finetuned,0.02,-0.5,0.28,-0.05,-0.01,0.01,0.07,0.15
2,mbart50_mmt_base,0.03,-0.65,0.31,-0.05,-0.01,0.02,0.08,0.16
1,m2m100_418m_finetuned,0.02,-0.69,0.31,-0.05,-0.01,0.02,0.07,0.15
0,m2m100_418m_base,0.03,-0.6,0.34,-0.09,-0.02,0.02,0.08,0.16


Unnamed: 0,translator_name,mean,min,max,1%,10%,50%,90%,99%
6,opus_mt_finetuned,0.95,0.03,1.0,0.79,0.89,0.96,1.0,1.0
5,opus_mt_base,0.94,0.58,1.0,0.78,0.88,0.95,0.99,1.0
4,nllb_3b_base_researchonly,0.93,-0.03,1.0,0.74,0.87,0.95,0.99,1.0
3,mbart50_mmt_finetuned,0.95,0.03,1.0,0.78,0.89,0.96,0.99,1.0
2,mbart50_mmt_base,0.93,0.21,1.0,0.77,0.87,0.95,0.99,1.0
1,m2m100_418m_finetuned,0.94,0.03,1.0,0.77,0.88,0.96,0.99,1.0
0,m2m100_418m_base,0.92,0.32,1.0,0.74,0.85,0.93,0.98,1.0


Unnamed: 0,translator_name,mean,min,max,1%,10%,50%,90%,99%
6,opus_mt_finetuned,0.03,-0.26,0.81,-0.04,-0.01,0.02,0.07,0.16
5,opus_mt_base,0.03,-0.25,0.81,-0.05,-0.01,0.02,0.08,0.17
4,nllb_3b_base_researchonly,0.03,-0.79,0.82,-0.1,-0.01,0.02,0.08,0.17
3,mbart50_mmt_finetuned,0.03,-0.38,0.81,-0.04,-0.01,0.02,0.07,0.16
2,mbart50_mmt_base,0.03,-0.74,0.81,-0.05,-0.01,0.02,0.08,0.16
1,m2m100_418m_finetuned,0.03,-0.69,0.79,-0.05,-0.01,0.02,0.07,0.16
0,m2m100_418m_base,0.03,-0.53,0.81,-0.09,-0.02,0.02,0.08,0.17


Unnamed: 0,translator_name,mean,min,max,1%,10%,50%,90%,99%
6,opus_mt_finetuned,0.95,0.18,1.0,0.79,0.89,0.96,1.0,1.0
5,opus_mt_base,0.94,0.18,1.0,0.77,0.88,0.95,0.99,1.0
4,nllb_3b_base_researchonly,0.93,-0.03,1.0,0.74,0.87,0.95,0.99,1.0
3,mbart50_mmt_finetuned,0.95,0.19,1.0,0.78,0.89,0.96,0.99,1.0
2,mbart50_mmt_base,0.93,0.18,1.0,0.76,0.87,0.94,0.99,1.0
1,m2m100_418m_finetuned,0.94,0.2,1.0,0.77,0.88,0.95,0.99,1.0
0,m2m100_418m_base,0.92,0.2,1.0,0.74,0.85,0.93,0.98,1.0


Unnamed: 0,translator_name,mean,min,max,1%,10%,50%,90%,99%
6,opus_mt_finetuned,0.02,-0.51,0.34,-0.04,-0.01,0.02,0.07,0.15
5,opus_mt_base,0.03,-0.17,0.34,-0.05,-0.01,0.02,0.08,0.16
4,nllb_3b_base_researchonly,0.03,-0.8,0.34,-0.08,-0.01,0.02,0.08,0.16
3,mbart50_mmt_finetuned,0.03,-0.54,0.34,-0.05,-0.01,0.02,0.07,0.15
2,mbart50_mmt_base,0.03,-0.71,0.33,-0.05,-0.01,0.02,0.08,0.16
1,m2m100_418m_finetuned,0.03,-0.51,0.33,-0.05,-0.01,0.02,0.07,0.15
0,m2m100_418m_base,0.03,-0.74,0.33,-0.08,-0.02,0.02,0.08,0.16


Unnamed: 0,translator_name,mean,min,max,1%,10%,50%,90%,99%
6,opus_mt_finetuned,0.95,0.22,1.0,0.79,0.89,0.96,1.0,1.0
5,opus_mt_base,0.94,0.61,1.0,0.78,0.88,0.95,0.99,1.0
4,nllb_3b_base_researchonly,0.93,0.15,1.0,0.76,0.87,0.95,0.99,1.0
3,mbart50_mmt_finetuned,0.95,0.22,1.0,0.79,0.89,0.96,0.99,1.0
2,mbart50_mmt_base,0.93,0.25,1.0,0.77,0.87,0.94,0.99,1.0
1,m2m100_418m_finetuned,0.94,0.1,1.0,0.78,0.88,0.95,0.99,1.0
0,m2m100_418m_base,0.92,0.25,1.0,0.74,0.85,0.93,0.98,1.0


# comparison of each model

In [7]:
finetuned_translators = ['opus_mt_finetuned', 'mbart50_mmt_finetuned', 'm2m100_418m_finetuned']

In [8]:
display_stats(
    df_test_rules[df_test_rules.translator_name.isin(finetuned_translators)], 
    df_test_no_rules[df_test_no_rules.translator_name.isin(finetuned_translators)], 
    heading="\nTest Data - find and replace replace minus no find and\n"
)
display_stats(
    df_test_rules_v2[df_test_rules_v2.translator_name.isin(finetuned_translators)], 
    df_test_rules[df_test_rules.translator_name.isin(finetuned_translators)], 
    heading="\nTest Data - finetuned find and replace minus find and replace\n"
)
display_stats(
    df_test_rules_v2[df_test_rules_v2.translator_name.isin(finetuned_translators)], 
    df_test_no_rules[df_test_no_rules.translator_name.isin(finetuned_translators)], 
    heading="\nTest Data - finetuned find and replace minus no find and replace\n"
)

Unnamed: 0,translator_name,mean,min,max,1%,10%,50%,90%,99%
2,opus_mt_finetuned,0.0,-0.09,0.05,0.0,0.0,0.0,0.0,0.01
1,mbart50_mmt_finetuned,0.0,-0.23,0.09,0.01,0.0,0.0,0.0,0.01
0,m2m100_418m_finetuned,0.0,-0.11,0.07,0.0,0.0,-0.0,0.01,0.02


Unnamed: 0,translator_name,mean,min,max,1%,10%,50%,90%,99%
2,opus_mt_finetuned,-0.0,0.04,0.0,-0.01,-0.0,-0.0,-0.0,-0.0
1,mbart50_mmt_finetuned,-0.0,0.03,0.0,-0.0,-0.0,0.0,-0.0,-0.0
0,m2m100_418m_finetuned,0.0,0.03,0.0,-0.03,-0.0,0.0,-0.0,-0.0


Unnamed: 0,translator_name,mean,min,max,1%,10%,50%,90%,99%
2,opus_mt_finetuned,0.05,-0.56,0.69,-0.07,-0.02,0.03,0.14,0.34
1,mbart50_mmt_finetuned,0.05,-0.7,0.72,-0.08,-0.02,0.03,0.14,0.34
0,m2m100_418m_finetuned,0.05,-0.78,0.71,-0.08,-0.02,0.03,0.14,0.34


Unnamed: 0,translator_name,mean,min,max,1%,10%,50%,90%,99%
2,opus_mt_finetuned,0.9,0.03,1.0,0.53,0.8,0.93,0.98,1.0
1,mbart50_mmt_finetuned,0.9,0.02,1.0,0.53,0.79,0.92,0.98,1.0
0,m2m100_418m_finetuned,0.89,0.02,1.0,0.51,0.79,0.92,0.98,1.0


Unnamed: 0,translator_name,mean,min,max,1%,10%,50%,90%,99%
2,opus_mt_finetuned,0.0,0.0,-0.02,-0.0,-0.0,0.0,-0.0,0.0
1,mbart50_mmt_finetuned,0.0,0.14,-0.08,0.0,-0.0,0.0,0.0,0.0
0,m2m100_418m_finetuned,0.0,0.16,-0.03,-0.0,0.0,0.0,-0.0,0.0


Unnamed: 0,translator_name,mean,min,max,1%,10%,50%,90%,99%
2,opus_mt_finetuned,-0.0,-0.02,0.0,-0.0,-0.0,-0.0,-0.0,-0.0
1,mbart50_mmt_finetuned,-0.0,-0.01,0.0,-0.0,-0.0,-0.0,-0.0,0.0
0,m2m100_418m_finetuned,-0.0,-0.01,-0.0,0.01,-0.0,-0.0,-0.0,-0.0


Unnamed: 0,translator_name,mean,min,max,1%,10%,50%,90%,99%
2,opus_mt_finetuned,0.05,-0.56,0.67,-0.07,-0.02,0.03,0.13,0.34
1,mbart50_mmt_finetuned,0.05,-0.56,0.64,-0.07,-0.02,0.03,0.14,0.34
0,m2m100_418m_finetuned,0.05,-0.62,0.68,-0.08,-0.02,0.03,0.14,0.35


Unnamed: 0,translator_name,mean,min,max,1%,10%,50%,90%,99%
2,opus_mt_finetuned,0.9,0.01,1.0,0.53,0.79,0.93,0.98,1.0
1,mbart50_mmt_finetuned,0.9,0.01,1.0,0.52,0.79,0.92,0.98,1.0
0,m2m100_418m_finetuned,0.89,0.01,1.0,0.52,0.79,0.92,0.97,1.0


Unnamed: 0,translator_name,mean,min,max,1%,10%,50%,90%,99%
2,opus_mt_finetuned,0.0,-0.09,0.03,0.0,0.0,0.0,-0.0,0.02
1,mbart50_mmt_finetuned,0.0,-0.09,0.01,0.01,0.0,0.0,0.0,0.02
0,m2m100_418m_finetuned,0.0,0.04,0.04,0.0,0.0,0.0,0.0,0.02


Unnamed: 0,translator_name,mean,min,max,1%,10%,50%,90%,99%
2,opus_mt_finetuned,-0.0,0.02,0.0,-0.01,-0.01,-0.0,-0.0,-0.0
1,mbart50_mmt_finetuned,-0.0,0.02,0.0,-0.01,-0.0,-0.0,-0.0,-0.0
0,m2m100_418m_finetuned,-0.0,0.02,-0.0,-0.02,-0.0,-0.0,-0.0,-0.0


Unnamed: 0,translator_name,mean,min,max,1%,10%,50%,90%,99%
2,opus_mt_finetuned,0.05,-0.56,0.67,-0.07,-0.02,0.03,0.13,0.34
1,mbart50_mmt_finetuned,0.05,-0.56,0.64,-0.07,-0.02,0.03,0.14,0.34
0,m2m100_418m_finetuned,0.05,-0.62,0.68,-0.08,-0.02,0.03,0.14,0.35


Unnamed: 0,translator_name,mean,min,max,1%,10%,50%,90%,99%
2,opus_mt_finetuned,0.9,0.01,1.0,0.53,0.79,0.93,0.98,1.0
1,mbart50_mmt_finetuned,0.9,0.01,1.0,0.52,0.79,0.92,0.98,1.0
0,m2m100_418m_finetuned,0.89,0.01,1.0,0.52,0.79,0.92,0.97,1.0


In [9]:
display_stats(
    df_train_rules[df_train_rules.translator_name.isin(finetuned_translators)],
    df_train_no_rules[df_train_no_rules.translator_name.isin(finetuned_translators)], 
    heading="\nTraining Data - find and replace replace minus no find and\n"
)
display_stats(
    df_train_rules_v2[df_train_rules_v2.translator_name.isin(finetuned_translators)],
    df_train_rules[df_train_rules.translator_name.isin(finetuned_translators)], 
    heading="\nTraining Data - finetuned find and replace minus find and replace\n"
)
display_stats(
    df_train_rules_v2[df_train_rules_v2.translator_name.isin(finetuned_translators)],
    df_train_no_rules[df_train_no_rules.translator_name.isin(finetuned_translators)], 
    heading="\nTraining Data - finetuned find and replace minus no find and replace\n"
)

Unnamed: 0,translator_name,mean,min,max,1%,10%,50%,90%,99%
2,opus_mt_finetuned,0.0,0.25,0.53,-0.0,-0.0,0.0,0.0,0.01
1,mbart50_mmt_finetuned,0.0,0.12,0.53,0.0,-0.0,0.0,0.0,0.01
0,m2m100_418m_finetuned,0.0,0.0,0.48,-0.0,-0.0,0.0,0.0,0.01


Unnamed: 0,translator_name,mean,min,max,1%,10%,50%,90%,99%
2,opus_mt_finetuned,-0.0,0.15,0.0,-0.0,-0.0,-0.0,-0.0,0.0
1,mbart50_mmt_finetuned,-0.0,0.16,0.0,-0.0,-0.0,-0.0,-0.0,0.0
0,m2m100_418m_finetuned,-0.0,0.17,0.0,-0.01,-0.0,-0.0,-0.0,0.0


Unnamed: 0,translator_name,mean,min,max,1%,10%,50%,90%,99%
2,opus_mt_finetuned,0.03,-0.26,0.81,-0.04,-0.01,0.02,0.07,0.16
1,mbart50_mmt_finetuned,0.03,-0.38,0.81,-0.04,-0.01,0.02,0.07,0.16
0,m2m100_418m_finetuned,0.03,-0.69,0.79,-0.05,-0.01,0.02,0.07,0.16


Unnamed: 0,translator_name,mean,min,max,1%,10%,50%,90%,99%
2,opus_mt_finetuned,0.95,0.18,1.0,0.79,0.89,0.96,1.0,1.0
1,mbart50_mmt_finetuned,0.95,0.19,1.0,0.78,0.89,0.96,0.99,1.0
0,m2m100_418m_finetuned,0.94,0.2,1.0,0.77,0.88,0.95,0.99,1.0


Unnamed: 0,translator_name,mean,min,max,1%,10%,50%,90%,99%
2,opus_mt_finetuned,-0.0,-0.25,-0.47,-0.0,-0.0,-0.0,-0.0,-0.01
1,mbart50_mmt_finetuned,-0.0,-0.16,-0.47,-0.01,-0.0,0.0,-0.0,-0.01
0,m2m100_418m_finetuned,0.0,0.18,-0.47,0.0,-0.0,0.0,-0.0,-0.0


Unnamed: 0,translator_name,mean,min,max,1%,10%,50%,90%,99%
2,opus_mt_finetuned,-0.0,0.04,0.0,0.0,0.0,-0.0,-0.0,0.0
1,mbart50_mmt_finetuned,-0.0,0.03,0.0,0.01,0.0,-0.0,-0.0,0.0
0,m2m100_418m_finetuned,0.0,-0.11,0.0,0.01,0.0,-0.0,0.0,0.0


Unnamed: 0,translator_name,mean,min,max,1%,10%,50%,90%,99%
2,opus_mt_finetuned,0.02,-0.51,0.34,-0.04,-0.01,0.02,0.07,0.15
1,mbart50_mmt_finetuned,0.03,-0.54,0.34,-0.05,-0.01,0.02,0.07,0.15
0,m2m100_418m_finetuned,0.03,-0.51,0.33,-0.05,-0.01,0.02,0.07,0.15


Unnamed: 0,translator_name,mean,min,max,1%,10%,50%,90%,99%
2,opus_mt_finetuned,0.95,0.22,1.0,0.79,0.89,0.96,1.0,1.0
1,mbart50_mmt_finetuned,0.95,0.22,1.0,0.79,0.89,0.96,0.99,1.0
0,m2m100_418m_finetuned,0.94,0.1,1.0,0.78,0.88,0.95,0.99,1.0


Unnamed: 0,translator_name,mean,min,max,1%,10%,50%,90%,99%
2,opus_mt_finetuned,0.0,0.0,0.06,-0.0,-0.0,0.0,0.0,-0.0
1,mbart50_mmt_finetuned,0.0,-0.04,0.06,-0.0,-0.0,0.0,0.0,0.0
0,m2m100_418m_finetuned,0.0,0.18,0.02,-0.0,-0.0,0.0,0.0,0.0


Unnamed: 0,translator_name,mean,min,max,1%,10%,50%,90%,99%
2,opus_mt_finetuned,-0.0,0.19,0.0,0.0,-0.0,-0.0,-0.0,0.0
1,mbart50_mmt_finetuned,-0.0,0.19,0.0,0.01,-0.0,-0.0,-0.0,0.0
0,m2m100_418m_finetuned,-0.0,0.06,0.0,0.01,-0.0,-0.0,-0.0,0.0


Unnamed: 0,translator_name,mean,min,max,1%,10%,50%,90%,99%
2,opus_mt_finetuned,0.02,-0.51,0.34,-0.04,-0.01,0.02,0.07,0.15
1,mbart50_mmt_finetuned,0.03,-0.54,0.34,-0.05,-0.01,0.02,0.07,0.15
0,m2m100_418m_finetuned,0.03,-0.51,0.33,-0.05,-0.01,0.02,0.07,0.15


Unnamed: 0,translator_name,mean,min,max,1%,10%,50%,90%,99%
2,opus_mt_finetuned,0.95,0.22,1.0,0.79,0.89,0.96,1.0,1.0
1,mbart50_mmt_finetuned,0.95,0.22,1.0,0.79,0.89,0.96,0.99,1.0
0,m2m100_418m_finetuned,0.94,0.1,1.0,0.78,0.88,0.95,0.99,1.0


# Best Results Mixture of Experts Translation Model Proof of Concept 
### (maybe if we deploy all models and take the best result, we can improve our results)

In [10]:
def add_best_results(dataframe):
    df_temp = dataframe[dataframe.translator_name != 'nllb_3b_base_researchonly'].sort_values('cosine_similarity_vs_source', ascending=False).drop_duplicates(subset='source', keep='first').copy()
    df_temp["translator_name"] = "best_results"
    return pd.concat([dataframe, df_temp]).reset_index(drop=True)

df_test_no_rules = add_best_results(df_test_no_rules)
df_test_rules = add_best_results(df_test_rules)
df_test_rules_v2 = add_best_results(df_test_rules_v2)
df_train_no_rules = add_best_results(df_train_no_rules)
df_train_rules = add_best_results(df_train_rules)
df_train_rules_v2 = add_best_results(df_train_rules_v2)

### Results Including Best Results Mixture of Experts

In [11]:
display_stats(df_test_no_rules, heading="\nTest Data - no find and replace\n")
display_stats(df_test_rules, heading="\nTest Data - with preferential translations\n")
display_stats(df_test_rules_v2, heading="\nTest Data - with preferential translations - finetuned for find and replace\n")

Unnamed: 0,translator_name,mean,min,max,1%,10%,50%,90%,99%
7,opus_mt_finetuned,0.04,-0.47,0.64,-0.07,-0.02,0.03,0.13,0.33
6,opus_mt_base,0.05,-0.27,0.63,-0.07,-0.02,0.03,0.15,0.33
5,nllb_3b_base_researchonly,0.05,-0.97,0.65,-0.16,-0.02,0.03,0.15,0.33
4,mbart50_mmt_finetuned,0.05,-0.47,0.63,-0.08,-0.02,0.03,0.14,0.32
3,mbart50_mmt_base,0.05,-0.66,0.67,-0.08,-0.02,0.03,0.15,0.34
2,m2m100_418m_finetuned,0.05,-0.66,0.64,-0.08,-0.02,0.03,0.13,0.32
1,m2m100_418m_base,0.06,-0.76,0.67,-0.11,-0.02,0.04,0.16,0.34
0,best_results,0.07,-0.07,0.67,-0.03,0.01,0.05,0.17,0.36


Unnamed: 0,translator_name,mean,min,max,1%,10%,50%,90%,99%
7,opus_mt_finetuned,0.9,-0.02,1.0,0.54,0.8,0.93,0.98,1.0
6,opus_mt_base,0.9,0.16,1.0,0.59,0.79,0.92,0.98,1.0
5,nllb_3b_base_researchonly,0.89,-0.08,1.0,0.49,0.77,0.91,0.97,1.0
4,mbart50_mmt_finetuned,0.9,-0.02,1.0,0.53,0.8,0.92,0.98,1.0
3,mbart50_mmt_base,0.89,0.19,1.0,0.58,0.79,0.91,0.97,1.0
2,m2m100_418m_finetuned,0.89,-0.02,1.0,0.53,0.79,0.92,0.98,1.0
1,m2m100_418m_base,0.87,0.09,1.0,0.51,0.76,0.89,0.96,0.99
0,best_results,0.89,0.24,1.0,0.59,0.78,0.91,0.97,1.0


Unnamed: 0,translator_name,mean,min,max,1%,10%,50%,90%,99%
7,opus_mt_finetuned,0.05,-0.56,0.69,-0.07,-0.02,0.03,0.14,0.34
6,opus_mt_base,0.05,-0.28,0.68,-0.07,-0.02,0.03,0.15,0.36
5,nllb_3b_base_researchonly,0.05,-0.86,0.63,-0.13,-0.02,0.04,0.15,0.35
4,mbart50_mmt_finetuned,0.05,-0.7,0.72,-0.08,-0.02,0.03,0.14,0.34
3,mbart50_mmt_base,0.05,-0.31,0.73,-0.08,-0.02,0.03,0.15,0.36
2,m2m100_418m_finetuned,0.05,-0.78,0.71,-0.08,-0.02,0.03,0.14,0.34
1,m2m100_418m_base,0.06,-0.87,0.71,-0.1,-0.01,0.04,0.16,0.36
0,best_results,0.07,-0.11,0.73,-0.03,0.01,0.05,0.17,0.38


Unnamed: 0,translator_name,mean,min,max,1%,10%,50%,90%,99%
7,opus_mt_finetuned,0.9,0.03,1.0,0.53,0.8,0.93,0.98,1.0
6,opus_mt_base,0.9,0.24,1.0,0.56,0.79,0.92,0.97,1.0
5,nllb_3b_base_researchonly,0.89,-0.06,1.0,0.52,0.78,0.91,0.97,1.0
4,mbart50_mmt_finetuned,0.9,0.02,1.0,0.53,0.79,0.92,0.98,1.0
3,mbart50_mmt_base,0.89,0.29,1.0,0.55,0.78,0.91,0.97,0.99
2,m2m100_418m_finetuned,0.89,0.02,1.0,0.51,0.79,0.92,0.98,1.0
1,m2m100_418m_base,0.87,0.03,1.0,0.51,0.76,0.89,0.96,0.99
0,best_results,0.89,0.28,1.0,0.55,0.79,0.91,0.97,1.0


Unnamed: 0,translator_name,mean,min,max,1%,10%,50%,90%,99%
7,opus_mt_finetuned,0.05,-0.56,0.67,-0.07,-0.02,0.03,0.13,0.34
6,opus_mt_base,0.05,-0.35,0.65,-0.07,-0.02,0.03,0.15,0.36
5,nllb_3b_base_researchonly,0.05,-0.93,0.67,-0.13,-0.02,0.04,0.15,0.35
4,mbart50_mmt_finetuned,0.05,-0.56,0.64,-0.07,-0.02,0.03,0.14,0.34
3,mbart50_mmt_base,0.05,-0.55,0.67,-0.08,-0.02,0.04,0.15,0.36
2,m2m100_418m_finetuned,0.05,-0.62,0.68,-0.08,-0.02,0.03,0.14,0.35
1,m2m100_418m_base,0.06,-0.82,0.67,-0.1,-0.02,0.04,0.16,0.37
0,best_results,0.07,-0.09,0.68,-0.03,0.01,0.05,0.17,0.38


Unnamed: 0,translator_name,mean,min,max,1%,10%,50%,90%,99%
7,opus_mt_finetuned,0.9,0.01,1.0,0.53,0.79,0.93,0.98,1.0
6,opus_mt_base,0.9,0.29,1.0,0.57,0.79,0.92,0.97,1.0
5,nllb_3b_base_researchonly,0.89,-0.08,1.0,0.49,0.78,0.91,0.97,1.0
4,mbart50_mmt_finetuned,0.9,0.01,1.0,0.52,0.79,0.92,0.98,1.0
3,mbart50_mmt_base,0.89,0.19,1.0,0.57,0.79,0.91,0.97,0.99
2,m2m100_418m_finetuned,0.89,0.01,1.0,0.52,0.79,0.92,0.97,1.0
1,m2m100_418m_base,0.87,0.03,1.0,0.49,0.76,0.89,0.96,0.99
0,best_results,0.89,0.29,1.0,0.56,0.78,0.91,0.97,1.0


In [12]:
display_stats(df_train_no_rules, heading="\nTraining Data - no find and replace\n")
display_stats(df_train_rules, heading="\nTraining Data - with preferential translations\n")
display_stats(df_train_rules_v2, heading="\nTraining Data - with preferential translations - finetuned for find and replace\n")

Unnamed: 0,translator_name,mean,min,max,1%,10%,50%,90%,99%
7,opus_mt_finetuned,0.02,-0.51,0.28,-0.04,-0.01,0.01,0.07,0.15
6,opus_mt_base,0.03,-0.3,0.28,-0.05,-0.01,0.02,0.08,0.16
5,nllb_3b_base_researchonly,0.03,-0.8,0.33,-0.1,-0.01,0.02,0.08,0.16
4,mbart50_mmt_finetuned,0.02,-0.5,0.28,-0.05,-0.01,0.01,0.07,0.15
3,mbart50_mmt_base,0.03,-0.65,0.31,-0.05,-0.01,0.02,0.08,0.16
2,m2m100_418m_finetuned,0.02,-0.69,0.31,-0.05,-0.01,0.02,0.07,0.15
1,m2m100_418m_base,0.03,-0.6,0.34,-0.09,-0.02,0.02,0.08,0.16
0,best_results,0.04,-0.06,0.34,-0.01,0.0,0.03,0.09,0.18


Unnamed: 0,translator_name,mean,min,max,1%,10%,50%,90%,99%
7,opus_mt_finetuned,0.95,0.03,1.0,0.79,0.89,0.96,1.0,1.0
6,opus_mt_base,0.94,0.58,1.0,0.78,0.88,0.95,0.99,1.0
5,nllb_3b_base_researchonly,0.93,-0.03,1.0,0.74,0.87,0.95,0.99,1.0
4,mbart50_mmt_finetuned,0.95,0.03,1.0,0.78,0.89,0.96,0.99,1.0
3,mbart50_mmt_base,0.93,0.21,1.0,0.77,0.87,0.95,0.99,1.0
2,m2m100_418m_finetuned,0.94,0.03,1.0,0.77,0.88,0.96,0.99,1.0
1,m2m100_418m_base,0.92,0.32,1.0,0.74,0.85,0.93,0.98,1.0
0,best_results,0.94,0.6,1.0,0.77,0.87,0.95,0.99,1.0


Unnamed: 0,translator_name,mean,min,max,1%,10%,50%,90%,99%
7,opus_mt_finetuned,0.03,-0.26,0.81,-0.04,-0.01,0.02,0.07,0.16
6,opus_mt_base,0.03,-0.25,0.81,-0.05,-0.01,0.02,0.08,0.17
5,nllb_3b_base_researchonly,0.03,-0.79,0.82,-0.1,-0.01,0.02,0.08,0.17
4,mbart50_mmt_finetuned,0.03,-0.38,0.81,-0.04,-0.01,0.02,0.07,0.16
3,mbart50_mmt_base,0.03,-0.74,0.81,-0.05,-0.01,0.02,0.08,0.16
2,m2m100_418m_finetuned,0.03,-0.69,0.79,-0.05,-0.01,0.02,0.07,0.16
1,m2m100_418m_base,0.03,-0.53,0.81,-0.09,-0.02,0.02,0.08,0.17
0,best_results,0.04,-0.13,0.81,-0.01,0.0,0.03,0.09,0.18


Unnamed: 0,translator_name,mean,min,max,1%,10%,50%,90%,99%
7,opus_mt_finetuned,0.95,0.18,1.0,0.79,0.89,0.96,1.0,1.0
6,opus_mt_base,0.94,0.18,1.0,0.77,0.88,0.95,0.99,1.0
5,nllb_3b_base_researchonly,0.93,-0.03,1.0,0.74,0.87,0.95,0.99,1.0
4,mbart50_mmt_finetuned,0.95,0.19,1.0,0.78,0.89,0.96,0.99,1.0
3,mbart50_mmt_base,0.93,0.18,1.0,0.76,0.87,0.94,0.99,1.0
2,m2m100_418m_finetuned,0.94,0.2,1.0,0.77,0.88,0.95,0.99,1.0
1,m2m100_418m_base,0.92,0.2,1.0,0.74,0.85,0.93,0.98,1.0
0,best_results,0.94,0.2,1.0,0.77,0.87,0.95,0.99,1.0


Unnamed: 0,translator_name,mean,min,max,1%,10%,50%,90%,99%
7,opus_mt_finetuned,0.02,-0.51,0.34,-0.04,-0.01,0.02,0.07,0.15
6,opus_mt_base,0.03,-0.17,0.34,-0.05,-0.01,0.02,0.08,0.16
5,nllb_3b_base_researchonly,0.03,-0.8,0.34,-0.08,-0.01,0.02,0.08,0.16
4,mbart50_mmt_finetuned,0.03,-0.54,0.34,-0.05,-0.01,0.02,0.07,0.15
3,mbart50_mmt_base,0.03,-0.71,0.33,-0.05,-0.01,0.02,0.08,0.16
2,m2m100_418m_finetuned,0.03,-0.51,0.33,-0.05,-0.01,0.02,0.07,0.15
1,m2m100_418m_base,0.03,-0.74,0.33,-0.08,-0.02,0.02,0.08,0.16
0,best_results,0.04,-0.13,0.34,-0.01,0.0,0.03,0.09,0.17


Unnamed: 0,translator_name,mean,min,max,1%,10%,50%,90%,99%
7,opus_mt_finetuned,0.95,0.22,1.0,0.79,0.89,0.96,1.0,1.0
6,opus_mt_base,0.94,0.61,1.0,0.78,0.88,0.95,0.99,1.0
5,nllb_3b_base_researchonly,0.93,0.15,1.0,0.76,0.87,0.95,0.99,1.0
4,mbart50_mmt_finetuned,0.95,0.22,1.0,0.79,0.89,0.96,0.99,1.0
3,mbart50_mmt_base,0.93,0.25,1.0,0.77,0.87,0.94,0.99,1.0
2,m2m100_418m_finetuned,0.94,0.1,1.0,0.78,0.88,0.95,0.99,1.0
1,m2m100_418m_base,0.92,0.25,1.0,0.74,0.85,0.93,0.98,1.0
0,best_results,0.94,0.6,1.0,0.78,0.87,0.95,0.99,1.0


### Difference Versus Best Result Mixture of Experts

In [13]:
display_stats(df_test_no_rules, heading="\nDifference Versus Best Results - Test Data - no find and replace\n", compare_to_column="best_results")
display_stats(df_test_rules, heading="\nDifference Versus Best Results - Test Data - with preferential translations\n", compare_to_column="best_results")
display_stats(df_test_rules_v2, heading="\nDifference Versus Best Results - Test Data - with preferential translations - finetuned for find and replace\n", compare_to_column="best_results")

Unnamed: 0,translator_name,mean,min,max,1%,10%,50%,90%,99%
7,opus_mt_finetuned,-0.03,-0.59,0.39,-0.18,-0.07,-0.02,0.0,0.0
6,opus_mt_base,-0.02,-0.35,0.39,-0.13,-0.06,-0.01,0.0,0.0
5,nllb_3b_base_researchonly,-0.03,-1.0,0.39,-0.24,-0.06,-0.01,0.0,0.03
4,mbart50_mmt_finetuned,-0.03,-0.59,0.39,-0.2,-0.07,-0.02,0.0,0.0
3,mbart50_mmt_base,-0.02,-0.76,0.39,-0.13,-0.06,-0.01,0.0,0.0
2,m2m100_418m_finetuned,-0.03,-0.71,0.39,-0.19,-0.07,-0.02,0.0,0.0
1,m2m100_418m_base,-0.02,-0.85,0.39,-0.16,-0.05,-0.01,0.0,0.0
0,best_results,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Unnamed: 0,translator_name,mean,min,max,1%,10%,50%,90%,99%
7,opus_mt_finetuned,0.01,-0.77,0.47,-0.09,-0.02,0.01,0.06,0.15
6,opus_mt_base,0.01,-0.44,0.27,-0.08,-0.02,0.0,0.05,0.12
5,nllb_3b_base_researchonly,-0.0,-1.02,0.28,-0.21,-0.04,0.0,0.04,0.1
4,mbart50_mmt_finetuned,0.01,-0.76,0.4,-0.1,-0.02,0.0,0.06,0.15
3,mbart50_mmt_base,0.0,-0.61,0.27,-0.08,-0.03,0.0,0.04,0.1
2,m2m100_418m_finetuned,0.01,-0.85,0.47,-0.12,-0.03,0.0,0.05,0.14
1,m2m100_418m_base,-0.02,-0.84,0.12,-0.19,-0.06,-0.0,0.0,0.04
0,best_results,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Unnamed: 0,translator_name,mean,min,max,1%,10%,50%,90%,99%
7,opus_mt_finetuned,-0.03,-0.75,0.06,-0.17,-0.07,-0.02,0.0,0.0
6,opus_mt_base,-0.02,-0.39,0.1,-0.13,-0.06,-0.01,0.0,0.0
5,nllb_3b_base_researchonly,-0.02,-1.02,0.12,-0.2,-0.06,-0.01,0.0,0.03
4,mbart50_mmt_finetuned,-0.03,-0.76,0.09,-0.18,-0.06,-0.02,0.0,0.0
3,mbart50_mmt_base,-0.02,-0.37,0.1,-0.12,-0.06,-0.01,0.0,0.0
2,m2m100_418m_finetuned,-0.03,-0.84,0.1,-0.18,-0.07,-0.02,0.0,0.0
1,m2m100_418m_base,-0.02,-0.88,0.09,-0.13,-0.04,-0.01,0.0,0.0
0,best_results,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Unnamed: 0,translator_name,mean,min,max,1%,10%,50%,90%,99%
7,opus_mt_finetuned,0.01,-0.78,0.41,-0.07,-0.02,0.0,0.06,0.14
6,opus_mt_base,0.01,-0.35,0.27,-0.08,-0.02,0.0,0.04,0.11
5,nllb_3b_base_researchonly,-0.0,-0.92,0.25,-0.17,-0.03,0.0,0.04,0.09
4,mbart50_mmt_finetuned,0.01,-0.89,0.41,-0.08,-0.02,0.0,0.05,0.13
3,mbart50_mmt_base,0.0,-0.45,0.22,-0.09,-0.03,0.0,0.03,0.09
2,m2m100_418m_finetuned,0.0,-0.89,0.39,-0.11,-0.03,0.0,0.04,0.12
1,m2m100_418m_base,-0.02,-0.91,0.15,-0.17,-0.06,-0.0,0.0,0.04
0,best_results,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Unnamed: 0,translator_name,mean,min,max,1%,10%,50%,90%,99%
7,opus_mt_finetuned,-0.03,-0.62,0.01,-0.19,-0.07,-0.02,0.0,0.0
6,opus_mt_base,-0.02,-0.42,0.04,-0.13,-0.06,-0.01,0.0,0.0
5,nllb_3b_base_researchonly,-0.03,-1.02,0.1,-0.21,-0.06,-0.01,0.0,0.03
4,mbart50_mmt_finetuned,-0.03,-0.62,0.01,-0.19,-0.07,-0.02,0.0,0.0
3,mbart50_mmt_base,-0.02,-0.57,0.04,-0.12,-0.06,-0.01,0.0,0.0
2,m2m100_418m_finetuned,-0.03,-0.71,0.0,-0.21,-0.06,-0.02,0.0,0.0
1,m2m100_418m_base,-0.02,-0.91,0.09,-0.16,-0.04,-0.01,0.0,0.0
0,best_results,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Unnamed: 0,translator_name,mean,min,max,1%,10%,50%,90%,99%
7,opus_mt_finetuned,0.01,-0.79,0.39,-0.08,-0.02,0.0,0.06,0.15
6,opus_mt_base,0.01,-0.4,0.31,-0.08,-0.02,0.0,0.04,0.11
5,nllb_3b_base_researchonly,-0.0,-1.02,0.25,-0.19,-0.03,0.0,0.04,0.1
4,mbart50_mmt_finetuned,0.01,-0.79,0.39,-0.08,-0.02,0.0,0.05,0.14
3,mbart50_mmt_base,0.0,-0.76,0.23,-0.08,-0.03,0.0,0.03,0.09
2,m2m100_418m_finetuned,0.0,-0.84,0.34,-0.11,-0.03,0.0,0.05,0.13
1,m2m100_418m_base,-0.02,-0.89,0.13,-0.18,-0.06,0.0,0.01,0.04
0,best_results,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
display_stats(df_train_no_rules, heading="\nDifference Versus Best Results - Training Data - no find and replace\n", compare_to_column="best_results")
display_stats(df_train_rules, heading="\nDifference Versus Best Results - Training Data - with preferential translations\n", compare_to_column="best_results")
display_stats(df_train_rules_v2, heading="\nDifference Versus Best Results - Training Data - with preferential translations - finetuned for find and replace\n", compare_to_column="best_results")

Unnamed: 0,translator_name,mean,min,max,1%,10%,50%,90%,99%
7,opus_mt_finetuned,-0.02,-0.72,0.07,-0.1,-0.05,-0.01,0.0,0.0
6,opus_mt_base,-0.01,-0.38,0.08,-0.09,-0.04,-0.01,0.0,0.0
5,nllb_3b_base_researchonly,-0.02,-0.86,0.09,-0.13,-0.04,-0.01,0.0,0.02
4,mbart50_mmt_finetuned,-0.02,-0.72,0.07,-0.11,-0.05,-0.01,0.0,0.0
3,mbart50_mmt_base,-0.01,-0.63,0.08,-0.09,-0.04,-0.01,0.0,0.0
2,m2m100_418m_finetuned,-0.02,-0.74,0.07,-0.1,-0.04,-0.01,0.0,0.0
1,m2m100_418m_base,-0.02,-0.66,0.08,-0.12,-0.04,-0.01,0.0,0.0
0,best_results,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Unnamed: 0,translator_name,mean,min,max,1%,10%,50%,90%,99%
7,opus_mt_finetuned,0.01,-0.71,0.29,-0.05,-0.01,0.01,0.05,0.12
6,opus_mt_base,0.01,-0.32,0.25,-0.07,-0.02,0.0,0.04,0.1
5,nllb_3b_base_researchonly,-0.0,-0.82,0.21,-0.12,-0.03,0.0,0.03,0.08
4,mbart50_mmt_finetuned,0.01,-0.71,0.29,-0.06,-0.01,0.0,0.05,0.12
3,mbart50_mmt_base,-0.0,-0.76,0.23,-0.08,-0.03,0.0,0.02,0.08
2,m2m100_418m_finetuned,0.01,-0.71,0.29,-0.07,-0.02,0.0,0.04,0.11
1,m2m100_418m_base,-0.02,-0.62,0.13,-0.14,-0.05,-0.01,0.0,0.02
0,best_results,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Unnamed: 0,translator_name,mean,min,max,1%,10%,50%,90%,99%
7,opus_mt_finetuned,-0.02,-0.38,0.07,-0.1,-0.04,-0.01,0.0,0.0
6,opus_mt_base,-0.01,-0.29,0.06,-0.09,-0.04,-0.01,0.0,0.0
5,nllb_3b_base_researchonly,-0.02,-0.86,0.08,-0.13,-0.04,-0.01,0.0,0.02
4,mbart50_mmt_finetuned,-0.02,-0.47,0.07,-0.1,-0.04,-0.01,0.0,0.0
3,mbart50_mmt_base,-0.01,-0.74,0.08,-0.09,-0.04,-0.01,0.0,0.0
2,m2m100_418m_finetuned,-0.02,-0.74,0.07,-0.1,-0.04,-0.01,0.0,0.0
1,m2m100_418m_base,-0.01,-0.55,0.07,-0.11,-0.04,-0.01,0.0,0.0
0,best_results,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Unnamed: 0,translator_name,mean,min,max,1%,10%,50%,90%,99%
7,opus_mt_finetuned,0.01,-0.45,0.25,-0.05,-0.01,0.0,0.05,0.11
6,opus_mt_base,0.0,-0.32,0.21,-0.07,-0.02,0.0,0.03,0.09
5,nllb_3b_base_researchonly,-0.0,-0.82,0.16,-0.12,-0.03,0.0,0.03,0.07
4,mbart50_mmt_finetuned,0.01,-0.58,0.25,-0.06,-0.01,0.0,0.04,0.11
3,mbart50_mmt_base,-0.0,-0.76,0.19,-0.08,-0.03,0.0,0.02,0.07
2,m2m100_418m_finetuned,0.0,-0.69,0.25,-0.07,-0.02,0.0,0.04,0.1
1,m2m100_418m_base,-0.02,-0.61,0.14,-0.13,-0.05,-0.01,0.0,0.03
0,best_results,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Unnamed: 0,translator_name,mean,min,max,1%,10%,50%,90%,99%
7,opus_mt_finetuned,-0.02,-0.63,0.07,-0.11,-0.04,-0.01,0.0,0.0
6,opus_mt_base,-0.01,-0.25,0.07,-0.09,-0.04,-0.01,0.0,0.0
5,nllb_3b_base_researchonly,-0.02,-0.81,0.08,-0.12,-0.04,-0.01,0.0,0.02
4,mbart50_mmt_finetuned,-0.02,-0.63,0.07,-0.11,-0.04,-0.01,0.0,0.0
3,mbart50_mmt_base,-0.01,-0.74,0.09,-0.09,-0.04,-0.01,0.0,0.0
2,m2m100_418m_finetuned,-0.02,-0.63,0.07,-0.1,-0.04,-0.01,0.0,0.0
1,m2m100_418m_base,-0.01,-0.75,0.08,-0.11,-0.04,-0.01,0.0,0.0
0,best_results,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Unnamed: 0,translator_name,mean,min,max,1%,10%,50%,90%,99%
7,opus_mt_finetuned,0.01,-0.64,0.23,-0.05,-0.01,0.0,0.05,0.12
6,opus_mt_base,0.0,-0.23,0.23,-0.07,-0.02,0.0,0.03,0.09
5,nllb_3b_base_researchonly,-0.0,-0.83,0.22,-0.12,-0.03,0.0,0.03,0.07
4,mbart50_mmt_finetuned,0.01,-0.64,0.23,-0.06,-0.01,0.0,0.04,0.12
3,mbart50_mmt_base,-0.0,-0.7,0.23,-0.08,-0.03,0.0,0.02,0.07
2,m2m100_418m_finetuned,0.01,-0.71,0.25,-0.07,-0.02,0.0,0.04,0.1
1,m2m100_418m_base,-0.02,-0.73,0.22,-0.13,-0.05,-0.01,0.0,0.03
0,best_results,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Survey Results

In [21]:
def transform_categorical_counts(df, columns_to_drop=None):
    if columns_to_drop:
        df = df.drop(columns_to_drop, axis=1)
    
    all_values = set()
    for col in df.columns:
        unique_vals = df[col].dropna().unique()
        all_values.update(unique_vals)
    
    all_values = sorted(list(all_values))
    
    result_data = {}
    
    for value in all_values:
        result_data[value] = [(df[col] == value).sum() for col in df.columns]
    
    result_df = pd.DataFrame(result_data, index=df.columns)
    
    return result_df


def results_summary(df):
    result_weighting = {'bad': -2, 'good': 1, 'best': 2, 'worse': -1, 'better': 1}
    
    df = pd.DataFrame(df.apply(lambda row: sum(row.get(cat, 0) * weight for cat, weight in result_weighting.items()), axis=1))
    df.columns = ['Score']
    return df.sort_values('Score', ascending=False)


In [22]:
df_survey = pd.read_csv('translation_quality_results.csv')
df_survey_results = transform_categorical_counts(df_survey, ['source', 'corpus_type'])
display(df_survey_results)
display(results_summary(df_survey_results))

Unnamed: 0,bad,best,better,good,worse
translation_bureau,6,2,0,4,0
m2m100_418m_base,7,0,0,1,0
m2m100_418m_finetuned,1,1,1,3,0
mbart50_mmt_base,2,0,0,1,1
mbart50_mmt_finetuned,2,3,0,1,0
nllb_3b_base_researchonly,2,3,0,2,0
opus_mt_base,4,0,1,1,1
opus_mt_finetuned,1,1,0,1,0


Unnamed: 0,Score
m2m100_418m_finetuned,4
nllb_3b_base_researchonly,4
mbart50_mmt_finetuned,3
opus_mt_finetuned,1
translation_bureau,-4
mbart50_mmt_base,-4
opus_mt_base,-7
m2m100_418m_base,-13
