In [1]:
import pandas as pd
from IPython.display import display, HTML

# Show every float in rendered frames with exactly two decimal places.
pd.options.display.float_format = '{:.2f}'.format

In [2]:
def _concat_runs(*csv_paths):
    """Read each CSV in ``csv_paths`` and stack the rows in the order given.

    Row labels from the individual files are kept as-is (no re-indexing).
    """
    return pd.concat([pd.read_csv(csv_path) for csv_path in csv_paths], axis=0)


# Each condition is stored as two result files; both are stacked into one frame.
df_test_no_rules = _concat_runs(
    "translation_results/translation_comparison_test_no_rules_20250911-1018.csv",
    "translation_results/translation_comparison_test_no_rules_20250912-1436.csv",
)

df_test_rules = _concat_runs(
    "translation_results/translation_comparison_test_rules_20250912-0111.csv",
    "translation_results/translation_comparison_test_rules_20250913-0546.csv",
)

df_train_no_rules = _concat_runs(
    "translation_results/translation_comparison_train_no_rules_20250911-1813.csv",
    "translation_results/translation_comparison_train_no_rules_20250912-2241.csv",
)

df_train_rules = _concat_runs(
    "translation_results/translation_comparison_train_rules_20250912-0654.csv",
    "translation_results/translation_comparison_train_rules_20250913-1136.csv",
)



In [14]:
# Peek at a single row, transposed so each field sits on its own line.
# The seed is fixed so the preview is the same on every "Restart & Run All".
df_test_no_rules.sample(n=1, random_state=0).T

Unnamed: 0,46116
source,Such detectors are particularly relevant for D...
target,Ces détecteurs sont particulièrement pertinent...
source_lang,en
other_lang,fr
translator_name,m2m100_418m_finetuned
translated_text,De tels détecteurs sont particulièrement perti...
cosine_similarity_original_translation,0.84
cosine_similarity_vs_source,0.87
cosine_similarity_vs_target,0.92


In [3]:
# Number of rows contributed by each translator in df_test_no_rules.
df_test_no_rules.loc[:, 'translator_name'].value_counts()

translator_name
best_model                   34384
mbart50_mmt_finetuned        19268
m2m100_418m_finetuned        18620
mbart50_mmt_base             18488
opus_mt_finetuned            17747
m2m100_418m_base             15851
opus_mt_base                 15642
nllb_3b_base_researchonly    10000
Name: count, dtype: int64

In [None]:
def _load_missed_replacements(json_path):
    """Load one translation-errors JSON file into a frame with one row per record.

    The file is read with ``pd.read_json`` and transposed, so the labels that
    were columns after parsing become rows.  Each label is then split on its
    FIRST underscore only: the part before it is stored in ``sample_n`` and the
    remainder (which may itself contain underscores) in ``model_name``.  The
    original labels are replaced by a fresh 0..n-1 index.

    NOTE(review): presumably every label looks like "<sample>_<model name>";
    a label without an underscore would leave ``model_name`` empty -- confirm
    against the code that writes these files.
    """
    records = pd.read_json(json_path).T
    # Split before dropping the index: the labels are the only place this information lives.
    label_parts = records.reset_index(names=['sample_n'])['sample_n'].str.split("_", n=1, expand=True)
    records = records.reset_index(drop=True)
    records[['sample_n', 'model_name']] = label_parts
    return records


df_test_rules_error_data = _load_missed_replacements("translation_results/translation_errors_test_rules_20250913-0546.json")
df_train_rules_error_data = _load_missed_replacements("translation_results/translation_errors_train_rules_20250913-1136.json")

In [4]:
def _add_derived_metrics(dataframe):
    """Return a copy of ``dataframe`` carrying the two metrics that get summarized."""
    renamed = dataframe.rename(columns={"cosine_similarity_vs_target": "similarity_to_old_translation"})
    return renamed.assign(
        quality_vs_tb=renamed['cosine_similarity_vs_source'] - renamed['cosine_similarity_original_translation']
    )


def _stats_by_translator(frame, value_column, agg_funcs, agg_names):
    """Aggregate ``value_column`` per translator; one row per translator, one column per statistic."""
    grouped = frame.groupby('translator_name')[value_column].agg(agg_funcs).reset_index()
    return grouped.set_axis(['translator_name'] + agg_names, axis=1)


def _subtract_stats(stats, stats2, agg_names):
    """Subtract ``stats2`` from ``stats`` statistic by statistic for translators present in both."""
    merged = stats.merge(stats2, on='translator_name', suffixes=('', '_2'))
    for col in agg_names:
        merged[col] = merged[col] - merged[col + '_2']
    return merged[['translator_name'] + agg_names]


def _show_table(title, table, order_by):
    """Display an <h4> title followed by ``table`` sorted by ``order_by``, descending."""
    display(HTML(f"<h4>{title}</h4>"))
    display(table.sort_values(order_by, ascending=False))


def display_stats(dataframe, dataframe2=None, order_by="translator_name", heading=None, compare_to_column=None):
    """Display per-translator summary tables for two derived metrics.

    Two metrics are derived from the input columns:

    * ``quality_vs_tb`` = ``cosine_similarity_vs_source`` minus
      ``cosine_similarity_original_translation``
    * ``similarity_to_old_translation`` = ``cosine_similarity_vs_target`` (renamed)

    For each metric one table is shown with a row per ``translator_name`` and
    the columns mean, min, max, 1%, 10%, 50%, 90%, 99%.

    Modes (mutually exclusive):

    * default -- the statistics of ``dataframe`` itself.
    * ``dataframe2`` given -- statistics of ``dataframe`` minus statistics of
      ``dataframe2``.  This subtracts summary values (e.g. min minus min); it is
      not a per-sample difference.  Translators missing from either frame are
      dropped by the inner merge.
    * ``compare_to_column`` given -- per-row difference against the rows whose
      ``translator_name`` equals ``compare_to_column``, matched on ``source``
      (the baseline is averaged per ``source`` when it has several rows), then
      summarized per translator.

    Args:
        dataframe: Frame with ``translator_name`` and the three
            ``cosine_similarity_*`` columns named above; ``source`` is also
            required in ``compare_to_column`` mode.  Not modified.
        dataframe2: Optional second frame with the same columns.
        order_by: Column of the displayed tables to sort by (descending).
        heading: Optional text shown in an ``<h3>`` element above the tables.
        compare_to_column: Optional value of ``translator_name`` to use as baseline.

    Returns:
        None. Output is rendered through ``IPython.display.display``.

    Raises:
        ValueError: If both ``dataframe2`` and ``compare_to_column`` are given,
            or if ``compare_to_column`` is not a value of ``translator_name``.
    """
    # Fail fast: reject bad arguments before any aggregation or output happens.
    if dataframe2 is not None and compare_to_column:
        raise ValueError('try again. please only choose comparison vs column or dataframe')
    if compare_to_column and compare_to_column not in dataframe['translator_name'].values:
        raise ValueError(f"compare_to_column '{compare_to_column}' not found in translator_name column")

    def make_quantile(q):
        # A factory gives each lambda its own ``q`` instead of sharing the loop variable.
        return lambda x: x.quantile(q)

    percentiles = [0.01, 0.1, 0.5, 0.9, 0.99]
    agg_funcs = ['mean', 'min', 'max'] + [make_quantile(q) for q in percentiles]
    agg_names = ['mean', 'min', 'max'] + [f"{int(q*100)}%" for q in percentiles]

    df = _add_derived_metrics(dataframe)

    if heading:
        display(HTML(f"<h3>{heading}</h3>"))

    if compare_to_column:
        # Average the baseline per source so duplicated sources yield a single reference value.
        baseline = (
            df.loc[
                df['translator_name'] == compare_to_column,
                ['source', 'quality_vs_tb', 'similarity_to_old_translation'],
            ]
            .groupby('source')
            .agg({'quality_vs_tb': 'mean', 'similarity_to_old_translation': 'mean'})
            .reset_index()
            .rename(columns={
                'quality_vs_tb': 'quality_vs_tb_comparison',
                'similarity_to_old_translation': 'similarity_to_old_translation_comparison',
            })
        )

        # Left merge keeps every row; sources without a baseline get NaN differences.
        df_with_comparison = df.merge(baseline, on='source', how='left')
        df_with_comparison['quality_diff'] = (
            df_with_comparison['quality_vs_tb'] - df_with_comparison['quality_vs_tb_comparison']
        )
        df_with_comparison['similarity_diff'] = (
            df_with_comparison['similarity_to_old_translation']
            - df_with_comparison['similarity_to_old_translation_comparison']
        )

        _show_table(
            f"Difference in Quality Versus {compare_to_column}",
            _stats_by_translator(df_with_comparison, 'quality_diff', agg_funcs, agg_names),
            order_by,
        )
        _show_table(
            f"Difference in Similarity to Translation Bureau Translation Versus {compare_to_column}",
            _stats_by_translator(df_with_comparison, 'similarity_diff', agg_funcs, agg_names),
            order_by,
        )
        return

    quality_stats = _stats_by_translator(df, 'quality_vs_tb', agg_funcs, agg_names)
    similarity_stats = _stats_by_translator(df, 'similarity_to_old_translation', agg_funcs, agg_names)

    if dataframe2 is not None:
        df2 = _add_derived_metrics(dataframe2)
        quality_stats2 = _stats_by_translator(df2, 'quality_vs_tb', agg_funcs, agg_names)
        similarity_stats2 = _stats_by_translator(df2, 'similarity_to_old_translation', agg_funcs, agg_names)

        _show_table(
            "Difference in Quality Versus Translation Bureau",
            _subtract_stats(quality_stats, quality_stats2, agg_names),
            order_by,
        )
        _show_table(
            "Difference in Similarity to Translation Bureau Translation",
            _subtract_stats(similarity_stats, similarity_stats2, agg_names),
            order_by,
        )
        return

    _show_table("Quality of Translations Versus Translation Bureau", quality_stats, order_by)
    _show_table("Similarity to Translation Bureau Translation", similarity_stats, order_by)

# Compare find-and-replace errors between models

In [5]:
def _show_missed_replacements(title_html, error_frame):
    """Display a title, the record count per model, and how many samples have 1, 2, ... records."""
    display(HTML(title_html))
    display(pd.DataFrame(error_frame['model_name'].value_counts()))
    # value_counts twice: records per sample first, then how often each of those totals occurs.
    display(pd.DataFrame(error_frame['sample_n'].value_counts().value_counts().sort_index()))


_show_missed_replacements(
    "<h3>Number of Missed Find and Replace Per 10_000 Samples: Test Dataset</h3>",
    df_test_rules_error_data,
)
_show_missed_replacements(
    "<h3>Number of Missed Find and Replace Per 10_000 Samples: Training Dataset</h3>",
    df_train_rules_error_data,
)

Unnamed: 0_level_0,count
model_name,Unnamed: 1_level_1
m2m100_418m_finetuned_100k,382
m2m100_418m_finetuned_25k,258
opus_mt_finetuned_25k,204
opus_mt_finetuned_100k,177
mbart50_mmt_finetuned_25k,162
mbart50_mmt_finetuned_100k,144


Unnamed: 0_level_0,count
count,Unnamed: 1_level_1
1,299
2,207
3,60
4,44
5,12
6,33


Unnamed: 0_level_0,count
model_name,Unnamed: 1_level_1
m2m100_418m_finetuned_100k,313
m2m100_418m_finetuned_25k,216
opus_mt_finetuned_100k,130
opus_mt_finetuned_25k,121
mbart50_mmt_finetuned_25k,118
mbart50_mmt_finetuned_100k,110


Unnamed: 0_level_0,count
count,Unnamed: 1_level_1
1,239
2,166
3,43
4,31
5,20
6,14


# Compare translation quality under different conditions

In [6]:
# Summary tables for df_test_no_rules, then for df_test_rules.
display_stats(
    dataframe=df_test_no_rules,
    heading="\nTest Data - no find and replace\n",
)
display_stats(
    dataframe=df_test_rules,
    heading="\nTest Data - with preferential translations\n",
)

Unnamed: 0,translator_name,mean,min,max,1%,10%,50%,90%,99%
7,opus_mt_finetuned,0.04,-0.55,0.62,-0.08,-0.02,0.03,0.13,0.34
6,opus_mt_base,0.05,-0.35,0.67,-0.08,-0.03,0.03,0.14,0.35
5,nllb_3b_base_researchonly,0.05,-0.95,0.67,-0.15,-0.02,0.04,0.15,0.34
4,mbart50_mmt_finetuned,0.04,-0.48,0.64,-0.08,-0.02,0.03,0.13,0.34
3,mbart50_mmt_base,0.05,-0.62,0.67,-0.08,-0.02,0.03,0.15,0.35
2,m2m100_418m_finetuned,0.04,-0.65,0.65,-0.09,-0.02,0.03,0.13,0.34
1,m2m100_418m_base,0.05,-0.89,0.68,-0.12,-0.02,0.04,0.15,0.35
0,best_model,0.07,-0.95,0.68,-0.06,-0.0,0.05,0.16,0.38


Unnamed: 0,translator_name,mean,min,max,1%,10%,50%,90%,99%
7,opus_mt_finetuned,0.9,0.04,1.0,0.51,0.8,0.93,0.98,1.0
6,opus_mt_base,0.9,0.2,1.0,0.58,0.79,0.92,0.97,1.0
5,nllb_3b_base_researchonly,0.89,-0.05,1.0,0.47,0.78,0.91,0.97,1.0
4,mbart50_mmt_finetuned,0.9,0.08,1.0,0.5,0.8,0.93,0.98,1.0
3,mbart50_mmt_base,0.89,0.21,1.0,0.56,0.78,0.91,0.97,1.0
2,m2m100_418m_finetuned,0.89,0.06,1.0,0.5,0.79,0.92,0.98,1.0
1,m2m100_418m_base,0.87,0.07,1.0,0.49,0.76,0.89,0.96,0.99
0,best_model,0.89,-0.05,1.0,0.52,0.78,0.91,0.97,1.0


Unnamed: 0,translator_name,mean,min,max,1%,10%,50%,90%,99%
6,opus_mt_finetuned_25k,0.04,-0.55,0.67,-0.08,-0.02,0.03,0.13,0.34
5,opus_mt_finetuned_100k,0.04,-0.55,0.67,-0.11,-0.03,0.03,0.13,0.33
4,mbart50_mmt_finetuned_25k,0.04,-0.53,0.68,-0.09,-0.03,0.03,0.13,0.33
3,mbart50_mmt_finetuned_100k,0.04,-0.48,0.68,-0.12,-0.03,0.03,0.13,0.33
2,m2m100_418m_finetuned_25k,0.04,-0.77,0.64,-0.15,-0.03,0.02,0.13,0.34
1,m2m100_418m_finetuned_100k,0.04,-0.67,0.65,-0.17,-0.04,0.02,0.13,0.34
0,best_model,0.05,-0.55,0.68,-0.06,-0.01,0.04,0.14,0.36


Unnamed: 0,translator_name,mean,min,max,1%,10%,50%,90%,99%
6,opus_mt_finetuned_25k,0.9,0.06,1.0,0.51,0.79,0.93,0.98,1.0
5,opus_mt_finetuned_100k,0.9,0.07,1.0,0.52,0.79,0.93,0.98,1.0
4,mbart50_mmt_finetuned_25k,0.9,0.07,1.0,0.51,0.79,0.92,0.98,1.0
3,mbart50_mmt_finetuned_100k,0.9,0.07,1.0,0.5,0.79,0.92,0.98,1.0
2,m2m100_418m_finetuned_25k,0.89,0.07,1.0,0.48,0.78,0.91,0.97,1.0
1,m2m100_418m_finetuned_100k,0.88,0.07,1.0,0.5,0.77,0.91,0.97,1.0
0,best_model,0.9,0.06,1.0,0.54,0.79,0.92,0.98,1.0


In [7]:
# Summary tables for df_train_no_rules, then for df_train_rules.
display_stats(
    dataframe=df_train_no_rules,
    heading="\nTraining Data - no find and replace\n",
)
display_stats(
    dataframe=df_train_rules,
    heading="\nTraining Data - with preferential translations\n",
)

Unnamed: 0,translator_name,mean,min,max,1%,10%,50%,90%,99%
7,opus_mt_finetuned,0.02,-0.58,0.43,-0.04,-0.01,0.01,0.07,0.16
6,opus_mt_base,0.03,-0.21,0.42,-0.05,-0.01,0.02,0.08,0.16
5,nllb_3b_base_researchonly,0.03,-0.79,0.41,-0.09,-0.01,0.02,0.08,0.17
4,mbart50_mmt_finetuned,0.02,-0.58,0.43,-0.05,-0.01,0.01,0.07,0.16
3,mbart50_mmt_base,0.03,-0.59,0.41,-0.05,-0.01,0.02,0.08,0.17
2,m2m100_418m_finetuned,0.02,-0.73,0.38,-0.05,-0.01,0.02,0.07,0.16
1,m2m100_418m_base,0.02,-0.56,0.42,-0.09,-0.02,0.02,0.08,0.16
0,best_model,0.04,-0.79,0.43,-0.03,0.0,0.03,0.09,0.18


Unnamed: 0,translator_name,mean,min,max,1%,10%,50%,90%,99%
7,opus_mt_finetuned,0.95,0.04,1.0,0.78,0.89,0.96,1.0,1.0
6,opus_mt_base,0.94,0.4,1.0,0.77,0.88,0.95,0.99,1.0
5,nllb_3b_base_researchonly,0.93,0.07,1.0,0.74,0.87,0.95,0.99,1.0
4,mbart50_mmt_finetuned,0.95,0.04,1.0,0.78,0.89,0.96,0.99,1.0
3,mbart50_mmt_base,0.93,0.3,1.0,0.77,0.87,0.94,0.99,1.0
2,m2m100_418m_finetuned,0.94,0.04,1.0,0.77,0.88,0.95,0.99,1.0
1,m2m100_418m_base,0.92,0.39,1.0,0.72,0.85,0.93,0.98,1.0
0,best_model,0.94,0.07,1.0,0.76,0.87,0.95,0.99,1.0


Unnamed: 0,translator_name,mean,min,max,1%,10%,50%,90%,99%
6,opus_mt_finetuned_25k,0.02,-0.58,0.35,-0.07,-0.01,0.02,0.07,0.16
5,opus_mt_finetuned_100k,0.02,-0.58,0.42,-0.1,-0.01,0.01,0.07,0.16
4,mbart50_mmt_finetuned_25k,0.02,-0.58,0.43,-0.07,-0.01,0.01,0.07,0.16
3,mbart50_mmt_finetuned_100k,0.02,-0.58,0.44,-0.11,-0.01,0.01,0.07,0.16
2,m2m100_418m_finetuned_25k,0.02,-0.82,0.38,-0.14,-0.02,0.01,0.07,0.16
1,m2m100_418m_finetuned_100k,0.01,-0.71,0.38,-0.16,-0.03,0.01,0.07,0.16
0,best_model,0.03,-0.58,0.44,-0.04,-0.0,0.02,0.08,0.17


Unnamed: 0,translator_name,mean,min,max,1%,10%,50%,90%,99%
6,opus_mt_finetuned_25k,0.95,0.04,1.0,0.76,0.89,0.96,0.99,1.0
5,opus_mt_finetuned_100k,0.95,0.04,1.0,0.75,0.88,0.96,1.0,1.0
4,mbart50_mmt_finetuned_25k,0.94,0.04,1.0,0.76,0.88,0.96,0.99,1.0
3,mbart50_mmt_finetuned_100k,0.94,0.04,1.0,0.75,0.87,0.96,0.99,1.0
2,m2m100_418m_finetuned_25k,0.94,0.04,1.0,0.73,0.87,0.95,0.99,1.0
1,m2m100_418m_finetuned_100k,0.93,0.04,1.0,0.72,0.85,0.95,0.99,1.0
0,best_model,0.94,0.04,1.0,0.77,0.88,0.96,0.99,1.0


# Comparison of fine-tuned models with and without find-and-replace

In [8]:
# Translator-name groups used to slice the result frames.
base_translators = [
    'opus_mt_base',
    'mbart50_mmt_base',
    'm2m100_418m_base',
    'nllb_3b_base_researchonly',
]
finetuned_translators = [
    'opus_mt_finetuned',
    'mbart50_mmt_finetuned',
    'm2m100_418m_finetuned',
]
finetuned_translators_25k = [
    'opus_mt_finetuned_25k',
    'mbart50_mmt_finetuned_25k',
    'm2m100_418m_finetuned_25k',
]
finetuned_translators_100k = [
    'opus_mt_finetuned_100k',
    'mbart50_mmt_finetuned_100k',
    'm2m100_418m_finetuned_100k',
]

df_test_ft = df_test_no_rules.loc[df_test_no_rules['translator_name'].isin(finetuned_translators)].copy()

# Drop the size suffix so the names line up with df_test_ft when display_stats merges on translator_name.
df_test_25k = (
    df_test_rules
    .loc[df_test_rules['translator_name'].isin(finetuned_translators_25k)]
    .assign(translator_name=lambda rows: rows['translator_name'].str.replace('_25k', '', regex=False))
)
df_test_100k = (
    df_test_rules
    .loc[df_test_rules['translator_name'].isin(finetuned_translators_100k)]
    .assign(translator_name=lambda rows: rows['translator_name'].str.replace('_100k', '', regex=False))
)

display_stats(
    dataframe=df_test_25k,
    dataframe2=df_test_ft,
    heading="\nTest Data - find and replace 25k replace minus no find and replace\n",
)

display_stats(
    dataframe=df_test_100k,
    dataframe2=df_test_ft,
    heading="\nTest Data - find and replace 100k replace minus no find and replace\n",
)

Unnamed: 0,translator_name,mean,min,max,1%,10%,50%,90%,99%
2,opus_mt_finetuned,0.0,0.0,0.05,-0.01,-0.0,0.0,0.0,-0.01
1,mbart50_mmt_finetuned,-0.0,-0.05,0.04,-0.01,-0.0,-0.0,-0.0,-0.01
0,m2m100_418m_finetuned,-0.01,-0.12,-0.01,-0.05,-0.01,-0.0,-0.0,0.01


Unnamed: 0,translator_name,mean,min,max,1%,10%,50%,90%,99%
2,opus_mt_finetuned,-0.0,0.03,0.0,-0.0,-0.01,-0.0,-0.0,-0.0
1,mbart50_mmt_finetuned,-0.0,-0.01,0.0,0.01,-0.01,-0.0,-0.0,-0.0
0,m2m100_418m_finetuned,-0.01,0.01,0.0,-0.03,-0.01,-0.01,-0.0,0.0


Unnamed: 0,translator_name,mean,min,max,1%,10%,50%,90%,99%
2,opus_mt_finetuned,-0.0,0.0,0.05,-0.04,-0.0,0.0,-0.0,-0.01
1,mbart50_mmt_finetuned,-0.0,0.0,0.04,-0.04,-0.01,-0.0,-0.0,-0.01
0,m2m100_418m_finetuned,-0.01,-0.02,0.0,-0.07,-0.02,-0.0,0.0,0.0


Unnamed: 0,translator_name,mean,min,max,1%,10%,50%,90%,99%
2,opus_mt_finetuned,-0.0,0.03,0.0,0.01,-0.01,-0.0,-0.0,-0.0
1,mbart50_mmt_finetuned,-0.01,-0.01,0.0,-0.0,-0.01,-0.01,-0.0,-0.0
0,m2m100_418m_finetuned,-0.01,0.01,0.0,-0.0,-0.02,-0.01,-0.0,-0.0


In [9]:
df_train_ft = df_train_no_rules.loc[df_train_no_rules['translator_name'].isin(finetuned_translators)].copy()

# Drop the size suffix so the names line up with df_train_ft when display_stats merges on translator_name.
df_train_25k = (
    df_train_rules
    .loc[df_train_rules['translator_name'].isin(finetuned_translators_25k)]
    .assign(translator_name=lambda rows: rows['translator_name'].str.replace('_25k', '', regex=False))
)
df_train_100k = (
    df_train_rules
    .loc[df_train_rules['translator_name'].isin(finetuned_translators_100k)]
    .assign(translator_name=lambda rows: rows['translator_name'].str.replace('_100k', '', regex=False))
)

display_stats(
    dataframe=df_train_25k,
    dataframe2=df_train_ft,
    heading="\nTraining Data - find and replace 25k replace minus no find and replace\n",
)

display_stats(
    dataframe=df_train_100k,
    dataframe2=df_train_ft,
    heading="\nTraining Data - find and replace 100k replace minus no find and replace\n",
)

Unnamed: 0,translator_name,mean,min,max,1%,10%,50%,90%,99%
2,opus_mt_finetuned,0.0,0.0,-0.08,-0.02,-0.0,0.0,0.0,-0.0
1,mbart50_mmt_finetuned,-0.0,0.0,0.0,-0.02,-0.0,-0.0,0.0,-0.0
0,m2m100_418m_finetuned,-0.01,-0.09,0.0,-0.08,-0.01,-0.0,-0.0,-0.0


Unnamed: 0,translator_name,mean,min,max,1%,10%,50%,90%,99%
2,opus_mt_finetuned,-0.0,0.0,0.0,-0.02,-0.01,-0.0,-0.0,0.0
1,mbart50_mmt_finetuned,-0.0,0.0,0.0,-0.02,-0.01,-0.0,-0.0,0.0
0,m2m100_418m_finetuned,-0.01,0.0,0.0,-0.04,-0.01,-0.0,-0.0,0.0


Unnamed: 0,translator_name,mean,min,max,1%,10%,50%,90%,99%
2,opus_mt_finetuned,-0.0,0.0,-0.01,-0.06,-0.0,-0.0,0.0,-0.0
1,mbart50_mmt_finetuned,-0.0,0.0,0.01,-0.07,-0.01,-0.0,0.0,0.0
0,m2m100_418m_finetuned,-0.01,0.02,0.0,-0.11,-0.02,-0.0,-0.0,0.0


Unnamed: 0,translator_name,mean,min,max,1%,10%,50%,90%,99%
2,opus_mt_finetuned,-0.01,0.0,0.0,-0.03,-0.01,-0.0,-0.0,0.0
1,mbart50_mmt_finetuned,-0.01,0.0,0.0,-0.03,-0.01,-0.0,-0.0,0.0
0,m2m100_418m_finetuned,-0.01,0.0,0.0,-0.05,-0.03,-0.01,-0.0,0.0


# Best Results Mixture of Experts Translation Model Proof of Concept 
### If we deploy all models and take the best result, we can improve our results.

In [10]:
# Per-row differences against the "best_model" rows, for each test frame.
display_stats(
    dataframe=df_test_no_rules,
    heading="\nDifference Versus Best Results - Test Data - no find and replace\n",
    compare_to_column="best_model",
)
display_stats(
    dataframe=df_test_rules,
    heading="\nDifference Versus Best Results - Test Data - with preferential translations\n",
    compare_to_column="best_model",
)

Unnamed: 0,translator_name,mean,min,max,1%,10%,50%,90%,99%
7,opus_mt_finetuned,-0.03,-0.65,0.31,-0.19,-0.07,-0.02,0.0,0.03
6,opus_mt_base,-0.02,-0.37,0.31,-0.13,-0.06,-0.01,0.0,0.03
5,nllb_3b_base_researchonly,-0.02,-0.78,0.14,-0.17,-0.04,-0.01,0.0,0.0
4,mbart50_mmt_finetuned,-0.02,-0.51,0.31,-0.18,-0.06,-0.01,0.0,0.04
3,mbart50_mmt_base,-0.02,-0.62,0.31,-0.11,-0.05,-0.01,0.01,0.05
2,m2m100_418m_finetuned,-0.02,-0.68,0.31,-0.22,-0.06,-0.01,0.0,0.04
1,m2m100_418m_base,-0.02,-0.85,0.31,-0.15,-0.05,-0.01,0.01,0.05
0,best_model,-0.0,-0.78,0.31,-0.07,-0.02,0.0,0.02,0.06


Unnamed: 0,translator_name,mean,min,max,1%,10%,50%,90%,99%
7,opus_mt_finetuned,0.01,-0.81,0.47,-0.09,-0.02,0.01,0.06,0.16
6,opus_mt_base,0.01,-0.34,0.32,-0.08,-0.02,0.0,0.04,0.11
5,nllb_3b_base_researchonly,-0.0,-0.71,0.19,-0.15,-0.02,0.0,0.02,0.06
4,mbart50_mmt_finetuned,0.01,-0.7,0.47,-0.1,-0.02,0.01,0.05,0.15
3,mbart50_mmt_base,0.0,-0.62,0.29,-0.09,-0.03,-0.0,0.03,0.09
2,m2m100_418m_finetuned,0.0,-0.79,0.41,-0.14,-0.03,0.0,0.05,0.14
1,m2m100_418m_base,-0.02,-0.84,0.26,-0.2,-0.07,-0.01,0.01,0.04
0,best_model,-0.0,-0.71,0.29,-0.06,-0.02,0.0,0.02,0.07


Unnamed: 0,translator_name,mean,min,max,1%,10%,50%,90%,99%
6,opus_mt_finetuned_25k,-0.01,-0.46,0.15,-0.09,-0.03,-0.0,0.0,0.0
5,opus_mt_finetuned_100k,-0.01,-0.53,0.15,-0.13,-0.03,-0.0,0.0,0.01
4,mbart50_mmt_finetuned_25k,-0.01,-0.55,0.31,-0.12,-0.04,-0.01,0.0,0.03
3,mbart50_mmt_finetuned_100k,-0.01,-0.5,0.31,-0.14,-0.04,-0.0,0.01,0.03
2,m2m100_418m_finetuned_25k,-0.02,-0.82,0.17,-0.18,-0.05,-0.01,0.01,0.03
1,m2m100_418m_finetuned_100k,-0.02,-0.66,0.17,-0.19,-0.06,-0.01,0.01,0.03
0,best_model,0.0,-0.46,0.31,-0.05,-0.01,0.0,0.01,0.04


Unnamed: 0,translator_name,mean,min,max,1%,10%,50%,90%,99%
6,opus_mt_finetuned_25k,0.0,-0.5,0.39,-0.07,-0.01,0.0,0.02,0.06
5,opus_mt_finetuned_100k,-0.0,-0.71,0.39,-0.12,-0.02,0.0,0.02,0.06
4,mbart50_mmt_finetuned_25k,-0.0,-0.57,0.42,-0.1,-0.03,-0.0,0.02,0.07
3,mbart50_mmt_finetuned_100k,-0.0,-0.52,0.42,-0.13,-0.03,-0.0,0.02,0.07
2,m2m100_418m_finetuned_25k,-0.01,-0.75,0.21,-0.18,-0.05,-0.0,0.01,0.05
1,m2m100_418m_finetuned_100k,-0.02,-0.82,0.21,-0.19,-0.06,-0.0,0.01,0.05
0,best_model,-0.0,-0.5,0.42,-0.05,-0.01,0.0,0.01,0.05


In [11]:
# Per-row differences against the "best_model" rows, for each training frame.
display_stats(
    dataframe=df_train_no_rules,
    heading="\nDifference Versus Best Results - Training Data - no find and replace\n",
    compare_to_column="best_model",
)
display_stats(
    dataframe=df_train_rules,
    heading="\nDifference Versus Best Results - Training Data - with preferential translations\n",
    compare_to_column="best_model",
)

Unnamed: 0,translator_name,mean,min,max,1%,10%,50%,90%,99%
7,opus_mt_finetuned,-0.02,-0.64,0.29,-0.11,-0.04,-0.01,0.0,0.02
6,opus_mt_base,-0.01,-0.27,0.3,-0.08,-0.04,-0.01,0.0,0.02
5,nllb_3b_base_researchonly,-0.01,-0.64,0.06,-0.09,-0.02,-0.0,0.0,0.0
4,mbart50_mmt_finetuned,-0.01,-0.64,0.3,-0.1,-0.04,-0.01,0.0,0.02
3,mbart50_mmt_base,-0.01,-0.62,0.26,-0.08,-0.03,-0.01,0.0,0.03
2,m2m100_418m_finetuned,-0.01,-0.83,0.26,-0.11,-0.04,-0.01,0.0,0.02
1,m2m100_418m_base,-0.01,-0.59,0.29,-0.11,-0.04,-0.01,0.01,0.03
0,best_model,0.0,-0.64,0.3,-0.05,-0.01,0.0,0.01,0.04


Unnamed: 0,translator_name,mean,min,max,1%,10%,50%,90%,99%
7,opus_mt_finetuned,0.02,-0.8,0.28,-0.05,-0.01,0.01,0.05,0.12
6,opus_mt_base,0.01,-0.22,0.28,-0.06,-0.02,0.0,0.03,0.09
5,nllb_3b_base_researchonly,-0.0,-0.6,0.16,-0.08,-0.02,0.0,0.02,0.05
4,mbart50_mmt_finetuned,0.01,-0.8,0.3,-0.06,-0.01,0.01,0.05,0.12
3,mbart50_mmt_base,-0.0,-0.62,0.28,-0.07,-0.03,-0.0,0.02,0.07
2,m2m100_418m_finetuned,0.01,-0.8,0.29,-0.07,-0.02,0.0,0.04,0.12
1,m2m100_418m_base,-0.02,-0.51,0.25,-0.14,-0.05,-0.01,0.0,0.03
0,best_model,-0.0,-0.6,0.28,-0.05,-0.01,0.0,0.01,0.05


Unnamed: 0,translator_name,mean,min,max,1%,10%,50%,90%,99%
6,opus_mt_finetuned_25k,-0.01,-0.45,0.08,-0.06,-0.02,-0.0,0.0,0.0
5,opus_mt_finetuned_100k,-0.01,-0.4,0.11,-0.11,-0.02,-0.0,0.0,0.01
4,mbart50_mmt_finetuned_25k,-0.01,-0.61,0.11,-0.08,-0.02,-0.0,0.0,0.02
3,mbart50_mmt_finetuned_100k,-0.01,-0.61,0.12,-0.12,-0.03,-0.0,0.0,0.02
2,m2m100_418m_finetuned_25k,-0.01,-0.82,0.09,-0.14,-0.03,-0.0,0.0,0.02
1,m2m100_418m_finetuned_100k,-0.02,-0.76,0.11,-0.17,-0.05,-0.0,0.0,0.02
0,best_model,-0.0,-0.45,0.12,-0.04,-0.01,0.0,0.01,0.03


Unnamed: 0,translator_name,mean,min,max,1%,10%,50%,90%,99%
6,opus_mt_finetuned_25k,0.0,-0.34,0.2,-0.05,-0.01,0.0,0.01,0.05
5,opus_mt_finetuned_100k,-0.0,-0.41,0.17,-0.11,-0.01,0.0,0.02,0.05
4,mbart50_mmt_finetuned_25k,-0.0,-0.75,0.28,-0.08,-0.02,0.0,0.01,0.05
3,mbart50_mmt_finetuned_100k,-0.0,-0.75,0.28,-0.12,-0.02,0.0,0.01,0.05
2,m2m100_418m_finetuned_25k,-0.01,-0.88,0.13,-0.15,-0.04,-0.0,0.01,0.04
1,m2m100_418m_finetuned_100k,-0.02,-0.74,0.18,-0.19,-0.06,-0.0,0.01,0.04
0,best_model,0.0,-0.34,0.2,-0.03,-0.01,0.0,0.01,0.04


# Survey Results

In [12]:
def transform_categorical_counts(df, columns_to_drop=None):
    """Count how often each distinct value occurs in each column.

    Args:
        df: Frame whose cells hold category labels. Not modified.
        columns_to_drop: Optional column labels to exclude before counting.

    Returns:
        A DataFrame indexed by the remaining column names of ``df``, with one
        column per distinct non-null value found anywhere in those columns
        (in sorted order) and the occurrence counts as cells.
    """
    frame = df.drop(columns_to_drop, axis=1) if columns_to_drop else df

    # Union of the non-null values across all columns, in sorted order.
    categories = sorted(set().union(*(frame[name].dropna().unique() for name in frame.columns)))

    counts = {
        category: [(frame[name] == category).sum() for name in frame.columns]
        for category in categories
    }
    return pd.DataFrame(counts, index=frame.columns)


def results_summary(df):
    """Collapse per-category counts into one weighted score per row.

    Each row's score is the sum of count * weight over the categories
    bad (-2), worse (-1), good (+1), better (+1) and best (+2); a category
    with no column in ``df`` contributes 0.

    Returns:
        A single-column DataFrame named ``Score``, sorted from highest to lowest.
    """
    weights = {'bad': -2, 'good': 1, 'best': 2, 'worse': -1, 'better': 1}

    def weighted_total(row):
        total = 0
        for label, weight in weights.items():
            total += row.get(label, 0) * weight
        return total

    scores = pd.DataFrame(df.apply(weighted_total, axis=1))
    scores.columns = ['Score']
    return scores.sort_values('Score', ascending=False)


In [13]:
df_survey = pd.read_csv('translation_quality_results.csv')

# Count ratings per remaining column after dropping 'source' and 'corpus_type'.
df_survey_results = transform_categorical_counts(
    df_survey,
    columns_to_drop=['source', 'corpus_type'],
)

# Show the raw counts first, then the weighted ranking derived from them.
display(df_survey_results, results_summary(df_survey_results))

Unnamed: 0,bad,best,better,good,worse
translation_bureau,6,2,0,4,0
m2m100_418m_base,7,0,0,1,0
m2m100_418m_finetuned,1,1,1,3,0
mbart50_mmt_base,2,0,0,1,1
mbart50_mmt_finetuned,2,3,0,1,0
nllb_3b_base_researchonly,2,3,0,2,0
opus_mt_base,4,0,1,1,1
opus_mt_finetuned,1,1,0,1,0


Unnamed: 0,Score
m2m100_418m_finetuned,4
nllb_3b_base_researchonly,4
mbart50_mmt_finetuned,3
opus_mt_finetuned,1
translation_bureau,-4
mbart50_mmt_base,-4
opus_mt_base,-7
m2m100_418m_base,-13
