In [1]:
import pandas as pd
from IPython.display import display, HTML

# Render every float in displayed tables with two decimal places.
pd.set_option('display.float_format', '{:.2f}'.format)

In [2]:
# Data import: translation comparison results, one CSV per combination of
# dataset split (test / train) and find-and-replace setting (without / with rules).
# The timestamp prefix in each file name identifies the run that produced it.

df_test_no_rules = pd.read_csv("translation_results/20250917_1717_translation_comparison_test_no_find_and_replace.csv")
df_test_rules = pd.read_csv("translation_results/20250917_1544_translation_comparison_test.csv")
df_train_no_rules = pd.read_csv("translation_results/20250917_1750_translation_comparison_train_no_find_and_replace.csv")
df_train_rules = pd.read_csv("translation_results/20250917_1636_translation_comparison_train.csv")


In [3]:
# errors data import

import json

def parse_error_data(filepath):
    """Load a translation error log (JSON) into a flat DataFrame.

    The log may contain the top-level sections 'find_replace_error_details'
    and 'extra_token_error_details'. Each section maps a key of the form
    '<model_name>_<sample_n>' to a dict describing one logged error.

    Parameters
    ----------
    filepath : str or path-like
        Path to the JSON error log.

    Returns
    -------
    pandas.DataFrame
        One row per logged entry whose 'retry_attempts' is non-zero, with the
        columns 'sample_n', 'model_name', 'original_text', 'preprocessed_text',
        'translated_with_tokens', 'retry_attempts' and 'error_type'
        ('find_replace' or 'extra_token'). 'sample_n' is kept as the string
        taken from the key. When nothing was logged, an empty frame with the
        same columns is returned.
    """
    columns = [
        'sample_n',
        'model_name',
        'original_text',
        'preprocessed_text',
        'translated_with_tokens',
        'retry_attempts',
        'error_type',
    ]
    # (section key in the JSON file, label stored in the 'error_type' column)
    sections = [
        ('find_replace_error_details', 'find_replace'),
        ('extra_token_error_details', 'extra_token'),
    ]

    with open(filepath, 'r', encoding='utf-8') as f:
        data = json.load(f)

    rows = []
    for section_key, error_type in sections:
        if section_key not in data:
            continue
        for key, error_info in data[section_key].items():
            # Model names contain underscores themselves, so split on the last one only.
            model_name, sample_n = key.rsplit('_', 1)
            rows.append({
                'sample_n': sample_n,
                'model_name': model_name,
                'original_text': error_info.get('original_text', ''),
                'preprocessed_text': error_info.get('preprocessed_text', ''),
                'translated_with_tokens': error_info.get('translated_with_tokens', ''),
                'retry_attempts': error_info.get('retry_attempts', 0),
                'error_type': error_type,
            })

    # Pass the schema explicitly: a frame built from an empty list otherwise has
    # no 'retry_attempts' column and the filter below raises KeyError.
    df = pd.DataFrame(rows, columns=columns)

    # TODO: if we keep this error logging in prod, fix it to only log non-zero retry errors (0 retries is not an error)

    return df[df['retry_attempts'] != 0]

# Error logs matching the four result files loaded above (same timestamp prefixes):
# one JSON log per dataset split and find-and-replace setting.
df_test_no_rules_error_data = parse_error_data("translation_results/20250917_1717_translation_errors_test_no_find_and_replace.json")
df_test_rules_error_data = parse_error_data("translation_results/20250917_1544_translation_errors_test.json")
df_train_no_rules_error_data = parse_error_data("translation_results/20250917_1750_translation_errors_train_no_find_and_replace.json")   
df_train_rules_error_data = parse_error_data("translation_results/20250917_1636_translation_errors_train.json")

# check data

In [4]:
# Success-rate check: number of result rows per translator in each results file.
# Equal counts across translators mean no translator dropped a sample
# (the saved output shows 1000 rows for every translator in all four files).

for results_frame in (df_test_no_rules, df_test_rules, df_train_no_rules, df_train_rules):
    display(pd.DataFrame(results_frame['translator_name'].value_counts()))


Unnamed: 0_level_0,count
translator_name,Unnamed: 1_level_1
opus_mt_base,1000
opus_mt_finetuned,1000
m2m100_418m_base,1000
m2m100_418m_finetuned,1000
mbart50_mmt_base,1000
mbart50_mmt_finetuned,1000
best_model,1000


Unnamed: 0_level_0,count
translator_name,Unnamed: 1_level_1
opus_mt_base,1000
opus_mt_finetuned,1000
m2m100_418m_base,1000
m2m100_418m_finetuned,1000
mbart50_mmt_base,1000
mbart50_mmt_finetuned,1000
best_model,1000


Unnamed: 0_level_0,count
translator_name,Unnamed: 1_level_1
opus_mt_base,1000
opus_mt_finetuned,1000
m2m100_418m_base,1000
m2m100_418m_finetuned,1000
mbart50_mmt_base,1000
mbart50_mmt_finetuned,1000
best_model,1000


Unnamed: 0_level_0,count
translator_name,Unnamed: 1_level_1
opus_mt_base,1000
opus_mt_finetuned,1000
m2m100_418m_base,1000
m2m100_418m_finetuned,1000
mbart50_mmt_base,1000
mbart50_mmt_finetuned,1000
best_model,1000


# Errors: Missed Find and Replace

In [5]:
# Entries logged with this many retries are counted as "not fixed" below.
# Presumably this is the translator's retry limit — TODO confirm against the translation pipeline.
MAX_RETRY_ATTEMPTS = 9

display(HTML("<h4>Most errors are not fixed by retry attempts</h4>"))
display(pd.DataFrame(df_test_no_rules_error_data['retry_attempts'].value_counts()).sort_index(ascending=False))

# value_counts() ignores missing values, so drop them here to keep the totals
# consistent with the table displayed above.
retry_attempts = df_test_no_rules_error_data['retry_attempts'].dropna()
n_total = int(retry_attempts.count())
n_fixed = int((retry_attempts != MAX_RETRY_ATTEMPTS).sum())
# Avoid ZeroDivisionError when the log contains no errors at all.
fixed_share = n_fixed / n_total if n_total else 0.0
display(HTML(f"<h4>Number of errors fixed by re-attempted translations: {n_fixed} out of {n_total} ({fixed_share:0.1%})</h4>"))

Unnamed: 0_level_0,count
retry_attempts,Unnamed: 1_level_1
9,86
8,2
7,1
5,1
4,5
3,4
2,8
1,3


In [6]:
def display_error_summary(error_data, dataset_label):
    """Display the error tables for one dataset.

    Shows, in order: total errors per model, how many samples had 1, 2, ...
    logged errors (across all models), and the find-and-replace errors per model.

    Parameters
    ----------
    error_data : pandas.DataFrame
        Frame with the columns 'model_name', 'sample_n' and 'error_type'
        (as returned by parse_error_data).
    dataset_label : str
        Dataset description appended to the section heading.
    """
    display(HTML(f"<h3>Number of Errors Per 1000 Samples: {dataset_label}</h3>"))
    display(HTML("<h4>Total Errors</h4>"))
    display(pd.DataFrame(error_data['model_name'].value_counts()))
    display(HTML("<h4>Errors Per Sample</h4>"))
    # First value_counts: errors per sample; second: number of samples with that many errors.
    display(pd.DataFrame(error_data['sample_n'].value_counts().value_counts().sort_index()).rename_axis('n_errors'))
    display(HTML("<h4>Find-And-Replace Errors</h4>"))
    display(pd.DataFrame(error_data[error_data.error_type == 'find_replace']['model_name'].value_counts()))


# NOTE(review): in the saved output both training tables are identical to the
# test (without find-and-replace) tables — confirm the training error logs
# really contain different data.
display_error_summary(df_test_no_rules_error_data, "Test Dataset (Without Find-and-Replace)")
display_error_summary(df_test_rules_error_data, "Test Dataset")
display_error_summary(df_train_no_rules_error_data, "Training Dataset (Without Find-and-Replace)")
display_error_summary(df_train_rules_error_data, "Training Dataset")

Unnamed: 0_level_0,count
model_name,Unnamed: 1_level_1
m2m100_418m_finetuned,36
opus_mt_finetuned,27
mbart50_mmt_finetuned,19
opus_mt_base,13
m2m100_418m_base,8
mbart50_mmt_base,7


Unnamed: 0_level_0,count
n_errors,Unnamed: 1_level_1
1,50
2,11
3,10
4,2


Unnamed: 0_level_0,count
model_name,Unnamed: 1_level_1
m2m100_418m_finetuned,36
opus_mt_finetuned,27
mbart50_mmt_finetuned,19
opus_mt_base,13
m2m100_418m_base,8
mbart50_mmt_base,7


Unnamed: 0_level_0,count
model_name,Unnamed: 1_level_1
m2m100_418m_finetuned,28
opus_mt_finetuned,24
mbart50_mmt_finetuned,20
opus_mt_base,12
m2m100_418m_base,6
mbart50_mmt_base,5


Unnamed: 0_level_0,count
n_errors,Unnamed: 1_level_1
1,33
2,9
3,12
4,2


Unnamed: 0_level_0,count
model_name,Unnamed: 1_level_1
m2m100_418m_finetuned,28
opus_mt_finetuned,24
mbart50_mmt_finetuned,20
opus_mt_base,12
m2m100_418m_base,6
mbart50_mmt_base,5


Unnamed: 0_level_0,count
model_name,Unnamed: 1_level_1
m2m100_418m_finetuned,36
opus_mt_finetuned,27
mbart50_mmt_finetuned,19
opus_mt_base,13
m2m100_418m_base,8
mbart50_mmt_base,7


Unnamed: 0_level_0,count
n_errors,Unnamed: 1_level_1
1,50
2,11
3,10
4,2


Unnamed: 0_level_0,count
model_name,Unnamed: 1_level_1
m2m100_418m_finetuned,36
opus_mt_finetuned,27
mbart50_mmt_finetuned,19
opus_mt_base,13
m2m100_418m_base,8
mbart50_mmt_base,7


Unnamed: 0_level_0,count
model_name,Unnamed: 1_level_1
m2m100_418m_finetuned,36
opus_mt_finetuned,27
mbart50_mmt_finetuned,19
opus_mt_base,13
m2m100_418m_base,8
mbart50_mmt_base,7


Unnamed: 0_level_0,count
n_errors,Unnamed: 1_level_1
1,50
2,11
3,10
4,2


Unnamed: 0_level_0,count
model_name,Unnamed: 1_level_1
m2m100_418m_finetuned,36
opus_mt_finetuned,27
mbart50_mmt_finetuned,19
opus_mt_base,13
m2m100_418m_base,8
mbart50_mmt_base,7


# Compare translation quality under different conditions

In [7]:
def _make_quantile(q):
    """Return a reducer that computes the q-th quantile of a Series."""
    return lambda x: x.quantile(q)


def _prepare_scores(dataframe):
    """Return a copy with the target-similarity column renamed and 'quality_vs_tb' added."""
    df = dataframe.copy()
    df = df.rename(columns={"cosine_similarity_vs_target": "similarity_to_old_translation"})
    df['quality_vs_tb'] = df['cosine_similarity_vs_source'] - df['cosine_similarity_original_translation']
    return df


def _summarize_by_translator(df, value_column, agg_funcs, agg_names):
    """Aggregate value_column per translator_name into one labelled row per translator."""
    summary = df.groupby('translator_name')[value_column].agg(agg_funcs).reset_index()
    return summary.set_axis(['translator_name'] + agg_names, axis=1)


def _stats_difference(stats, stats2, agg_names):
    """Subtract the statistics in stats2 from those in stats, matching rows on translator_name."""
    merged = stats.merge(stats2, on='translator_name', suffixes=('', '_2'))
    for col in agg_names:
        merged[col] = merged[col] - merged[col + '_2']
    return merged[['translator_name'] + agg_names]


def display_stats(dataframe, dataframe2=None, order_by="translator_name", heading=None, compare_to_column=None):
    """Display per-translator summary statistics of translation scores.

    Two scores are summarised for every translator (mean, min, max and the
    1/10/50/90/99 % quantiles):

    * 'quality_vs_tb' = cosine_similarity_vs_source - cosine_similarity_original_translation
    * 'similarity_to_old_translation' = cosine_similarity_vs_target

    Three display modes exist:

    * default: the statistics of ``dataframe``;
    * ``dataframe2`` given: statistics of ``dataframe`` minus statistics of
      ``dataframe2``, matched on translator name;
    * ``compare_to_column`` given: statistics of the per-row difference between
      each translation and the named translator's mean score for the same 'source'.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Needs the columns 'translator_name', 'cosine_similarity_vs_target',
        'cosine_similarity_vs_source' and 'cosine_similarity_original_translation'
        (plus 'source' when ``compare_to_column`` is used). It is not modified.
    dataframe2 : pandas.DataFrame, optional
        Second result set with the same columns.
    order_by : str
        Column of the summary table to sort by (descending).
    heading : str, optional
        Text shown as an <h3> heading above the tables.
    compare_to_column : str, optional
        A value of 'translator_name' to use as the reference translator.

    Raises
    ------
    ValueError
        If both ``dataframe2`` and ``compare_to_column`` are given, or if
        ``compare_to_column`` is not present in 'translator_name'. Both checks
        run before anything is displayed.
    """
    # Validate the arguments first so nothing is displayed for an invalid call.
    if dataframe2 is not None and compare_to_column:
        raise ValueError('try again. please only choose comparison vs column or dataframe')
    if compare_to_column and compare_to_column not in dataframe['translator_name'].values:
        raise ValueError(f"compare_to_column '{compare_to_column}' not found in translator_name column")

    percentiles = [0.01, 0.1, 0.5, 0.9, 0.99]
    agg_funcs = ['mean', 'min', 'max'] + [_make_quantile(q) for q in percentiles]
    agg_names = ['mean', 'min', 'max'] + [f"{int(q*100)}%" for q in percentiles]

    df = _prepare_scores(dataframe)

    if heading:
        display(HTML(f"<h3>{heading}</h3>"))

    if dataframe2 is not None:
        df2 = _prepare_scores(dataframe2)

        quality_diff = _stats_difference(
            _summarize_by_translator(df, 'quality_vs_tb', agg_funcs, agg_names),
            _summarize_by_translator(df2, 'quality_vs_tb', agg_funcs, agg_names),
            agg_names,
        )
        similarity_diff = _stats_difference(
            _summarize_by_translator(df, 'similarity_to_old_translation', agg_funcs, agg_names),
            _summarize_by_translator(df2, 'similarity_to_old_translation', agg_funcs, agg_names),
            agg_names,
        )

        display(HTML("<h4>Difference in Quality Versus Translation Bureau</h4>"))
        display(quality_diff.sort_values(order_by, ascending=False))

        display(HTML("<h4>Difference in Similarity to Translation Bureau Translation</h4>"))
        display(similarity_diff.sort_values(order_by, ascending=False))
        return

    if compare_to_column:
        # Mean score of the reference translator per source text.
        comparison_df = df[df['translator_name'] == compare_to_column][['source', 'quality_vs_tb', 'similarity_to_old_translation']].copy()
        comparison_df = comparison_df.groupby('source').agg({
            'quality_vs_tb': 'mean',
            'similarity_to_old_translation': 'mean'
        }).reset_index()
        comparison_df = comparison_df.rename(columns={
            'quality_vs_tb': 'quality_vs_tb_comparison',
            'similarity_to_old_translation': 'similarity_to_old_translation_comparison'
        })

        # Left join keeps every translation; sources the reference translator
        # did not translate yield NaN differences.
        df_with_comparison = df.merge(comparison_df, on='source', how='left')
        df_with_comparison['quality_diff'] = df_with_comparison['quality_vs_tb'] - df_with_comparison['quality_vs_tb_comparison']
        df_with_comparison['similarity_diff'] = df_with_comparison['similarity_to_old_translation'] - df_with_comparison['similarity_to_old_translation_comparison']

        quality_diff = _summarize_by_translator(df_with_comparison, 'quality_diff', agg_funcs, agg_names)
        similarity_diff = _summarize_by_translator(df_with_comparison, 'similarity_diff', agg_funcs, agg_names)

        display(HTML(f"<h4>Difference in Quality Versus {compare_to_column}</h4>"))
        display(quality_diff.sort_values(order_by, ascending=False))

        display(HTML(f"<h4>Difference in Similarity to Translation Bureau Translation Versus {compare_to_column}</h4>"))
        display(similarity_diff.sort_values(order_by, ascending=False))
        return

    quality_stats = _summarize_by_translator(df, 'quality_vs_tb', agg_funcs, agg_names)
    similarity_stats = _summarize_by_translator(df, 'similarity_to_old_translation', agg_funcs, agg_names)

    display(HTML("<h4>Quality of Translations Versus Translation Bureau</h4>"))
    display(quality_stats.sort_values(order_by, ascending=False))

    display(HTML("<h4>Similarity to Translation Bureau Translation</h4>"))
    display(similarity_stats.sort_values(order_by, ascending=False))

In [8]:
# Summary statistics on the test split, without and with the find-and-replace rules.
# NOTE(review): the saved output tables for the two calls are identical — confirm
# the two CSVs really contain different translations.
display_stats(df_test_no_rules, heading="\nTest Data - no find and replace\n")
display_stats(df_test_rules, heading="\nTest Data - with preferential translations\n")

Unnamed: 0,translator_name,mean,min,max,1%,10%,50%,90%,99%
6,opus_mt_finetuned,0.05,-0.35,0.57,-0.07,-0.02,0.02,0.13,0.41
5,opus_mt_base,0.05,-0.35,0.58,-0.07,-0.02,0.03,0.15,0.41
4,mbart50_mmt_finetuned,0.05,-0.2,0.57,-0.07,-0.02,0.03,0.13,0.41
3,mbart50_mmt_base,0.05,-0.13,0.57,-0.08,-0.02,0.03,0.15,0.41
2,m2m100_418m_finetuned,0.05,-0.35,0.56,-0.07,-0.02,0.03,0.13,0.42
1,m2m100_418m_base,0.06,-0.79,0.58,-0.09,-0.02,0.04,0.16,0.43
0,best_model,0.07,-0.06,0.58,-0.02,0.01,0.05,0.17,0.44


Unnamed: 0,translator_name,mean,min,max,1%,10%,50%,90%,99%
6,opus_mt_finetuned,0.91,0.08,1.0,0.49,0.8,0.93,0.98,1.0
5,opus_mt_base,0.9,0.29,1.0,0.49,0.79,0.92,0.97,0.99
4,mbart50_mmt_finetuned,0.9,0.08,1.0,0.49,0.8,0.93,0.98,1.0
3,mbart50_mmt_base,0.89,0.3,1.0,0.52,0.79,0.92,0.97,1.0
2,m2m100_418m_finetuned,0.9,0.08,1.0,0.48,0.79,0.92,0.98,1.0
1,m2m100_418m_base,0.87,0.19,1.0,0.47,0.76,0.89,0.96,0.99
0,best_model,0.89,0.34,1.0,0.48,0.78,0.91,0.97,1.0


Unnamed: 0,translator_name,mean,min,max,1%,10%,50%,90%,99%
6,opus_mt_finetuned,0.05,-0.35,0.57,-0.07,-0.02,0.02,0.13,0.41
5,opus_mt_base,0.05,-0.35,0.58,-0.07,-0.02,0.03,0.15,0.41
4,mbart50_mmt_finetuned,0.05,-0.2,0.57,-0.07,-0.02,0.03,0.13,0.41
3,mbart50_mmt_base,0.05,-0.13,0.57,-0.08,-0.02,0.03,0.15,0.41
2,m2m100_418m_finetuned,0.05,-0.35,0.56,-0.07,-0.02,0.03,0.13,0.42
1,m2m100_418m_base,0.06,-0.79,0.58,-0.09,-0.02,0.04,0.16,0.43
0,best_model,0.07,-0.06,0.58,-0.02,0.01,0.05,0.17,0.44


Unnamed: 0,translator_name,mean,min,max,1%,10%,50%,90%,99%
6,opus_mt_finetuned,0.91,0.08,1.0,0.49,0.8,0.93,0.98,1.0
5,opus_mt_base,0.9,0.29,1.0,0.49,0.79,0.92,0.97,0.99
4,mbart50_mmt_finetuned,0.9,0.08,1.0,0.49,0.8,0.93,0.98,1.0
3,mbart50_mmt_base,0.89,0.3,1.0,0.52,0.79,0.92,0.97,1.0
2,m2m100_418m_finetuned,0.9,0.08,1.0,0.48,0.79,0.92,0.98,1.0
1,m2m100_418m_base,0.87,0.19,1.0,0.47,0.76,0.89,0.96,0.99
0,best_model,0.89,0.34,1.0,0.48,0.78,0.91,0.97,1.0


In [9]:
# Summary statistics on the training split, without and with the find-and-replace rules.
# NOTE(review): the saved output tables for the two calls are identical — confirm
# the two CSVs really contain different translations.
display_stats(df_train_no_rules, heading="\nTraining Data - no find and replace\n")
display_stats(df_train_rules, heading="\nTraining Data - with preferential translations\n")

Unnamed: 0,translator_name,mean,min,max,1%,10%,50%,90%,99%
6,opus_mt_finetuned,0.03,-0.14,0.41,-0.04,-0.01,0.02,0.08,0.16
5,opus_mt_base,0.03,-0.09,0.44,-0.04,-0.01,0.02,0.09,0.16
4,mbart50_mmt_finetuned,0.03,-0.26,0.35,-0.03,-0.01,0.02,0.08,0.16
3,mbart50_mmt_base,0.03,-0.11,0.41,-0.05,-0.01,0.02,0.09,0.18
2,m2m100_418m_finetuned,0.03,-0.13,0.38,-0.04,-0.01,0.02,0.08,0.16
1,m2m100_418m_base,0.03,-0.33,0.37,-0.07,-0.01,0.02,0.09,0.18
0,best_model,0.04,-0.02,0.44,-0.01,0.0,0.03,0.1,0.19


Unnamed: 0,translator_name,mean,min,max,1%,10%,50%,90%,99%
6,opus_mt_finetuned,0.95,0.4,1.0,0.79,0.89,0.96,1.0,1.0
5,opus_mt_base,0.94,0.41,1.0,0.78,0.87,0.95,0.99,1.0
4,mbart50_mmt_finetuned,0.95,0.46,1.0,0.78,0.89,0.96,0.99,1.0
3,mbart50_mmt_base,0.93,0.35,1.0,0.77,0.86,0.94,0.98,1.0
2,m2m100_418m_finetuned,0.94,0.51,1.0,0.8,0.88,0.95,0.99,1.0
1,m2m100_418m_base,0.92,0.45,1.0,0.72,0.85,0.93,0.98,1.0
0,best_model,0.93,0.41,1.0,0.77,0.86,0.95,0.99,1.0


Unnamed: 0,translator_name,mean,min,max,1%,10%,50%,90%,99%
6,opus_mt_finetuned,0.03,-0.14,0.41,-0.04,-0.01,0.02,0.08,0.16
5,opus_mt_base,0.03,-0.09,0.44,-0.04,-0.01,0.02,0.09,0.16
4,mbart50_mmt_finetuned,0.03,-0.26,0.35,-0.03,-0.01,0.02,0.08,0.16
3,mbart50_mmt_base,0.03,-0.11,0.41,-0.05,-0.01,0.02,0.09,0.18
2,m2m100_418m_finetuned,0.03,-0.13,0.38,-0.04,-0.01,0.02,0.08,0.16
1,m2m100_418m_base,0.03,-0.33,0.37,-0.07,-0.01,0.02,0.09,0.18
0,best_model,0.04,-0.02,0.44,-0.01,0.0,0.03,0.1,0.19


Unnamed: 0,translator_name,mean,min,max,1%,10%,50%,90%,99%
6,opus_mt_finetuned,0.95,0.4,1.0,0.79,0.89,0.96,1.0,1.0
5,opus_mt_base,0.94,0.41,1.0,0.78,0.87,0.95,0.99,1.0
4,mbart50_mmt_finetuned,0.95,0.46,1.0,0.78,0.89,0.96,0.99,1.0
3,mbart50_mmt_base,0.93,0.35,1.0,0.77,0.86,0.94,0.98,1.0
2,m2m100_418m_finetuned,0.94,0.51,1.0,0.8,0.88,0.95,0.99,1.0
1,m2m100_418m_base,0.92,0.45,1.0,0.72,0.85,0.93,0.98,1.0
0,best_model,0.93,0.41,1.0,0.77,0.86,0.95,0.99,1.0


# Proof of Concept: Best-Result Mixture-of-Experts Translation Model
### If we deploy all models and keep the best result for each sample, we can improve overall translation quality.

In [10]:
# Per-sample differences of every translator relative to the 'best_model' rows (test split).
# NOTE(review): the saved output tables for the two calls are identical — confirm
# the two CSVs really contain different translations.
display_stats(df_test_no_rules, heading="\nDifference Versus Best Results - Test Data - no find and replace\n", compare_to_column="best_model")
display_stats(df_test_rules, heading="\nDifference Versus Best Results - Test Data - with preferential translations\n", compare_to_column="best_model")

Unnamed: 0,translator_name,mean,min,max,1%,10%,50%,90%,99%
6,opus_mt_finetuned,-0.03,-0.41,0.0,-0.15,-0.07,-0.02,0.0,0.0
5,opus_mt_base,-0.03,-0.41,0.0,-0.13,-0.06,-0.01,0.0,0.0
4,mbart50_mmt_finetuned,-0.03,-0.39,0.0,-0.16,-0.07,-0.02,0.0,0.0
3,mbart50_mmt_base,-0.02,-0.15,0.01,-0.11,-0.06,-0.01,0.0,0.0
2,m2m100_418m_finetuned,-0.03,-0.41,0.0,-0.18,-0.07,-0.02,0.0,0.0
1,m2m100_418m_base,-0.02,-0.8,0.0,-0.12,-0.04,-0.01,0.0,0.0
0,best_model,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Unnamed: 0,translator_name,mean,min,max,1%,10%,50%,90%,99%
6,opus_mt_finetuned,0.02,-0.52,0.26,-0.08,-0.02,0.01,0.07,0.14
5,opus_mt_base,0.01,-0.39,0.16,-0.1,-0.03,0.0,0.05,0.11
4,mbart50_mmt_finetuned,0.01,-0.52,0.3,-0.09,-0.02,0.0,0.06,0.14
3,mbart50_mmt_base,0.0,-0.17,0.15,-0.08,-0.03,0.0,0.04,0.08
2,m2m100_418m_finetuned,0.01,-0.52,0.26,-0.11,-0.03,0.0,0.05,0.13
1,m2m100_418m_base,-0.02,-0.8,0.08,-0.22,-0.06,-0.0,0.0,0.04
0,best_model,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Unnamed: 0,translator_name,mean,min,max,1%,10%,50%,90%,99%
6,opus_mt_finetuned,-0.03,-0.41,0.0,-0.15,-0.07,-0.02,0.0,0.0
5,opus_mt_base,-0.03,-0.41,0.0,-0.13,-0.06,-0.01,0.0,0.0
4,mbart50_mmt_finetuned,-0.03,-0.39,0.0,-0.16,-0.07,-0.02,0.0,0.0
3,mbart50_mmt_base,-0.02,-0.15,0.01,-0.11,-0.06,-0.01,0.0,0.0
2,m2m100_418m_finetuned,-0.03,-0.41,0.0,-0.18,-0.07,-0.02,0.0,0.0
1,m2m100_418m_base,-0.02,-0.8,0.0,-0.12,-0.04,-0.01,0.0,0.0
0,best_model,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Unnamed: 0,translator_name,mean,min,max,1%,10%,50%,90%,99%
6,opus_mt_finetuned,0.02,-0.52,0.26,-0.08,-0.02,0.01,0.07,0.14
5,opus_mt_base,0.01,-0.39,0.16,-0.1,-0.03,0.0,0.05,0.11
4,mbart50_mmt_finetuned,0.01,-0.52,0.3,-0.09,-0.02,0.0,0.06,0.14
3,mbart50_mmt_base,0.0,-0.17,0.15,-0.08,-0.03,0.0,0.04,0.08
2,m2m100_418m_finetuned,0.01,-0.52,0.26,-0.11,-0.03,0.0,0.05,0.13
1,m2m100_418m_base,-0.02,-0.8,0.08,-0.22,-0.06,-0.0,0.0,0.04
0,best_model,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
# Per-sample differences of every translator relative to the 'best_model' rows (training split).
# NOTE(review): the saved output tables for the two calls are identical — confirm
# the two CSVs really contain different translations.
display_stats(df_train_no_rules, heading="\nDifference Versus Best Results - Training Data - no find and replace\n", compare_to_column="best_model")
display_stats(df_train_rules, heading="\nDifference Versus Best Results - Training Data - with preferential translations\n", compare_to_column="best_model")

Unnamed: 0,translator_name,mean,min,max,1%,10%,50%,90%,99%
6,opus_mt_finetuned,-0.02,-0.24,0.0,-0.1,-0.05,-0.01,0.0,0.0
5,opus_mt_base,-0.01,-0.18,0.0,-0.08,-0.04,-0.01,0.0,0.0
4,mbart50_mmt_finetuned,-0.02,-0.31,0.0,-0.11,-0.05,-0.01,0.0,0.0
3,mbart50_mmt_base,-0.01,-0.15,0.0,-0.09,-0.03,-0.01,0.0,0.0
2,m2m100_418m_finetuned,-0.02,-0.17,0.0,-0.11,-0.05,-0.01,0.0,0.0
1,m2m100_418m_base,-0.01,-0.38,0.01,-0.1,-0.04,-0.01,0.0,0.0
0,best_model,0.0,-0.01,0.01,0.0,0.0,0.0,0.0,0.0


Unnamed: 0,translator_name,mean,min,max,1%,10%,50%,90%,99%
6,opus_mt_finetuned,0.02,-0.09,0.24,-0.05,-0.01,0.01,0.05,0.13
5,opus_mt_base,0.0,-0.15,0.19,-0.07,-0.02,0.0,0.03,0.08
4,mbart50_mmt_finetuned,0.01,-0.39,0.21,-0.07,-0.01,0.0,0.05,0.12
3,mbart50_mmt_base,-0.0,-0.15,0.2,-0.08,-0.03,0.0,0.02,0.07
2,m2m100_418m_finetuned,0.01,-0.13,0.19,-0.06,-0.02,0.0,0.04,0.12
1,m2m100_418m_base,-0.02,-0.47,0.1,-0.12,-0.05,-0.01,0.0,0.02
0,best_model,0.0,-0.02,0.02,0.0,0.0,0.0,0.0,0.0


Unnamed: 0,translator_name,mean,min,max,1%,10%,50%,90%,99%
6,opus_mt_finetuned,-0.02,-0.24,0.0,-0.1,-0.05,-0.01,0.0,0.0
5,opus_mt_base,-0.01,-0.18,0.0,-0.08,-0.04,-0.01,0.0,0.0
4,mbart50_mmt_finetuned,-0.02,-0.31,0.0,-0.11,-0.05,-0.01,0.0,0.0
3,mbart50_mmt_base,-0.01,-0.15,0.0,-0.09,-0.03,-0.01,0.0,0.0
2,m2m100_418m_finetuned,-0.02,-0.17,0.0,-0.11,-0.05,-0.01,0.0,0.0
1,m2m100_418m_base,-0.01,-0.38,0.01,-0.1,-0.04,-0.01,0.0,0.0
0,best_model,0.0,-0.01,0.01,0.0,0.0,0.0,0.0,0.0


Unnamed: 0,translator_name,mean,min,max,1%,10%,50%,90%,99%
6,opus_mt_finetuned,0.02,-0.09,0.24,-0.05,-0.01,0.01,0.05,0.13
5,opus_mt_base,0.0,-0.15,0.19,-0.07,-0.02,0.0,0.03,0.08
4,mbart50_mmt_finetuned,0.01,-0.39,0.21,-0.07,-0.01,0.0,0.05,0.12
3,mbart50_mmt_base,-0.0,-0.15,0.2,-0.08,-0.03,0.0,0.02,0.07
2,m2m100_418m_finetuned,0.01,-0.13,0.19,-0.06,-0.02,0.0,0.04,0.12
1,m2m100_418m_base,-0.02,-0.47,0.1,-0.12,-0.05,-0.01,0.0,0.02
0,best_model,0.0,-0.02,0.02,0.0,0.0,0.0,0.0,0.0


# Survey Results

In [12]:
def transform_categorical_counts(df, columns_to_drop=None):
    """Count how often each category label occurs in every column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose cells hold category labels (missing values are ignored).
    columns_to_drop : list, optional
        Columns removed before counting.

    Returns
    -------
    pandas.DataFrame
        One row per remaining input column and one column per distinct label
        (sorted); each cell is the number of occurrences of that label.
    """
    frame = df.drop(columns_to_drop, axis=1) if columns_to_drop else df

    # Distinct non-missing labels over all columns, in sorted order.
    labels = sorted({
        label
        for name in frame.columns
        for label in frame[name].dropna().unique()
    })

    counts_by_label = {
        label: [(frame[name] == label).sum() for name in frame.columns]
        for label in labels
    }

    return pd.DataFrame(counts_by_label, index=frame.columns)


def results_summary(df):
    """Collapse per-category counts into one weighted score per row.

    Each count is multiplied by its category weight ('bad' -2, 'worse' -1,
    'good' +1, 'better' +1, 'best' +2); categories missing from ``df`` count
    as zero. Returns a single-column frame 'Score', highest score first.
    """
    weights = {'bad': -2, 'good': 1, 'best': 2, 'worse': -1, 'better': 1}

    def weighted_total(row):
        total = 0
        for category, weight in weights.items():
            total += row.get(category, 0) * weight
        return total

    scores = pd.DataFrame(df.apply(weighted_total, axis=1))
    scores.columns = ['Score']
    return scores.sort_values('Score', ascending=False)


In [13]:
# Load the survey ratings, count each rating label per column after dropping
# the 'source' and 'corpus_type' columns, then show the counts and the weighted score.
df_survey = pd.read_csv('translation_quality_results.csv')
df_survey_results = transform_categorical_counts(df_survey, ['source', 'corpus_type'])
display(df_survey_results)
display(results_summary(df_survey_results))

Unnamed: 0,bad,best,better,good,worse
translation_bureau,6,2,0,4,0
m2m100_418m_base,7,0,0,1,0
m2m100_418m_finetuned,1,1,1,3,0
mbart50_mmt_base,2,0,0,1,1
mbart50_mmt_finetuned,2,3,0,1,0
nllb_3b_base_researchonly,2,3,0,2,0
opus_mt_base,4,0,1,1,1
opus_mt_finetuned,1,1,0,1,0


Unnamed: 0,Score
m2m100_418m_finetuned,4
nllb_3b_base_researchonly,4
mbart50_mmt_finetuned,3
opus_mt_finetuned,1
translation_bureau,-4
mbart50_mmt_base,-4
opus_mt_base,-7
m2m100_418m_base,-13
