In [1]:
import pandas as pd

# Render every float in displayed frames with two decimal places.
pd.options.display.float_format = '{:.2f}'.format

In [2]:
# Percentiles reported for every metric, and the column labels of the
# resulting summary tables (three fixed statistics followed by percentiles).
_STAT_PERCENTILES = [0.01, 0.1, 0.5, 0.9, 0.99]
_STAT_NAMES = ['mean', 'min', 'max'] + [f"{int(q*100)}%" for q in _STAT_PERCENTILES]


def _quantile_agg(q):
    """Return an aggregation callable that computes quantile ``q`` of a group."""
    return lambda values: values.quantile(q)


def _with_metrics(frame):
    """Return a new frame with the two reported metric columns in place.

    ``cosine_similarity_vs_target`` is renamed to
    ``similarity_to_old_translation`` and ``quality_vs_tb`` is added as
    ``cosine_similarity_vs_source - cosine_similarity_original_translation``.
    The input frame is left untouched.
    """
    return frame.rename(
        columns={"cosine_similarity_vs_target": "similarity_to_old_translation"}
    ).assign(
        quality_vs_tb=lambda d: d['cosine_similarity_vs_source']
        - d['cosine_similarity_original_translation']
    )


def _stats_by_translator(frame, column):
    """Summarise ``column`` per ``translator_name`` (mean/min/max/percentiles)."""
    agg_funcs = ['mean', 'min', 'max'] + [_quantile_agg(q) for q in _STAT_PERCENTILES]
    stats = frame.groupby('translator_name')[column].agg(agg_funcs).reset_index()
    # The quantile lambdas get auto-generated names, so relabel all columns.
    return stats.set_axis(['translator_name'] + _STAT_NAMES, axis=1)


def _stats_difference(stats, stats2):
    """Return ``stats - stats2`` per statistic for translators present in both."""
    merged = stats.merge(stats2, on='translator_name', suffixes=('', '_2'))
    for col in _STAT_NAMES:
        merged[col] = merged[col] - merged[col + '_2']
    return merged[['translator_name'] + _STAT_NAMES]


def display_stats(dataframe, dataframe2=None, order_by="translator_name"):
    """Print and display per-translator summary statistics for two metrics.

    Two metrics are summarised for each ``translator_name``:

    * ``quality_vs_tb``: ``cosine_similarity_vs_source`` minus
      ``cosine_similarity_original_translation``.
    * ``similarity_to_old_translation``: the input column
      ``cosine_similarity_vs_target`` under a new name.

    Each summary table has the columns ``translator_name``, ``mean``, ``min``,
    ``max``, ``1%``, ``10%``, ``50%``, ``90%`` and ``99%``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain ``translator_name``, ``cosine_similarity_vs_source``,
        ``cosine_similarity_original_translation`` and
        ``cosine_similarity_vs_target``. It is not modified.
    dataframe2 : pandas.DataFrame, optional
        Frame with the same columns. When given, the displayed tables hold
        the statistics of ``dataframe`` minus those of ``dataframe2``;
        translators missing from either frame are dropped (inner merge).
    order_by : str, default "translator_name"
        Column of the summary table to sort by, in descending order.

    Returns
    -------
    None
        The tables are shown through ``display`` (supplied by the
        IPython/Jupyter session) after a printed heading.

    Raises
    ------
    KeyError
        If a required column is missing or ``order_by`` is not a summary
        column.
    """
    df = _with_metrics(dataframe)
    quality_stats = _stats_by_translator(df, 'quality_vs_tb')
    similarity_stats = _stats_by_translator(df, 'similarity_to_old_translation')

    if dataframe2 is None:
        print("\nQuality of Translations Versus Translation Bureau")
        display(quality_stats.sort_values(order_by, ascending=False))

        print("\nSimilarity to Translation Bureau Translation")
        display(similarity_stats.sort_values(order_by, ascending=False))
        return

    df2 = _with_metrics(dataframe2)
    quality_diff = _stats_difference(
        quality_stats, _stats_by_translator(df2, 'quality_vs_tb')
    )
    similarity_diff = _stats_difference(
        similarity_stats, _stats_by_translator(df2, 'similarity_to_old_translation')
    )

    print("\nDifference in Quality Versus Translation Bureau")
    display(quality_diff.sort_values(order_by, ascending=False))

    print("\nDifference in Similarity to Translation Bureau Translation")
    display(similarity_diff.sort_values(order_by, ascending=False))


In [3]:
# Comparison results (CSV) and the matching error logs (JSON) for both runs.
test_data = "translation_results/translation_comparison_20250904-1456.csv"
eval_data = "translation_results/translation_comparison_20250904-2340.csv"
test_error_data = "translation_results/translation_errors_20250904-1456.json"
eval_error_data = "translation_results/translation_errors_20250904-2340.json"

# Number of consecutive rows each sample occupies in the comparison CSVs.
# NOTE(review): presumably one row per translator model — confirm against
# the script that writes the CSVs.
ROWS_PER_SAMPLE = 7


def flag_error_rows(comparison_df, error_samples, rows_per_sample=ROWS_PER_SAMPLE):
    """Add a boolean ``error`` column to ``comparison_df``, in place.

    Every row gets ``error=False``; then, for each sample number ``s`` in
    ``error_samples``, the rows at positions ``(s - 1) * rows_per_sample`` up
    to (but excluding) ``s * rows_per_sample`` are set to ``True``.

    This is purely positional: it assumes sample numbers are 1-based and that
    the frame's rows are grouped by sample in ascending order — TODO confirm.

    Returns the same (mutated) frame for convenience.
    """
    comparison_df['error'] = False
    error_rows = [
        row
        for sample in error_samples
        for row in range((sample - 1) * rows_per_sample, sample * rows_per_sample)
    ]
    comparison_df.iloc[error_rows, comparison_df.columns.get_loc('error')] = True
    return comparison_df


df_test_data = pd.read_csv(test_data)
df_eval_data = pd.read_csv(eval_data)
# Transposed so that the columns of the parsed JSON become the index.
df_test_error_data = pd.read_json(test_error_data).T
df_eval_error_data = pd.read_json(eval_error_data).T

# Index labels of the error logs, used below as sample numbers.
eval_errors_list = df_eval_error_data.index.to_list()
test_errors_list = df_test_error_data.index.to_list()

flag_error_rows(df_eval_data, eval_errors_list)
flag_error_rows(df_test_data, test_errors_list)


In [4]:
# Number of index entries in the (transposed) eval error log.
len(eval_errors_list)

570

In [5]:
# Number of index entries in the (transposed) test error log.
len(test_errors_list)

914

# Overall performance

In [6]:
# Test data: 10k samples that were not used for training (slightly less clean).
# Single-frame mode: shows per-translator statistics for all rows.
display_stats(df_test_data)


Quality of Translations Versus Translation Bureau


Unnamed: 0,translator_name,mean,min,max,1%,10%,50%,90%,99%
6,opus_mt_finetuned,0.05,-0.58,0.72,-0.07,-0.02,0.03,0.14,0.34
5,opus_mt_base,0.05,-0.35,0.72,-0.07,-0.02,0.03,0.15,0.34
4,nllb_3b_base_researchonly,0.05,-0.93,0.71,-0.12,-0.02,0.04,0.15,0.35
3,mbart50_mmt_finetuned,0.05,-0.6,0.68,-0.07,-0.02,0.03,0.14,0.33
2,mbart50_mmt_base,0.05,-0.29,0.68,-0.07,-0.02,0.04,0.15,0.34
1,m2m100_418m_finetuned,0.05,-0.66,0.72,-0.08,-0.02,0.03,0.14,0.34
0,m2m100_418m_base,0.06,-0.72,0.72,-0.09,-0.02,0.04,0.16,0.35



Similarity to Translation Bureau Translation


Unnamed: 0,translator_name,mean,min,max,1%,10%,50%,90%,99%
6,opus_mt_finetuned,0.9,0.03,1.0,0.55,0.8,0.93,0.98,1.0
5,opus_mt_base,0.9,0.24,1.0,0.58,0.79,0.92,0.97,1.0
4,nllb_3b_base_researchonly,0.89,-0.05,1.0,0.51,0.78,0.91,0.97,0.99
3,mbart50_mmt_finetuned,0.9,0.01,1.0,0.54,0.79,0.92,0.98,1.0
2,mbart50_mmt_base,0.89,0.19,1.0,0.58,0.79,0.91,0.97,0.99
1,m2m100_418m_finetuned,0.89,0.01,1.0,0.52,0.79,0.92,0.97,1.0
0,m2m100_418m_base,0.87,0.07,1.0,0.52,0.76,0.89,0.96,0.99


In [7]:
# Eval data: 10k samples of training data (just from the eval set).
# Single-frame mode: shows per-translator statistics for all rows.
display_stats(df_eval_data)


Quality of Translations Versus Translation Bureau


Unnamed: 0,translator_name,mean,min,max,1%,10%,50%,90%,99%
6,opus_mt_finetuned,0.03,-0.62,0.34,-0.05,-0.01,0.02,0.07,0.16
5,opus_mt_base,0.03,-0.63,0.34,-0.05,-0.01,0.02,0.08,0.17
4,nllb_3b_base_researchonly,0.03,-0.88,0.34,-0.09,-0.01,0.02,0.08,0.16
3,mbart50_mmt_finetuned,0.03,-0.43,0.34,-0.05,-0.01,0.02,0.08,0.16
2,mbart50_mmt_base,0.03,-0.74,0.33,-0.06,-0.01,0.02,0.08,0.17
1,m2m100_418m_finetuned,0.03,-0.58,0.33,-0.05,-0.01,0.02,0.07,0.16
0,m2m100_418m_base,0.03,-0.74,0.32,-0.09,-0.02,0.02,0.09,0.17



Similarity to Translation Bureau Translation


Unnamed: 0,translator_name,mean,min,max,1%,10%,50%,90%,99%
6,opus_mt_finetuned,0.95,0.06,1.0,0.79,0.89,0.96,1.0,1.0
5,opus_mt_base,0.94,0.29,1.0,0.77,0.88,0.95,0.99,1.0
4,nllb_3b_base_researchonly,0.93,-0.03,1.0,0.74,0.87,0.95,0.99,1.0
3,mbart50_mmt_finetuned,0.95,0.06,1.0,0.78,0.88,0.96,0.99,1.0
2,mbart50_mmt_base,0.93,0.21,1.0,0.77,0.87,0.94,0.99,1.0
1,m2m100_418m_finetuned,0.94,0.06,1.0,0.77,0.88,0.95,0.99,1.0
0,m2m100_418m_base,0.92,0.17,1.0,0.73,0.85,0.93,0.98,1.0


# Compare quality with and without errors
### (samples with errors just return the default translation, with no find and replace)

In [8]:
print("Test Data - with errors (no find and replace)")
# Keep only the rows whose boolean `error` flag is True.
display_stats(df_test_data[df_test_data.error])

Test Data - with errors (no find and replace)

Quality of Translations Versus Translation Bureau


Unnamed: 0,translator_name,mean,min,max,1%,10%,50%,90%,99%
6,opus_mt_finetuned,0.06,-0.58,0.72,-0.21,-0.01,0.04,0.17,0.41
5,opus_mt_base,0.07,-0.35,0.72,-0.06,-0.01,0.05,0.2,0.43
4,nllb_3b_base_researchonly,0.06,-0.93,0.71,-0.66,-0.01,0.04,0.19,0.39
3,mbart50_mmt_finetuned,0.06,-0.6,0.68,-0.2,-0.01,0.04,0.17,0.38
2,mbart50_mmt_base,0.08,-0.12,0.68,-0.06,-0.0,0.05,0.2,0.43
1,m2m100_418m_finetuned,0.05,-0.6,0.72,-0.33,-0.02,0.04,0.17,0.4
0,m2m100_418m_base,0.08,-0.55,0.72,-0.07,-0.0,0.05,0.2,0.43



Similarity to Translation Bureau Translation


Unnamed: 0,translator_name,mean,min,max,1%,10%,50%,90%,99%
6,opus_mt_finetuned,0.88,0.03,1.0,0.37,0.77,0.92,0.97,0.99
5,opus_mt_base,0.88,0.24,1.0,0.48,0.75,0.91,0.97,0.99
4,nllb_3b_base_researchonly,0.86,-0.05,1.0,0.21,0.73,0.9,0.96,0.99
3,mbart50_mmt_finetuned,0.88,0.01,1.0,0.34,0.76,0.92,0.97,0.99
2,mbart50_mmt_base,0.88,0.19,0.99,0.51,0.75,0.91,0.96,0.99
1,m2m100_418m_finetuned,0.87,0.01,1.0,0.25,0.75,0.91,0.97,0.99
0,m2m100_418m_base,0.85,0.1,1.0,0.44,0.73,0.88,0.95,0.98


In [9]:
print("Test Data - without errors (preferential translations find and replaced)")
# `~` inverts the boolean `error` flag: keep only the rows that are not flagged.
display_stats(df_test_data[~df_test_data.error])

Test Data - without errors (preferential translations find and replaced)

Quality of Translations Versus Translation Bureau


Unnamed: 0,translator_name,mean,min,max,1%,10%,50%,90%,99%
6,opus_mt_finetuned,0.05,-0.48,0.64,-0.07,-0.02,0.03,0.13,0.32
5,opus_mt_base,0.05,-0.32,0.64,-0.07,-0.02,0.03,0.14,0.33
4,nllb_3b_base_researchonly,0.05,-0.86,0.64,-0.1,-0.02,0.03,0.15,0.33
3,mbart50_mmt_finetuned,0.05,-0.32,0.64,-0.07,-0.02,0.03,0.13,0.32
2,mbart50_mmt_base,0.05,-0.29,0.66,-0.07,-0.02,0.03,0.14,0.33
1,m2m100_418m_finetuned,0.05,-0.66,0.65,-0.08,-0.02,0.03,0.14,0.33
0,m2m100_418m_base,0.06,-0.72,0.67,-0.09,-0.02,0.04,0.15,0.34



Similarity to Translation Bureau Translation


Unnamed: 0,translator_name,mean,min,max,1%,10%,50%,90%,99%
6,opus_mt_finetuned,0.9,0.08,1.0,0.58,0.8,0.93,0.98,1.0
5,opus_mt_base,0.9,0.3,1.0,0.59,0.79,0.92,0.97,1.0
4,nllb_3b_base_researchonly,0.89,-0.02,1.0,0.53,0.78,0.91,0.97,1.0
3,mbart50_mmt_finetuned,0.9,0.12,1.0,0.57,0.8,0.93,0.98,1.0
2,mbart50_mmt_base,0.89,0.33,1.0,0.59,0.79,0.91,0.97,0.99
1,m2m100_418m_finetuned,0.9,0.09,1.0,0.56,0.79,0.92,0.97,1.0
0,m2m100_418m_base,0.87,0.07,1.0,0.53,0.77,0.89,0.96,0.99


In [10]:
print("Eval Data - with errors (no find and replace)")
# Keep only the rows whose boolean `error` flag is True.
display_stats(df_eval_data[df_eval_data.error])

Eval Data - with errors (no find and replace)

Quality of Translations Versus Translation Bureau


Unnamed: 0,translator_name,mean,min,max,1%,10%,50%,90%,99%
6,opus_mt_finetuned,0.03,-0.17,0.27,-0.08,-0.01,0.02,0.08,0.14
5,opus_mt_base,0.03,-0.14,0.27,-0.08,-0.0,0.02,0.08,0.16
4,nllb_3b_base_researchonly,0.02,-0.79,0.28,-0.18,-0.01,0.02,0.08,0.15
3,mbart50_mmt_finetuned,0.03,-0.17,0.27,-0.08,-0.01,0.02,0.08,0.14
2,mbart50_mmt_base,0.03,-0.13,0.27,-0.06,-0.01,0.03,0.09,0.17
1,m2m100_418m_finetuned,0.02,-0.41,0.26,-0.1,-0.01,0.02,0.07,0.15
0,m2m100_418m_base,0.03,-0.17,0.27,-0.09,-0.01,0.03,0.09,0.18



Similarity to Translation Bureau Translation


Unnamed: 0,translator_name,mean,min,max,1%,10%,50%,90%,99%
6,opus_mt_finetuned,0.94,0.73,1.0,0.81,0.88,0.95,0.99,1.0
5,opus_mt_base,0.93,0.73,1.0,0.79,0.87,0.94,0.98,1.0
4,nllb_3b_base_researchonly,0.92,-0.03,1.0,0.7,0.85,0.94,0.98,0.99
3,mbart50_mmt_finetuned,0.94,0.68,1.0,0.81,0.87,0.95,0.99,1.0
2,mbart50_mmt_base,0.93,0.73,1.0,0.79,0.86,0.94,0.98,1.0
1,m2m100_418m_finetuned,0.94,0.1,1.0,0.76,0.87,0.95,0.99,1.0
0,m2m100_418m_base,0.91,0.69,1.0,0.75,0.85,0.92,0.97,0.99


In [11]:
print("Eval Data - without errors (preferential translations find and replaced)")
# `~` inverts the boolean `error` flag: keep only the rows that are not flagged.
display_stats(df_eval_data[~df_eval_data.error])

Eval Data - without errors (preferential translations find and replaced)

Quality of Translations Versus Translation Bureau


Unnamed: 0,translator_name,mean,min,max,1%,10%,50%,90%,99%
6,opus_mt_finetuned,0.03,-0.62,0.34,-0.05,-0.01,0.02,0.07,0.16
5,opus_mt_base,0.03,-0.63,0.34,-0.05,-0.01,0.02,0.08,0.17
4,nllb_3b_base_researchonly,0.03,-0.88,0.34,-0.09,-0.01,0.02,0.08,0.16
3,mbart50_mmt_finetuned,0.03,-0.43,0.34,-0.05,-0.01,0.02,0.07,0.16
2,mbart50_mmt_base,0.03,-0.74,0.33,-0.06,-0.01,0.02,0.08,0.17
1,m2m100_418m_finetuned,0.03,-0.58,0.33,-0.05,-0.01,0.02,0.08,0.16
0,m2m100_418m_base,0.03,-0.74,0.32,-0.09,-0.02,0.02,0.09,0.17



Similarity to Translation Bureau Translation


Unnamed: 0,translator_name,mean,min,max,1%,10%,50%,90%,99%
6,opus_mt_finetuned,0.95,0.06,1.0,0.79,0.89,0.96,1.0,1.0
5,opus_mt_base,0.94,0.29,1.0,0.77,0.88,0.95,0.99,1.0
4,nllb_3b_base_researchonly,0.93,0.03,1.0,0.75,0.87,0.95,0.99,1.0
3,mbart50_mmt_finetuned,0.95,0.06,1.0,0.78,0.88,0.96,0.99,1.0
2,mbart50_mmt_base,0.93,0.21,1.0,0.77,0.87,0.94,0.99,1.0
1,m2m100_418m_finetuned,0.94,0.06,1.0,0.77,0.88,0.95,0.99,1.0
0,m2m100_418m_base,0.92,0.17,1.0,0.73,0.85,0.93,0.98,1.0


# Comparison of results with and without errors

In [14]:
print("Test Data, Including Preferential Translations Minus Without Preferential Translations")

# Two-frame mode: display_stats shows the statistics of the first frame minus
# those of the second, i.e. non-error rows minus error rows.
display_stats(df_test_data[~df_test_data.error], df_test_data[df_test_data.error])

Test Data, Including Preferential Translations Minus Without Preferential Translations

Difference in Quality Versus Translation Bureau


Unnamed: 0,translator_name,mean,min,max,1%,10%,50%,90%,99%
6,opus_mt_finetuned,-0.01,0.1,-0.07,0.14,-0.01,-0.01,-0.03,-0.09
5,opus_mt_base,-0.02,0.03,-0.07,-0.01,-0.01,-0.01,-0.06,-0.1
4,nllb_3b_base_researchonly,-0.01,0.07,-0.07,0.56,-0.01,-0.01,-0.04,-0.06
3,mbart50_mmt_finetuned,-0.01,0.28,-0.03,0.14,-0.01,-0.01,-0.03,-0.06
2,mbart50_mmt_base,-0.02,-0.17,-0.02,-0.02,-0.02,-0.02,-0.05,-0.1
1,m2m100_418m_finetuned,-0.01,-0.06,-0.06,0.25,-0.0,-0.01,-0.04,-0.08
0,m2m100_418m_base,-0.02,-0.17,-0.05,-0.03,-0.01,-0.01,-0.05,-0.09



Difference in Similarity to Translation Bureau Translation


Unnamed: 0,translator_name,mean,min,max,1%,10%,50%,90%,99%
6,opus_mt_finetuned,0.02,0.05,0.0,0.21,0.03,0.01,0.01,0.01
5,opus_mt_base,0.02,0.06,0.0,0.11,0.04,0.01,0.01,0.01
4,nllb_3b_base_researchonly,0.03,0.03,0.0,0.32,0.05,0.02,0.01,0.01
3,mbart50_mmt_finetuned,0.02,0.11,0.0,0.23,0.04,0.01,0.01,0.01
2,mbart50_mmt_base,0.02,0.14,0.01,0.08,0.03,0.01,0.01,0.01
1,m2m100_418m_finetuned,0.02,0.08,0.0,0.3,0.04,0.01,0.0,0.0
0,m2m100_418m_base,0.02,-0.03,0.0,0.09,0.04,0.01,0.01,0.01


In [15]:
print("Eval Data, Including Preferential Translations Minus Without Preferential Translations")

# Two-frame mode: display_stats shows the statistics of the first frame minus
# those of the second, i.e. non-error rows minus error rows.
display_stats(df_eval_data[~df_eval_data.error], df_eval_data[df_eval_data.error])

Eval Data, Including Preferential Translations Minus Without Preferential Translations

Difference in Quality Versus Translation Bureau


Unnamed: 0,translator_name,mean,min,max,1%,10%,50%,90%,99%
6,opus_mt_finetuned,-0.0,-0.45,0.07,0.04,0.0,-0.0,-0.0,0.02
5,opus_mt_base,-0.0,-0.49,0.07,0.03,-0.0,-0.0,-0.0,0.0
4,nllb_3b_base_researchonly,0.0,-0.08,0.07,0.09,-0.0,-0.0,0.0,0.01
3,mbart50_mmt_finetuned,-0.0,-0.27,0.07,0.03,-0.0,-0.0,-0.0,0.02
2,mbart50_mmt_base,-0.01,-0.61,0.06,0.01,-0.01,-0.01,-0.01,0.0
1,m2m100_418m_finetuned,0.0,-0.17,0.07,0.05,0.0,0.0,0.0,0.01
0,m2m100_418m_base,-0.0,-0.56,0.05,0.01,-0.01,-0.0,-0.0,-0.01



Difference in Similarity to Translation Bureau Translation


Unnamed: 0,translator_name,mean,min,max,1%,10%,50%,90%,99%
6,opus_mt_finetuned,0.01,-0.67,0.0,-0.02,0.01,0.01,0.01,0.0
5,opus_mt_base,0.0,-0.44,0.0,-0.02,0.0,0.01,0.01,0.0
4,nllb_3b_base_researchonly,0.01,0.07,0.0,0.05,0.01,0.01,0.01,0.01
3,mbart50_mmt_finetuned,0.01,-0.62,0.0,-0.03,0.01,0.01,0.0,0.0
2,mbart50_mmt_base,0.0,-0.52,0.0,-0.02,0.01,0.0,0.01,0.0
1,m2m100_418m_finetuned,0.0,-0.04,0.0,0.01,0.01,0.0,-0.0,-0.0
0,m2m100_418m_base,0.01,-0.52,0.0,-0.01,0.0,0.01,0.01,0.01
