In [1]:
import pandas as pd
# Render floats with two decimals in pandas' default DataFrame/Series display.
# NOTE(review): the tables rendered through `.style` further down show six
# decimals, so this option does not appear to reach Styler output — confirm
# whether two-decimal output was intended there.
pd.set_option('display.float_format', '{:.2f}'.format)

In [2]:
eval_data = "translation_comparison_20250819-0923.csv"
test_data = "translation_comparison_20250819-0834.csv"


def _load_comparison(csv_path):
    """Read one translation-comparison CSV and add the columns the summaries use.

    The returned frame has ``cosine_similarity_vs_target`` renamed to
    ``similarity_to_old_translation`` and gains ``quality_vs_tb``, computed as
    ``cosine_similarity_vs_source - cosine_similarity_original_translation``.
    NOTE(review): presumably the "original translation" is the Translation
    Bureau one that the report headings refer to — confirm against the CSV producer.

    Args:
        csv_path: Path of the CSV file to read.

    Returns:
        A new DataFrame; nothing is modified in place.

    Raises:
        KeyError: If ``cosine_similarity_vs_target`` (rename) or either of the
            columns used for ``quality_vs_tb`` is missing from the file.
    """
    return (
        pd.read_csv(csv_path)
        # errors="raise": fail here with a clear KeyError instead of silently
        # skipping the rename and breaking later when the new name is looked up.
        .rename(
            columns={"cosine_similarity_vs_target": "similarity_to_old_translation"},
            errors="raise",
        )
        .assign(
            quality_vs_tb=lambda df: df['cosine_similarity_vs_source']
            - df['cosine_similarity_original_translation']
        )
    )


# Both files go through the same preparation so their summaries are comparable.
df_eval = _load_comparison(eval_data)
df_test = _load_comparison(test_data)

In [3]:
def make_quantile(q):
    """Return an aggregation callable that computes the ``q`` quantile of its input.

    Args:
        q: Quantile to compute; passed unchanged to ``.quantile``.

    Returns:
        A one-argument lambda that calls ``.quantile(q)`` on its argument.
    """
    # NOTE(review): kept as a lambda on purpose — pandas appears to rely on the
    # ``<lambda>`` name to tell apart several anonymous functions passed in one
    # ``agg`` list; confirm before converting this to a named inner function.
    return lambda x: x.quantile(q)


percentiles = [0.01, 0.1, 0.5, 0.9, 0.99]
# agg_funcs and agg_names are parallel lists: position i of agg_names is the
# display label for the aggregation at position i of agg_funcs.
agg_funcs = ['mean', 'min', 'max'] + [make_quantile(q) for q in percentiles]
# Format with ``:g`` rather than truncating with int(): 0.29 * 100 evaluates to
# 28.999999999999996, which int() would label "28%". ``:g`` also keeps
# fractional percentiles readable (0.995 -> "99.5%").
agg_names = ['mean', 'min', 'max'] + [f"{q * 100:g}%" for q in percentiles]


In [4]:
# Summary tables for the eval frame: after the transpose each translator is a
# column and each statistic is a row.
print("\nTraining Data (eval set)\n")

eval_sections = (
    ("\nQuality of Translations Versus Translation Bureau", 'quality_vs_tb'),
    ("\nSimilarity to Translation Bureau Translation", 'similarity_to_old_translation'),
)
for heading, metric in eval_sections:
    print(heading)
    per_translator = df_eval.groupby('translator_name')[metric].agg(agg_funcs).reset_index()
    # Relabel positionally: the first column is the group key, the rest follow
    # the order of agg_funcs / agg_names.
    per_translator = per_translator.set_axis(['translator_name'] + agg_names, axis=1)
    # After the transpose the column labels are just the integer positions
    # left by reset_index, so hide them.
    display(per_translator.T.style.hide(axis='columns'))


Training Data (eval set)


Quality of Translations Versus Translation Bureau


0,1,2,3,4,5,6,7
translator_name,m2m100_418m_base,m2m100_418m_finetuned,mbart50_mmt_base,mbart50_mmt_finetuned,nllb_3b_base_researchonly,opus_mt_base,opus_mt_finetuned
mean,0.028290,0.023899,0.029209,0.022468,0.024089,0.027343,0.023318
min,-0.599434,-0.420567,-0.081033,-0.522761,-0.746219,-0.096407,-0.329822
max,0.337621,0.309711,0.308057,0.281595,0.332338,0.281595,0.281595
1%,-0.059832,-0.055921,-0.042987,-0.051393,-0.146819,-0.038728,-0.028477
10%,-0.016544,-0.007867,-0.010263,-0.007987,-0.010246,-0.007999,-0.007688
50%,0.022854,0.015671,0.021990,0.014051,0.021467,0.019244,0.013205
90%,0.084692,0.072845,0.079452,0.071655,0.079312,0.075529,0.069432
99%,0.162637,0.155740,0.164893,0.145831,0.172822,0.160418,0.151531



Similarity to Translation Bureau Translation


0,1,2,3,4,5,6,7
translator_name,m2m100_418m_base,m2m100_418m_finetuned,mbart50_mmt_base,mbart50_mmt_finetuned,nllb_3b_base_researchonly,opus_mt_base,opus_mt_finetuned
mean,0.920743,0.942053,0.934923,0.947477,0.933325,0.942708,0.951604
min,0.324202,0.390870,0.609370,0.292137,0.159907,0.616460,0.560243
max,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000
1%,0.754482,0.765924,0.785861,0.798399,0.739324,0.782404,0.804034
10%,0.850013,0.885721,0.874760,0.892231,0.870371,0.881587,0.894310
50%,0.931029,0.953664,0.944287,0.960233,0.948589,0.952244,0.962983
90%,0.978290,0.993363,0.985691,0.994756,0.988396,0.990750,0.996350
99%,0.997477,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000


In [5]:
# Summary tables for the test frame: after the transpose each translator is a
# column and each statistic is a row.
print("\nTest Data\n")

test_sections = (
    ("\nQuality of Translations Versus Translation Bureau", 'quality_vs_tb'),
    ("\nSimilarity to Translation Bureau Translation", 'similarity_to_old_translation'),
)
for heading, metric in test_sections:
    print(heading)
    per_translator = df_test.groupby('translator_name')[metric].agg(agg_funcs).reset_index()
    # Relabel positionally: the first column is the group key, the rest follow
    # the order of agg_funcs / agg_names.
    per_translator = per_translator.set_axis(['translator_name'] + agg_names, axis=1)
    # After the transpose the column labels are just the integer positions
    # left by reset_index, so hide them.
    display(per_translator.T.style.hide(axis='columns'))


Test Data


Quality of Translations Versus Translation Bureau


0,1,2,3,4,5,6,7
translator_name,m2m100_418m_base,m2m100_418m_finetuned,mbart50_mmt_base,mbart50_mmt_finetuned,nllb_3b_base_researchonly,opus_mt_base,opus_mt_finetuned
mean,0.055612,0.046195,0.052851,0.046298,0.051031,0.052068,0.045759
min,-0.654153,-0.328275,-0.241490,-0.260617,-0.650844,-0.101257,-0.256843
max,0.464943,0.459392,0.485087,0.468772,0.485087,0.486341,0.486341
1%,-0.134388,-0.075936,-0.089906,-0.073479,-0.113782,-0.071131,-0.066776
10%,-0.016675,-0.020564,-0.022188,-0.021329,-0.017966,-0.019033,-0.022032
50%,0.042046,0.028865,0.035405,0.028552,0.034847,0.031875,0.026217
90%,0.158027,0.143633,0.155206,0.143836,0.155810,0.156204,0.139519
99%,0.368444,0.372749,0.364382,0.358291,0.368420,0.369999,0.365299



Similarity to Translation Bureau Translation


0,1,2,3,4,5,6,7
translator_name,m2m100_418m_base,m2m100_418m_finetuned,mbart50_mmt_base,mbart50_mmt_finetuned,nllb_3b_base_researchonly,opus_mt_base,opus_mt_finetuned
mean,0.868412,0.894506,0.888973,0.898612,0.888326,0.896825,0.902156
min,0.112431,0.046630,0.281137,0.038575,0.228907,0.416430,0.029262
max,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000
1%,0.502442,0.481455,0.561041,0.490798,0.501639,0.569672,0.546273
10%,0.747309,0.783257,0.775022,0.785974,0.769137,0.784082,0.786901
50%,0.895265,0.923191,0.913791,0.927203,0.916959,0.921740,0.931577
90%,0.960510,0.973636,0.970100,0.976684,0.972134,0.974451,0.979817
99%,0.988335,0.995551,0.993126,0.996042,0.994659,0.993700,0.997240


In [7]:
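# Opus is the best (opus_mt_finetuned has the highest mean similarity to the
# Translation Bureau translation on both the eval and test sets), but all models are good.

# How do I choose a model if they all perform well and are not very different?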
# OPTIONS:
# * Just use opus.
# * Fine-tune all of them, then recheck performance.
# * Pick the smallest model.
# * Best worst-case values: highest min / 1st percentile.
# * Best overall values (even if they're very close).