In [1]:
import pandas as pd
import numpy as np

# Show every float in rendered tables with two decimal places.
pd.options.display.float_format = '{:.2f}'.format

In [2]:
def _summarize_by_translator(df, column, order_by, percentiles):
    """Return per-translator summary statistics of ``column``.

    The result has one row per ``translator_name`` with the columns
    ``translator_name, mean, min, max`` followed by one ``"<p>%"`` column per
    requested percentile, sorted descending by ``order_by``.
    """
    def make_quantile(q):
        # Bind q through a factory so each aggregator keeps its own quantile.
        return lambda values: values.quantile(q)

    named_aggs = {'mean': 'mean', 'min': 'min', 'max': 'max'}
    for q in percentiles:
        # ':g' labels 0.29 as "29%" and 0.999 as "99.9%"; int(q * 100) would
        # truncate them to "28%" and "99%" (the latter colliding with 0.99).
        named_aggs[f"{q * 100:g}%"] = make_quantile(q)

    return (
        df.groupby('translator_name')[column]
        .agg(**named_aggs)
        .reset_index()
        .sort_values(order_by, ascending=False)
    )


def display_stats(filepath, order_by="mean", percentiles=(0.01, 0.1, 0.5, 0.9, 0.99)):
    """Print and display per-translator score summaries for one comparison CSV.

    The CSV is expected to contain the columns ``translator_name``,
    ``cosine_similarity_vs_source``, ``cosine_similarity_original_translation``
    and ``cosine_similarity_vs_target``. Two tables are shown:

    * ``quality_vs_tb``: ``cosine_similarity_vs_source`` minus
      ``cosine_similarity_original_translation``.
    * ``similarity_to_old_translation``: the ``cosine_similarity_vs_target``
      column under a new name.

    Parameters
    ----------
    filepath : str or path-like
        CSV file to read with ``pd.read_csv``.
    order_by : str, default "mean"
        Summary column used to sort each table in descending order.
    percentiles : iterable of float, optional
        Quantiles (as fractions in [0, 1]) reported next to mean/min/max.

    Returns
    -------
    None
        Output is produced through ``print`` and ``display``. ``display`` is
        not imported here and is expected to be supplied by the notebook kernel.
    """
    df = pd.read_csv(filepath).rename(
        columns={"cosine_similarity_vs_target": "similarity_to_old_translation"}
    )
    df['quality_vs_tb'] = (
        df['cosine_similarity_vs_source'] - df['cosine_similarity_original_translation']
    )

    tables = (
        ("Quality of Translations Versus Translation Bureau", 'quality_vs_tb'),
        ("Similarity to Translation Bureau Translation", 'similarity_to_old_translation'),
    )
    for title, column in tables:
        print(f"\n{title}")
        display(_summarize_by_translator(df, column, order_by, percentiles))
    
# Comparison CSVs passed to display_stats in the cells that follow.
eval_data = "translation_comparison_20250819-0923.csv"  # 1k samples of training data (from the eval set)
test_data = "translation_comparison_20250819-0834.csv"  # 1k samples of testing data (not used for training)

In [3]:
# Testing data: 1k samples, not used for training and slightly less clean.
display_stats(test_data)


Quality of Translations Versus Translation Bureau


Unnamed: 0,translator_name,mean,min,max,1%,10%,50%,90%,99%
0,m2m100_418m_base,0.06,-0.65,0.46,-0.13,-0.02,0.04,0.16,0.37
2,mbart50_mmt_base,0.05,-0.24,0.49,-0.09,-0.02,0.04,0.16,0.36
5,opus_mt_base,0.05,-0.1,0.49,-0.07,-0.02,0.03,0.16,0.37
4,nllb_3b_base_researchonly,0.05,-0.65,0.49,-0.11,-0.02,0.03,0.16,0.37
3,mbart50_mmt_finetuned,0.05,-0.26,0.47,-0.07,-0.02,0.03,0.14,0.36
1,m2m100_418m_finetuned,0.05,-0.33,0.46,-0.08,-0.02,0.03,0.14,0.37
6,opus_mt_finetuned,0.05,-0.26,0.49,-0.07,-0.02,0.03,0.14,0.37



Similarity to Translation Bureau Translation


Unnamed: 0,translator_name,mean,min,max,1%,10%,50%,90%,99%
6,opus_mt_finetuned,0.9,0.03,1.0,0.55,0.79,0.93,0.98,1.0
3,mbart50_mmt_finetuned,0.9,0.04,1.0,0.49,0.79,0.93,0.98,1.0
5,opus_mt_base,0.9,0.42,1.0,0.57,0.78,0.92,0.97,0.99
1,m2m100_418m_finetuned,0.89,0.05,1.0,0.48,0.78,0.92,0.97,1.0
2,mbart50_mmt_base,0.89,0.28,1.0,0.56,0.78,0.91,0.97,0.99
4,nllb_3b_base_researchonly,0.89,0.23,1.0,0.5,0.77,0.92,0.97,0.99
0,m2m100_418m_base,0.87,0.11,1.0,0.5,0.75,0.9,0.96,0.99


In [4]:
# Training data: 1k samples, taken only from the eval set.
display_stats(eval_data)


Quality of Translations Versus Translation Bureau


Unnamed: 0,translator_name,mean,min,max,1%,10%,50%,90%,99%
2,mbart50_mmt_base,0.03,-0.08,0.31,-0.04,-0.01,0.02,0.08,0.16
0,m2m100_418m_base,0.03,-0.6,0.34,-0.06,-0.02,0.02,0.08,0.16
5,opus_mt_base,0.03,-0.1,0.28,-0.04,-0.01,0.02,0.08,0.16
4,nllb_3b_base_researchonly,0.02,-0.75,0.33,-0.15,-0.01,0.02,0.08,0.17
1,m2m100_418m_finetuned,0.02,-0.42,0.31,-0.06,-0.01,0.02,0.07,0.16
6,opus_mt_finetuned,0.02,-0.33,0.28,-0.03,-0.01,0.01,0.07,0.15
3,mbart50_mmt_finetuned,0.02,-0.52,0.28,-0.05,-0.01,0.01,0.07,0.15



Similarity to Translation Bureau Translation


Unnamed: 0,translator_name,mean,min,max,1%,10%,50%,90%,99%
6,opus_mt_finetuned,0.95,0.56,1.0,0.8,0.89,0.96,1.0,1.0
3,mbart50_mmt_finetuned,0.95,0.29,1.0,0.8,0.89,0.96,0.99,1.0
5,opus_mt_base,0.94,0.62,1.0,0.78,0.88,0.95,0.99,1.0
1,m2m100_418m_finetuned,0.94,0.39,1.0,0.77,0.89,0.95,0.99,1.0
2,mbart50_mmt_base,0.93,0.61,1.0,0.79,0.87,0.94,0.99,1.0
4,nllb_3b_base_researchonly,0.93,0.16,1.0,0.74,0.87,0.95,0.99,1.0
0,m2m100_418m_base,0.92,0.32,1.0,0.75,0.85,0.93,0.98,1.0


# Blind survey to evaluate fine-tuning

In [5]:
# Load the blind-survey responses used to evaluate fine-tuning.
df_survey = pd.read_csv('translation_quality_results.csv')

In [41]:
def transform_categorical_counts(df, columns_to_drop=None):
    """Count how often each distinct value occurs in every column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table of categorical values; it is not modified.
    columns_to_drop : list-like, optional
        Columns removed before counting. Nothing is dropped when this is
        ``None`` or empty.

    Returns
    -------
    pandas.DataFrame
        One row per remaining input column and one column per distinct
        non-null value (in ``sorted`` order, so the values must be mutually
        comparable). If any cell is null, a final ``'NaN'`` column holds the
        per-column null counts.
    """
    frame = df.drop(columns_to_drop, axis=1) if columns_to_drop else df
    column_names = frame.columns

    # Gather the distinct non-null values seen anywhere in the table.
    observed = set()
    for name in column_names:
        observed |= set(frame[name].dropna().unique())
    labels = sorted(observed)

    # Nulls get their own trailing bucket, keyed by the string 'NaN'.
    if frame.isnull().any().any():
        labels.append('NaN')

    def tally(label):
        # The 'NaN' label counts missing cells; every other label counts exact matches.
        if label == 'NaN':
            return [frame[name].isnull().sum() for name in column_names]
        return [(frame[name] == label).sum() for name in column_names]

    return pd.DataFrame({label: tally(label) for label in labels}, index=column_names)


def results_summary(df, weights=None):
    """Rank the rows of a rating-count table by a weighted score.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per item being rated and one column per rating label, holding
        counts. Columns without a weight are ignored; weighted labels that are
        missing from ``df`` count as zero.
    weights : dict, optional
        Mapping of rating label to weight. Defaults to
        ``{'bad': -2, 'good': 1, 'best': 2, 'worse': -1, 'better': 1}``.

    Returns
    -------
    pandas.DataFrame
        A single ``'Score'`` column indexed like ``df``, sorted from highest
        to lowest score.
    """
    if weights is None:
        weights = {'bad': -2, 'good': 1, 'best': 2, 'worse': -1, 'better': 1}
    weight_series = pd.Series(weights)

    # Keep only the weighted labels; absent ones are filled with a zero count.
    counts = df.reindex(columns=weight_series.index, fill_value=0)
    # skipna=False keeps a missing count visible as a NaN score rather than
    # silently treating it as zero.
    scores = counts.mul(weight_series, axis='columns').sum(axis=1, skipna=False)
    return scores.to_frame('Score').sort_values('Score', ascending=False)


In [42]:
# Tally the survey ratings per remaining column (after dropping 'source' and
# 'corpus_type'), show the counts, then show the weighted ranking.
df_survey_results = transform_categorical_counts(df_survey, ['source', 'corpus_type'])
display(df_survey_results)
display(results_summary(df_survey_results))

Unnamed: 0,bad,best,better,good,worse,NaN
translation_bureau,6,2,0,4,0,5
m2m100_418m_base,7,0,0,1,0,9
m2m100_418m_finetuned,1,1,1,3,0,11
mbart50_mmt_base,2,0,0,1,1,13
mbart50_mmt_finetuned,2,3,0,1,0,11
nllb_3b_base_researchonly,2,3,0,2,0,10
opus_mt_base,4,0,1,1,1,10
opus_mt_finetuned,1,1,0,1,0,14


Unnamed: 0,Score
m2m100_418m_finetuned,4
nllb_3b_base_researchonly,4
mbart50_mmt_finetuned,3
opus_mt_finetuned,1
translation_bureau,-4
mbart50_mmt_base,-4
opus_mt_base,-7
m2m100_418m_base,-13
