In [1]:
import pandas as pd

from create_jsonl import save_jsonl

# Load the cleaned dataset together with its precomputed feature columns.
# NOTE(review): unpickling can execute arbitrary code, so only load this
# file if it comes from a trusted source.
df_features = pd.read_pickle("df_with_features.pickle")

In [2]:
# Normal-distribution tail probabilities at 3, 2 and 1 standard deviations
# on either side of the centre; used as percentile cut points.
percentiles = [0.00135, 0.02275, 0.1587, 0.8413, 0.97725, 0.99865]

# Row labels that describe() produces for the median and the percentiles above.
columns = ['50%', '0.1%', '2.3%', '15.9%', '84.1%', '97.7%', '99.9%']

# One row per feature, one column per percentile.
df_features.describe(percentiles=percentiles).loc[columns].transpose()

Unnamed: 0,50%,0.1%,2.3%,15.9%,84.1%,97.7%,99.9%
similarity,0.847,0.701,0.71,0.757,0.922,0.963,0.982
len_ratio,1.214286,0.343222,0.747292,1.045455,1.417178,1.923077,3.927968
verb_ratio,1.0,0.25,0.5,0.75,1.5,3.0,5.0
noun_ratio,1.2,0.375,0.666667,1.0,1.75,4.0,12.0
entity_ratio,0.75,0.1,0.166667,0.333333,1.0,2.0,4.0
clause_ratio,1.0,0.2,0.5,1.0,1.5,3.0,6.0


In [3]:
# Rows with similarity < 0.757 (the 15.9th percentile, i.e. the "1 stdev
# below the median" point) are excluded outright in a later cell.
#
# Each dict below maps a feature to its accepted (lower, upper) range.
# NOTE(review): the bounds are the percentile values rounded to two decimals,
# so some sit slightly inside the percentile (e.g. 0.17 vs 0.166667, 0.67 vs
# 0.666667, 0.38 vs 0.375) -- confirm that rows exactly at the percentile are
# meant to fall outside the range.

# Band s1 -- similarity < 0.85 (the median): 1-stdev percentile bounds,
# except len_ratio, which deliberately uses the wider 2-stdev bounds.
outlier_criteria_s1 = dict(
    len_ratio=(0.75, 1.92),
    verb_ratio=(0.75, 1.50),
    noun_ratio=(1.00, 1.75),
    entity_ratio=(0.33, 1.00),
    clause_ratio=(1.00, 1.50),
)

# Band s2 -- similarity < 0.92 (1 stdev above the median): 2-stdev bounds.
outlier_criteria_s2 = dict(
    len_ratio=(0.75, 1.92),
    verb_ratio=(0.50, 3.00),
    noun_ratio=(0.67, 4.00),
    entity_ratio=(0.17, 2.00),
    clause_ratio=(0.50, 3.00),
)

# Band s3 -- every higher similarity: 3-stdev bounds.
outlier_criteria_s3 = dict(
    len_ratio=(0.34, 3.93),
    verb_ratio=(0.25, 5.00),
    noun_ratio=(0.38, 12.00),
    entity_ratio=(0.10, 4.00),
    clause_ratio=(0.20, 6.00),
)

In [4]:
# For every band and feature, report how many rows of the whole dataset fall
# below the lower bound and above the upper bound (ignoring similarity bands).
for bounds_by_feature in (outlier_criteria_s1, outlier_criteria_s2, outlier_criteria_s3):
    print()  # blank line between bands
    for feature, (lower, upper) in bounds_by_feature.items():
        n_below = int((df_features[feature] < lower).sum())
        n_above = int((df_features[feature] > upper).sum())
        report_line = (
            f"{feature:<20}"
            f"number below {lower}:\t{n_below:<10}"
            "\t\t"
            f"number above {upper}:\t{n_above:<10}"
        )
        print(report_line)


len_ratio           number below 0.75:	17869     		number above 1.92:	17871     
verb_ratio          number below 0.75:	107322    		number above 1.5:	108955    
noun_ratio          number below 1.0:	104712    		number above 1.75:	116633    
entity_ratio        number below 0.33:	79250     		number above 1.0:	84643     
clause_ratio        number below 1.0:	120154    		number above 1.5:	107848    

len_ratio           number below 0.75:	17869     		number above 1.92:	17871     
verb_ratio          number below 0.5:	12007     		number above 3.0:	6404      
noun_ratio          number below 0.67:	17769     		number above 4.0:	14641     
entity_ratio        number below 0.17:	19930     		number above 2.0:	11003     
clause_ratio        number below 0.5:	16753     		number above 3.0:	8048      

len_ratio           number below 0.34:	1024      		number above 3.93:	1047      
verb_ratio          number below 0.25:	599       		number above 5.0:	354       
noun_ratio          number below 0.38

In [5]:
# Rows whose similarity is below 0.757 are flagged regardless of other features.
df_features["exclude_low_similarity"] = df_features["similarity"].lt(0.757)

# Three mutually exclusive similarity bands: [.., 0.85), [0.85, 0.92), [0.92, ..].
s1_mask = df_features["similarity"].lt(0.85)
s2_mask = df_features["similarity"].ge(0.85) & df_features["similarity"].lt(0.92)
s3_mask = df_features["similarity"].ge(0.92)

# Per-band feature bounds and the row mask that selects each band.
criteria = {
    "s1": outlier_criteria_s1,
    "s2": outlier_criteria_s2,
    "s3": outlier_criteria_s3,
}
band_masks = {"s1": s1_mask, "s2": s2_mask, "s3": s3_mask}

for feature in outlier_criteria_s1:
    col_name = f"exclude_{feature}"
    # Start from "keep"; rows that belong to no band stay unflagged.
    df_features[col_name] = False

    for band, in_band in band_masks.items():
        lower, upper = criteria[band][feature]
        # between() is inclusive on both ends; anything outside is flagged.
        within_bounds = df_features.loc[in_band, feature].between(lower, upper)
        df_features.loc[in_band, col_name] = ~within_bounds


In [6]:
# Every per-criterion flag column that feeds the final decision.
exclusion_columns = [
    "exclude_low_similarity",
    "exclude_len_ratio",
    "exclude_verb_ratio",
    "exclude_noun_ratio",
    "exclude_entity_ratio",
    "exclude_clause_ratio",
]

# A row is excluded as soon as any single criterion flags it.
df_features["exclude"] = df_features.loc[:, exclusion_columns].any(axis="columns")

In [7]:
# Number of rows flagged by each criterion, and by the combined "exclude" flag.

df_features[[*exclusion_columns, "exclude"]].sum()

exclude_low_similarity    121566
exclude_len_ratio          35269
exclude_verb_ratio        125555
exclude_noun_ratio        141034
exclude_entity_ratio       99526
exclude_clause_ratio      131022
exclude                   346909
dtype: int64

In [8]:
# How many translation pairs survive the filter, and what share of the
# dataset is that?

kept_count = int((~df_features["exclude"]).sum())
kept_count, kept_count / df_features.shape[0]

(432042, 0.5546459276642561)

# Make sure the bad ones are caught

In [9]:
# Spot-check: draw one random row (re-run the cell for another) and show it
# vertically so every column is visible.

df_features.sample(n=1).transpose()

Unnamed: 0,772446
pub_number,SSR 2003/017
fr,Ce problème est lié principalement à la limite...
en,This problem is mainly related to the 42 cm li...
similarity,0.767
len_ratio,1.133929
verb_ratio,1.0
noun_ratio,1.0
entity_ratio,0.5
clause_ratio,1.0
exclude_low_similarity,False


In [10]:
# A good-looking pair that is nevertheless excluded: verb_ratio and
# entity_ratio both fall below the s1 lower bounds.
# Losing some good pairs may be acceptable if overall quality improves.

df_features.loc[485273, :]

pub_number                                                     SAR 2009/027
fr                        Selon les résultats des deux relevés post-sais...
en                        Based on the results from the two postseason s...
similarity                                                            0.813
len_ratio                                                          0.988571
verb_ratio                                                         0.666667
noun_ratio                                                            1.125
entity_ratio                                                           0.25
clause_ratio                                                            1.0
exclude_low_similarity                                                False
exclude_len_ratio                                                     False
exclude_verb_ratio                                                     True
exclude_noun_ratio                                                    False
exclude_enti

In [11]:
# A misaligned pair that len_ratio only narrowly misses (0.771 vs. the 0.75
# lower bound); the original note says it is not caught unless the lower
# limits are tweaked.
# NOTE(review): with similarity 0.818 (band s1) and entity_ratio 0.2 < 0.33,
# the current s1 bounds appear to flag this row via entity_ratio -- the
# original note may predate the current thresholds; confirm.

df_features.loc[498816, :]

pub_number                                                     SAR 2010/020
fr                        Sources d incertitude Il existe de l incertitu...
en                        Newfoundland and Labrador Region 2HJ3KLNOPs4R ...
similarity                                                            0.818
len_ratio                                                          0.771084
verb_ratio                                                              1.0
noun_ratio                                                            1.125
entity_ratio                                                            0.2
clause_ratio                                                            1.0
exclude_low_similarity                                                False
exclude_len_ratio                                                     False
exclude_verb_ratio                                                    False
exclude_noun_ratio                                                    False
exclude_enti

In [12]:
# A good pair that would be lost if 0.33 were used as the entity_ratio lower
# limit for its (high-similarity) band.

df_features.loc[366969, :]

pub_number                                                     RES 2022/027
fr                        De 2018 à 2021, plus de 80 de la biomasse cumu...
en                        From 2018 to 2021, more than 80 of the cumulat...
similarity                                                            0.972
len_ratio                                                          0.989796
verb_ratio                                                              1.0
noun_ratio                                                         1.285714
entity_ratio                                                           0.25
clause_ratio                                                            2.0
exclude_low_similarity                                                False
exclude_len_ratio                                                     False
exclude_verb_ratio                                                    False
exclude_noun_ratio                                                    False
exclude_enti

In [13]:
# Junk data (table residue); several exclusion criteria flag it at once.

df_features.loc[574405, :]

pub_number                                                     SAR 2017/033
fr                        Saison Permis TAC (t) Débarquements (t) CPUE (...
en                        Season Licenses TAC (t) Landings (t) CPUE (kg ...
similarity                                                            0.781
len_ratio                                                          1.101205
verb_ratio                                                              3.0
noun_ratio                                                         4.636364
entity_ratio                                                       0.115385
clause_ratio                                                            7.5
exclude_low_similarity                                                False
exclude_len_ratio                                                     False
exclude_verb_ratio                                                     True
exclude_noun_ratio                                                     True
exclude_enti

# OK, this looks good. Let's do it!

In [15]:
# Keep only the rows that no criterion flagged; copy so the result is
# independent of df_features.
df_clean = df_features.loc[~df_features["exclude"]].copy()

# Write the surviving rows out as the training set.
save_jsonl(df_clean, "training_data.jsonl")

In [16]:
import os

# Sanity-check the written file: size on disk and number of JSONL records.
path = "training_data.jsonl"
size_kb = os.path.getsize(path) / 1024

# Count lines inside a context manager so the file handle is closed
# deterministically instead of being left to the garbage collector.
with open(path, "r", encoding="utf-8") as jsonl_file:
    line_count = sum(1 for _ in jsonl_file)

print(f"{size_kb:.2f} KB, {line_count} lines")

304540.49 KB, 864084 lines
