In [1]:
import re
import pandas as pd

from create_jsonl import save_jsonl

# pandas formatting
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)
pd.set_option('max_colwidth', 100)
pd.set_option('display.float_format', '{:.2f}'.format)

# the same dataset after cleaning
df_features = pd.read_pickle("df_with_more_features.pickle")

percentiles = [0.00135, 0.02275, 0.1587, 0.8413, 0.97725, 0.99865]
columns = ['50%', '0.1%', '2.3%', '15.9%', '84.1%', '97.7%', '99.9%']

# Exclusions

In [2]:
df_features.describe(percentiles).loc[columns].T

Unnamed: 0,50%,0.1%,2.3%,15.9%,84.1%,97.7%,99.9%
similarity,0.85,0.7,0.71,0.76,0.92,0.96,0.98
len_ratio,1.21,0.34,0.75,1.05,1.42,1.92,3.93
verb_ratio,1.0,0.25,0.5,0.75,1.5,3.0,5.0
noun_ratio,1.2,0.38,0.67,1.0,1.75,4.0,12.0
entity_ratio,0.75,0.1,0.17,0.33,1.0,2.0,4.0
clause_ratio,1.0,0.2,0.5,1.0,1.5,3.0,6.0
one_char_words_fr,0.0,0.0,0.0,0.0,1.0,3.0,11.0
one_char_words_en,0.0,0.0,0.0,0.0,1.0,3.0,11.0
potential_fr_accent_issues,0.0,0.0,0.0,0.0,0.0,1.0,2.0


In [3]:
# exclude similarity < 0.757 (1 stdev below median)

# when similarity < 0.85 (median)
outlier_criteria_s1 = {
    "len_ratio": (0.75, 1.92),  # override with 2 stdev len ratios
    "verb_ratio": (0.75, 1.50),
    "noun_ratio": (1.00, 1.75),
    "entity_ratio": (0.33, 1.00),
    "clause_ratio": (1.00, 1.50),
    "one_char_words_fr": (0.0, 1.0),
    "one_char_words_en": (0.0, 1.0),
    "potential_fr_accent_issues": (0.0, 0.1),  # exclude anything above 0
}

# when similarity < 0.92 (1 stdev above median)
outlier_criteria_s2 = {
    "len_ratio": (0.75, 1.92),
    "verb_ratio": (0.50, 3.00),
    "noun_ratio": (0.67, 4.00),
    "entity_ratio": (0.17, 2.00),
    "clause_ratio": (0.50, 3.00),
    "one_char_words_fr": (0.0, 3.0),
    "one_char_words_en": (0.0, 3.0),
    "potential_fr_accent_issues": (0.0, 0.1),  # exclude anything above 0
}

# all higher similarities
outlier_criteria_s3 = {
    "len_ratio": (0.34, 3.93),
    "verb_ratio": (0.25, 5.00),
    "noun_ratio": (0.38, 12.00),
    "entity_ratio": (0.10, 4.00),
    "clause_ratio": (0.20, 6.00),
    "one_char_words_fr": (0.0, 11.0),
    "one_char_words_en": (0.0, 11.0),
    "potential_fr_accent_issues": (0.0, 0.1),  # exclude anything above 0
}

In [4]:
for outlier_criteria in [outlier_criteria_s1, outlier_criteria_s2, outlier_criteria_s3]:
    print()
    for feature, (x, y) in outlier_criteria.items():
        feature.replace('potential_fr_accent_issues', 'fr_accent_issues')
        print(f"{feature:<20}", end="")
        print(f"number below {x}:\t{df_features[df_features[feature] < x].shape[0]:<10}", end="\t\t")
        print(f"number above {y}:\t{df_features[df_features[feature] > y].shape[0]:<10}")


len_ratio           number below 0.75:	17869     		number above 1.92:	17871     
verb_ratio          number below 0.75:	107322    		number above 1.5:	108955    
noun_ratio          number below 1.0:	104712    		number above 1.75:	116633    
entity_ratio        number below 0.33:	79250     		number above 1.0:	84643     
clause_ratio        number below 1.0:	120154    		number above 1.5:	107848    
one_char_words_fr   number below 0.0:	0         		number above 1.0:	59487     
one_char_words_en   number below 0.0:	0         		number above 1.0:	52257     
potential_fr_accent_issuesnumber below 0.0:	0         		number above 0.1:	22553     

len_ratio           number below 0.75:	17869     		number above 1.92:	17871     
verb_ratio          number below 0.5:	12007     		number above 3.0:	6404      
noun_ratio          number below 0.67:	17769     		number above 4.0:	14641     
entity_ratio        number below 0.17:	19930     		number above 2.0:	11003     
clause_ratio        number below 0.

In [5]:
df_features["exclude_low_similarity"] = df_features["similarity"] < 0.757

s1_mask = df_features["similarity"] < 0.85
s2_mask = (df_features["similarity"] >= 0.85) & (df_features["similarity"] < 0.92)
s3_mask = df_features["similarity"] >= 0.92
criteria = {
    "s1": outlier_criteria_s1,
    "s2": outlier_criteria_s2,
    "s3": outlier_criteria_s3,
}

for feature in outlier_criteria_s1:
    col_name = f"exclude_{feature}"
    df_features[col_name] = False

    low1, high1 = outlier_criteria_s1[feature]
    low2, high2 = outlier_criteria_s2[feature]
    low3, high3 = outlier_criteria_s3[feature]

    df_features.loc[s1_mask, col_name] = ~df_features.loc[s1_mask, feature].between(low1, high1)
    df_features.loc[s2_mask, col_name] = ~df_features.loc[s2_mask, feature].between(low2, high2)
    df_features.loc[s3_mask, col_name] = ~df_features.loc[s3_mask, feature].between(low3, high3)


# exclude figure and table text

In [6]:
def analyze_text_for_figrefs(text, language='en'):
    result = {
        'has_trailing_numbers': False,
        'has_parenthetical_numbers': False,
        'has_figure_references': False,
        'has_repeated_punctuation': False,
        'exclude_figtext': False
    }

    # Check for trailing numbers
    if re.search(r'\s+\d+\s*$', text):
        result['has_trailing_numbers'] = True

    # Check for parenthetical numbers
    if re.search(r'\s+\(\d+\)\s*$', text):
        result['has_parenthetical_numbers'] = True

    # Check for figure/table references (with French support)
    if language == 'fr':
        # French patterns: Figure, Tableau, Fig., Tab.
        pattern = r'\s*(?:Figure|Tableau|Fig\.?|Tab\.?)\s+\d+.*$'
    else:
        # English patterns: Figure, Table, Fig., Tab.
        pattern = r'\s*(?:Figure|Table|Fig\.?|Tab\.?)\s+\d+.*$'

    if re.search(pattern, text, flags=re.IGNORECASE):
        result['has_figure_references'] = True

    # Check for repeated punctuation
    if re.search(r'[.!?]{2,}$', text):
        result['has_repeated_punctuation'] = True

    # Set exclude flag if any issue found
    result['exclude_figtext'] = any([
        result['has_figure_references'],
        result['has_trailing_numbers'],
        result['has_parenthetical_numbers'],
        result['has_repeated_punctuation']
    ])

    return result

def process_bilingual_dataframe(df, text_en_column='text_en', text_fr_column='text_fr'):
    en_results = df[text_en_column].apply(lambda x: analyze_text_for_figrefs(x, language='en'))
    fr_results = df[text_fr_column].apply(lambda x: analyze_text_for_figrefs(x, language='fr'))

    df['has_trailing_numbers_en'] = en_results.apply(lambda x: x['has_trailing_numbers'])
    df['has_parenthetical_numbers_en'] = en_results.apply(lambda x: x['has_parenthetical_numbers'])
    df['has_figure_references_en'] = en_results.apply(lambda x: x['has_figure_references'])
    df['has_repeated_punctuation_en'] = en_results.apply(lambda x: x['has_repeated_punctuation'])
    df['exclude_figtext_en'] = en_results.apply(lambda x: x['exclude_figtext'])

    df['has_trailing_numbers_fr'] = fr_results.apply(lambda x: x['has_trailing_numbers'])
    df['has_parenthetical_numbers_fr'] = fr_results.apply(lambda x: x['has_parenthetical_numbers'])
    df['has_figure_references_fr'] = fr_results.apply(lambda x: x['has_figure_references'])
    df['has_repeated_punctuation_fr'] = fr_results.apply(lambda x: x['has_repeated_punctuation'])
    df['exclude_figtext_fr'] = fr_results.apply(lambda x: x['exclude_figtext'])

    df['exclude_figtext'] = df['exclude_figtext_en'] | df['exclude_figtext_fr']

    return df

def process_bilingual_dataframe_simple(df, text_en_column='en', text_fr_column='fr'):
    # Analyze both columns
    en_results = df[text_en_column].apply(lambda x: analyze_text_for_figrefs(x, language='en'))
    fr_results = df[text_fr_column].apply(lambda x: analyze_text_for_figrefs(x, language='fr'))

    # Add only the combined flag
    df['exclude_figtext'] = (
        en_results.apply(lambda x: x['exclude_figtext']) |
        fr_results.apply(lambda x: x['exclude_figtext'])
    )

    return df

In [7]:
df_features = process_bilingual_dataframe_simple(df_features)

In [8]:
df_features[df_features.exclude_figtext].shape[0]

141551

# how many sentences have years or months in them?
### exclude these as well?

In [9]:
df_features['has_date_refs'] = df_features[['en', 'fr']].apply(lambda x: x.astype(str).str.contains(r'\b(?:19|20)\d{2}\b|(?:January|February|March|April|May|June|July|August|September|October|November|December|janvier|février|mars|avril|mai|juin|juillet|août|septembre|octobre|novembre|décembre|Jan|Feb|Mar|Apr|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\b', case=False, regex=True).any(), axis=1)

In [10]:
df_features[df_features['has_date_refs']].shape[0]

262138

# exclude all (270k is plenty of training data)

In [11]:
exclusion_columns = [
    'exclude_low_similarity',
    'exclude_len_ratio',
    'exclude_verb_ratio',
    'exclude_noun_ratio',
    'exclude_entity_ratio',
    'exclude_clause_ratio',
    'exclude_one_char_words_fr',
    'exclude_one_char_words_en',
    'exclude_figtext',
    'has_date_refs',
    'potential_fr_accent_issues',
]
df_features["exclude"] = df_features[exclusion_columns].any(axis=1)

In [12]:
# how many exclusions does this lead to?

df_features[exclusion_columns + ['exclude']].sum()

exclude_low_similarity        121566
exclude_len_ratio              35269
exclude_verb_ratio            125555
exclude_noun_ratio            141034
exclude_entity_ratio           99526
exclude_clause_ratio          131022
exclude_one_char_words_fr      37056
exclude_one_char_words_en      32075
exclude_figtext               141551
has_date_refs                 262138
potential_fr_accent_issues     24079
exclude                       517944
dtype: int64

In [13]:
# how many translations are left after this cleaning? (half a million)

df_features[~df_features["exclude"]].shape[0], df_features[~df_features["exclude"]].shape[0] / df_features.shape[0]

(261007, 0.3350749918801054)

# make sure bad ones are caught

In [14]:
# check a few at random

df_features.sample().T

Unnamed: 0,457040
pub_number,RES 2024/011
fr,"L inventaire des macronutriments, principalement le silicate et le nitrate, sont des facteurs se..."
en,"The inventory of macronutrients, principally silicate and nitrate, are important secondary facto..."
similarity,0.94
len_ratio,1.12
verb_ratio,1.00
noun_ratio,1.00
entity_ratio,1.00
clause_ratio,1.00
one_char_words_fr,0


In [15]:
# this one looks good but will be excluded based on verb ratio and entity ratio 
#  maybe false negatives are OK if quality improves

df_features.loc[485273]

pub_number                                                                                                                   SAR 2009/027
fr                                    Selon les résultats des deux relevés post-saison, l'indice d'abondance des crabes commerciaux re...
en                                    Based on the results from the two postseason surveys, the abundance index of commercial crabs re...
similarity                                                                                                                           0.81
len_ratio                                                                                                                            0.99
verb_ratio                                                                                                                           0.67
noun_ratio                                                                                                                           1.12
entity_ratio                      

In [16]:
# this one is almost caught by len_ratio and entity_ratio, but not quite unless lower limits are tweaked

df_features.loc[498816]

pub_number                                                                                                                   SAR 2010/020
fr                                    Sources d'incertitude Il existe de l'incertitude quant aux effets que des changements apportés à...
en                                    Newfoundland and Labrador Region 2HJ3KLNOPs4R Snow Crab 40 Sources of Uncertainty There is uncer...
similarity                                                                                                                           0.82
len_ratio                                                                                                                            0.77
verb_ratio                                                                                                                           1.00
noun_ratio                                                                                                                           1.12
entity_ratio                      

In [17]:
# this one is good, but is excluded by entity_ratio if 0.33 is used as a lower limit

df_features.loc[366969]

pub_number                                                                                                                   RES 2022/027
fr                                    De 2018 à 2021, plus de 80 de la biomasse cumulée était retrouvée entre 164 et 302 m'à des tempé...
en                                    From 2018 to 2021, more than 80 of the cumulative biomass was found between 164 and 302 m at bot...
similarity                                                                                                                           0.97
len_ratio                                                                                                                            0.99
verb_ratio                                                                                                                           1.00
noun_ratio                                                                                                                           1.29
entity_ratio                      

In [18]:
# this one is junk data, it is caught by multiple exclusion criteria

df_features.loc[574405]

pub_number                                                                                                                   SAR 2017/033
fr                                    Saison Permis TAC (t) Débarquements (t) CPUE (kg casier levé) Effort (x 1 000 casiers levés) 200...
en                                    Season Licenses TAC (t) Landings (t) CPUE (kg trap haul) Effort (x1000 trap hauls) 2005 06 9 337...
similarity                                                                                                                           0.78
len_ratio                                                                                                                            1.10
verb_ratio                                                                                                                           3.00
noun_ratio                                                                                                                           4.64
entity_ratio                      

In [19]:
df_features[df_features.fr == "0 1 - d o O U .- a t a Y O C d ' W I 1 I 1 N 1 N 1 N I N C U ) 1 N 1 1 N 1 M 1 N N I O M U ) I O Y O d l - - ' N r C l ) V) j L !,1 4 4 0 L O n C C ) m a o 7 F N ) ( V N N 1 N I N M"].T

pub_number
fr
en
similarity
len_ratio
verb_ratio
noun_ratio
entity_ratio
clause_ratio
one_char_words_fr
one_char_words_en


# OK this looks good. let's do it!

### first add periods to all sentences to make sure not to misalign the model

In [20]:
df_clean = df_features[~df_features.exclude].copy()

df_clean['fr'] = df_clean['fr'] + "."
df_clean['en'] = df_clean['en'] + "."

### then save the file(s)

In [41]:
save_jsonl(df_clean, "training_data.jsonl")

In [42]:
import os

path = "training_data.jsonl"
size_kb = os.path.getsize(path) / 1024
line_count = sum(1 for _ in open(path, 'r', encoding='utf-8'))

print(f"{size_kb:.2f} KB, {line_count} lines")

180158.08 KB, 522014 lines


# also save the cleanish data for checking afterwards
* includes 3 stdev before outlier exclusion
* includes dates
* includes fig and table text

In [43]:
for feature, (low, high) in outlier_criteria_s3.items():
    col_name = f"exclude_relaxed_{feature}"
    df_features[col_name] = False
    df_features[col_name] = ~df_features[feature].between(low, high)
    
exclusion_relaxed_columns = [
    'exclude_relaxed_len_ratio',
    'exclude_relaxed_verb_ratio',
    'exclude_relaxed_noun_ratio',
    'exclude_relaxed_entity_ratio',
    'exclude_relaxed_clause_ratio',
    'exclude_relaxed_one_char_words_fr',
    'exclude_relaxed_one_char_words_en',
]

df_features["exclude_relaxed"] = df_features[exclusion_relaxed_columns].any(axis=1)
df_clean_relaxed = df_features[~df_features["exclude_relaxed"] & df_features["exclude"]]
df_clean_relaxed.shape[0]

509271

In [44]:
save_jsonl(df_clean_relaxed, "testing_data.jsonl")