In [60]:
import re
import pandas as pd

from create_jsonl import save_jsonl

# pandas formatting
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)
pd.set_option('max_colwidth', 100)
pd.set_option('display.float_format', '{:.0f}'.format)

# the same dataset after cleaning
df_features = pd.read_pickle("df_with_features.pickle")

In [61]:
percentiles = [0.00135, 0.02275, 0.1587, 0.8413, 0.97725, 0.99865]
columns = ['50%', '0.1%', '2.3%', '15.9%', '84.1%', '97.7%', '99.9%']
df_features.describe(percentiles).loc[columns].T

Unnamed: 0,50%,0.1%,2.3%,15.9%,84.1%,97.7%,99.9%
similarity,1,1,1,1,1,1,1
len_ratio,1,0,1,1,1,2,4
verb_ratio,1,0,0,1,2,3,5
noun_ratio,1,0,1,1,2,4,12
entity_ratio,1,0,0,0,1,2,4
clause_ratio,1,0,0,1,2,3,6


In [62]:
# exclude similarity < 0.757 (1 stdev below median)

# when similarity < 0.85 (median)
outlier_criteria_s1 = {
    "len_ratio": (0.75, 1.92),  # override with 2 stdev len ratios
    "verb_ratio": (0.75, 1.50),
    "noun_ratio": (1.00, 1.75),
    "entity_ratio": (0.33, 1.00),
    "clause_ratio": (1.00, 1.50),
}

# when similarity < 0.92 (1 stdev above median)
outlier_criteria_s2 = {
    "len_ratio": (0.75, 1.92),
    "verb_ratio": (0.50, 3.00),
    "noun_ratio": (0.67, 4.00),
    "entity_ratio": (0.17, 2.00),
    "clause_ratio": (0.50, 3.00),
}

# all higher similarities
outlier_criteria_s3 = {
    "len_ratio": (0.34, 3.93),
    "verb_ratio": (0.25, 5.00),
    "noun_ratio": (0.38, 12.00),
    "entity_ratio": (0.10, 4.00),
    "clause_ratio": (0.20, 6.00),
}

In [63]:
for outlier_criteria in [outlier_criteria_s1, outlier_criteria_s2, outlier_criteria_s3]:
    print()
    for feature, (x, y) in outlier_criteria.items():
        print(f"{feature:<20}", end="")
        print(f"number below {x}:\t{df_features[df_features[feature] < x].shape[0]:<10}", end="\t\t")
        print(f"number above {y}:\t{df_features[df_features[feature] > y].shape[0]:<10}")


len_ratio           number below 0.75:	17869     		number above 1.92:	17871     
verb_ratio          number below 0.75:	107322    		number above 1.5:	108955    
noun_ratio          number below 1.0:	104712    		number above 1.75:	116633    
entity_ratio        number below 0.33:	79250     		number above 1.0:	84643     
clause_ratio        number below 1.0:	120154    		number above 1.5:	107848    

len_ratio           number below 0.75:	17869     		number above 1.92:	17871     
verb_ratio          number below 0.5:	12007     		number above 3.0:	6404      
noun_ratio          number below 0.67:	17769     		number above 4.0:	14641     
entity_ratio        number below 0.17:	19930     		number above 2.0:	11003     
clause_ratio        number below 0.5:	16753     		number above 3.0:	8048      

len_ratio           number below 0.34:	1024      		number above 3.93:	1047      
verb_ratio          number below 0.25:	599       		number above 5.0:	354       
noun_ratio          number below 0.38

In [64]:
df_features["exclude_low_similarity"] = df_features["similarity"] < 0.757

s1_mask = df_features["similarity"] < 0.85
s2_mask = (df_features["similarity"] >= 0.85) & (df_features["similarity"] < 0.92)
s3_mask = df_features["similarity"] >= 0.92
criteria = {
    "s1": outlier_criteria_s1,
    "s2": outlier_criteria_s2,
    "s3": outlier_criteria_s3,
}

for feature in outlier_criteria_s1:
    col_name = f"exclude_{feature}"
    df_features[col_name] = False

    low1, high1 = outlier_criteria_s1[feature]
    low2, high2 = outlier_criteria_s2[feature]
    low3, high3 = outlier_criteria_s3[feature]

    df_features.loc[s1_mask, col_name] = ~df_features.loc[s1_mask, feature].between(low1, high1)
    df_features.loc[s2_mask, col_name] = ~df_features.loc[s2_mask, feature].between(low2, high2)
    df_features.loc[s3_mask, col_name] = ~df_features.loc[s3_mask, feature].between(low3, high3)


# exclude figure and table text

In [65]:
exclusion_columns = [
    'exclude_low_similarity',
    'exclude_len_ratio',
    'exclude_verb_ratio',
    'exclude_noun_ratio',
    'exclude_entity_ratio',
    'exclude_clause_ratio',
]
df_features["exclude"] = df_features[exclusion_columns].any(axis=1)

In [66]:
# how many are left over after cleaning exclusions, but before removing table/fig text?
df_features[~df_features.exclude].shape[0]

432042

In [67]:

def analyze_text_for_figrefs(text, language='en'):
    result = {
        'has_trailing_numbers': False,
        'has_parenthetical_numbers': False,
        'has_figure_references': False,
        'has_repeated_punctuation': False,
        'exclude_figtext': False
    }

    # Check for trailing numbers
    if re.search(r'\s+\d+\s*$', text):
        result['has_trailing_numbers'] = True

    # Check for parenthetical numbers
    if re.search(r'\s+\(\d+\)\s*$', text):
        result['has_parenthetical_numbers'] = True

    # Check for figure/table references (with French support)
    if language == 'fr':
        # French patterns: Figure, Tableau, Fig., Tab.
        pattern = r'\s*(?:Figure|Tableau|Fig\.?|Tab\.?)\s+\d+.*$'
    else:
        # English patterns: Figure, Table, Fig., Tab.
        pattern = r'\s*(?:Figure|Table|Fig\.?|Tab\.?)\s+\d+.*$'

    if re.search(pattern, text, flags=re.IGNORECASE):
        result['has_figure_references'] = True

    # Check for repeated punctuation
    if re.search(r'[.!?]{2,}$', text):
        result['has_repeated_punctuation'] = True

    # Set exclude flag if any issue found
    result['exclude_figtext'] = any([
        result['has_figure_references'],
        result['has_trailing_numbers'],
        result['has_parenthetical_numbers'],
        result['has_repeated_punctuation']
    ])

    return result

def process_bilingual_dataframe(df, text_en_column='text_en', text_fr_column='text_fr'):
    en_results = df[text_en_column].apply(lambda x: analyze_text_for_figrefs(x, language='en'))
    fr_results = df[text_fr_column].apply(lambda x: analyze_text_for_figrefs(x, language='fr'))

    df['has_trailing_numbers_en'] = en_results.apply(lambda x: x['has_trailing_numbers'])
    df['has_parenthetical_numbers_en'] = en_results.apply(lambda x: x['has_parenthetical_numbers'])
    df['has_figure_references_en'] = en_results.apply(lambda x: x['has_figure_references'])
    df['has_repeated_punctuation_en'] = en_results.apply(lambda x: x['has_repeated_punctuation'])
    df['exclude_figtext_en'] = en_results.apply(lambda x: x['exclude_figtext'])

    df['has_trailing_numbers_fr'] = fr_results.apply(lambda x: x['has_trailing_numbers'])
    df['has_parenthetical_numbers_fr'] = fr_results.apply(lambda x: x['has_parenthetical_numbers'])
    df['has_figure_references_fr'] = fr_results.apply(lambda x: x['has_figure_references'])
    df['has_repeated_punctuation_fr'] = fr_results.apply(lambda x: x['has_repeated_punctuation'])
    df['exclude_figtext_fr'] = fr_results.apply(lambda x: x['exclude_figtext'])

    df['exclude_figtext'] = df['exclude_figtext_en'] | df['exclude_figtext_fr']

    return df

def process_bilingual_dataframe_simple(df, text_en_column='en', text_fr_column='fr'):
    # Analyze both columns
    en_results = df[text_en_column].apply(lambda x: analyze_text_for_figrefs(x, language='en'))
    fr_results = df[text_fr_column].apply(lambda x: analyze_text_for_figrefs(x, language='fr'))

    # Add only the combined flag
    df['exclude_figtext'] = (
        en_results.apply(lambda x: x['exclude_figtext']) |
        fr_results.apply(lambda x: x['exclude_figtext'])
    )

    return df

In [68]:
df_features = process_bilingual_dataframe_simple(df_features)

In [69]:
df_features.loc[
    df_features['exclude_figtext'],
    ['en', 'fr', 'exclude']
].sample(10)

Unnamed: 0,en,fr,exclude
400510,The size structure shows some modes located between 35 and 51 mm (Figure 6),La structure de taille affiche quelques modes situés entre 35 et 51 mm (Figure 6),False
169747,"With a catch of 6,763 t, the probability of an increase in biomass in 2003 would be less than 10","Pour des captures de 6 763 t, les probabilités qu il y ait un accroissement de biomasse en 2003 seraient de moins de 10",False
664708,Arrows indicate the 2013 cycle years,Les flèches indiquent les années du cycle 2013,False
250220,"For example, start of season CPUE of about 7 kg trap in 2017 was similar to end of season CPUE in 2015 and below the end of season CPUE of about 9 kg trap in 2014","Par exemple, la CPUE en début de saison, à environ 7 kg casier en 2017, était similaire à celle en fin de saison en 2015 et inférieure à la CPUE en fin de saison d environ 9 kg casier en 2014",False
777554,Recruitment declined to low levels in the 1970s and 1980s and to very low levels in the 1990s (Figure 7),Le recrutement a diminué pour atteindre des taux bas dans les années 1970 et 1980 et des taux très bas dans les années 1990 (figure 7),False
166711,"For example, the dominant 1988 year class was observed in 1990 in the size distributions for the line fishing catch, but not until 1991 and 1992 in the size distributions for the gillnet catch (Fi...",C est le cas en 1990 pour la classe d âge dominante de 1988 qui n a été observée dans les fréquences de taille des filets maillants qu en 1991 et 1992 (Figure 8),True
398860,Sites south of Lake Melville generally clustered separately from sites from Lake Melville and those northward at K 2,Les sites au sud du lac Melville étaient généralement regroupés séparément des sites du lac Melville et de ceux au nord pour K 2,False
718548,A steady biomass increase was then observed up until 2020,On a ensuite constaté une augmentation constante de la biomasse jusqu en 2020,False
203086,"Whelk landings, total allowable catch (TAC) and fishing effort from 1995 to 2011 in Fishing Area 13","Débarquements de buccins, total admissible des captures (TAC) et effort de pêche de 1995 à 2011 pour la zone de pêche 13",False
583735,"A total of 34 rivers were included in the project in which 85 sites were surveyed between August 11 and October 2, 2014","Au total, 34 rivières ont été incluses dans le projet, dans le cadre duquel 85 emplacements ont fait l objet d un relevé du 11 août au 2 octobre 2014",False


In [70]:
# total figtext percentage
df_features[df_features.exclude_figtext].shape[0] / df_features.shape[0]

0.181739287837104

In [71]:
# percentage of previously non-excluded rows with figtext
df_features[(df_features.exclude_figtext) & (~df_features.exclude)].shape[0] / df_features[~df_features.exclude].shape[0]

0.16495618481536517

In [72]:
exclusion_columns = [
    'exclude_low_similarity',
    'exclude_len_ratio',
    'exclude_verb_ratio',
    'exclude_noun_ratio',
    'exclude_entity_ratio',
    'exclude_clause_ratio',
    'exclude_figtext'
]
df_features["exclude"] = df_features[exclusion_columns].any(axis=1)


In [73]:
# how many are left over after removing table/fig text?
df_features[~df_features.exclude].shape[0]

360774

# how many sentences have years or months in them?
### exclude these as well?

In [74]:
df_features['has_date_refs'] = df_features[['en', 'fr']].apply(lambda x: x.astype(str).str.contains(r'\b(?:19|20)\d{2}\b|(?:January|February|March|April|May|June|July|August|September|October|November|December|janvier|février|mars|avril|mai|juin|juillet|août|septembre|octobre|novembre|décembre|Jan|Feb|Mar|Apr|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\b', case=False, regex=True).any(), axis=1)

In [75]:
df_features[
    (df_features['has_date_refs']) & (~df_features.exclude)
].shape[0] / df_features[~df_features.exclude].shape[0]

0.24853509399236087

In [76]:
df_features[df_features['has_date_refs']].shape[0]

262138

In [77]:
exclusion_columns = [
    'exclude_low_similarity',
    'exclude_len_ratio',
    'exclude_verb_ratio',
    'exclude_noun_ratio',
    'exclude_entity_ratio',
    'exclude_clause_ratio',
    'exclude_figtext',
    'has_date_refs',
]
df_features["exclude"] = df_features[exclusion_columns].any(axis=1)

In [78]:
# how many are left over after removing dates and table/fig text?
df_features[~df_features.exclude].shape[0]

271109

# exclude all (270k is plenty of training data)

In [79]:
exclusion_columns = [
    'exclude_low_similarity',
    'exclude_len_ratio',
    'exclude_verb_ratio',
    'exclude_noun_ratio',
    'exclude_entity_ratio',
    'exclude_clause_ratio',
    'exclude_figtext',
    'has_date_refs',
]
df_features["exclude"] = df_features[exclusion_columns].any(axis=1)

In [80]:
# how many exclusions does this lead to?

df_features[exclusion_columns + ['exclude']].sum()

exclude_low_similarity    121566
exclude_len_ratio          35269
exclude_verb_ratio        125555
exclude_noun_ratio        141034
exclude_entity_ratio       99526
exclude_clause_ratio      131022
exclude_figtext           141566
has_date_refs             262138
exclude                   507842
dtype: int64

In [81]:
# how many translations are left after this cleaning? (half a million)

df_features[~df_features["exclude"]].shape[0], df_features[~df_features["exclude"]].shape[0] / df_features.shape[0]

(271109, 0.34804371520159805)

# make sure bad ones are caught

In [88]:
# check a few at random

df_features.sample().T

Unnamed: 0,175866
pub_number,RES 2004/054
fr,Les taux d exploitation moyens étaient élevés ( 50 ) pendant les trois années en cause (tableau ...
en,The average exploitation rates were high ( 50 ) in all three years (Table 5) and these rates are...
similarity,1
len_ratio,1
verb_ratio,1
noun_ratio,1
entity_ratio,0
clause_ratio,1
exclude_low_similarity,False


In [89]:
# this one looks good but will be excluded based on verb ratio and entity ratio 
#  maybe false negatives are OK if quality improves

df_features.loc[485273]

pub_number                                                                                                       SAR 2009/027
fr                        Selon les résultats des deux relevés post-saison, l indice d abondance des crabes commerciaux re...
en                        Based on the results from the two postseason surveys, the abundance index of commercial crabs re...
similarity                                                                                                                  1
len_ratio                                                                                                                   1
verb_ratio                                                                                                                  1
noun_ratio                                                                                                                  1
entity_ratio                                                                                                          

In [90]:
# this one is almost caught by len_ratio and entity_ratio, but not quite unless lower limits are tweaked

df_features.loc[498816]

pub_number                                                                                                       SAR 2010/020
fr                        Sources d incertitude Il existe de l incertitude quant aux effets que des changements apportés à...
en                        Newfoundland and Labrador Region 2HJ3KLNOPs4R Snow Crab 40 Sources of Uncertainty There is uncer...
similarity                                                                                                                  1
len_ratio                                                                                                                   1
verb_ratio                                                                                                                  1
noun_ratio                                                                                                                  1
entity_ratio                                                                                                          

In [91]:
# this one is good, but is excluded by entity_ratio if 0.33 is used as a lower limit

df_features.loc[366969]

pub_number                                                                                                       RES 2022/027
fr                        De 2018 à 2021, plus de 80 de la biomasse cumulée était retrouvée entre 164 et 302 m à des tempé...
en                        From 2018 to 2021, more than 80 of the cumulative biomass was found between 164 and 302 m at bot...
similarity                                                                                                                  1
len_ratio                                                                                                                   1
verb_ratio                                                                                                                  1
noun_ratio                                                                                                                  1
entity_ratio                                                                                                          

In [92]:
# this one is junk data, it is caught by multiple exclusion criteria

df_features.loc[574405]

pub_number                                                                                                       SAR 2017/033
fr                        Saison Permis TAC (t) Débarquements (t) CPUE (kg casier levé) Effort (x 1 000 casiers levés) 200...
en                        Season Licenses TAC (t) Landings (t) CPUE (kg trap haul) Effort (x1000 trap hauls) 2005 06 9 337...
similarity                                                                                                                  1
len_ratio                                                                                                                   1
verb_ratio                                                                                                                  3
noun_ratio                                                                                                                  5
entity_ratio                                                                                                          

# OK this looks good. let's do it!

In [93]:
df_clean = df_features[~df_features.exclude].copy()

save_jsonl(df_clean, "training_data.jsonl")

In [94]:
import os

path = "training_data.jsonl"
size_kb = os.path.getsize(path) / 1024
line_count = sum(1 for _ in open(path, 'r', encoding='utf-8'))

print(f"{size_kb:.2f} KB, {line_count} lines")

187191.67 KB, 542218 lines
