In [1]:
import json
import os
import random
import re
import subprocess
import numpy as np
import pandas as pd

from collections import Counter

from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>")) # jupyter notebook full-width display
display(HTML("<style>.dataframe td { white-space: nowrap; }</style>")) # no text wrapping

# pandas formatting
pd.set_option('display.float_format', '{:.1f}'.format)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', 200)

In [2]:
# all files that have been downloaded and parsed
parsed_docs_folder = os.path.join("..", "ParsedPublications")

min_year = 2023
parsed_files = list()
parsed_files_with_hq_ocr = list()
for folder in os.listdir(parsed_docs_folder):
    path = os.path.join(parsed_docs_folder, folder)
    if os.path.isdir(path):
        for json_file in os.listdir(path):
            if json_file.endswith(".json"):
                parsed_files.append(json_file.replace('.json', ''))
                if folder in [str(year) for year in range(min_year, 2024 + 1)]:
                    parsed_files_with_hq_ocr.append(json_file.replace('.json', ''))

# all files from website
fr_eng_correlation_csv = "fr_eng_correlation_data.csv"
fr_eng_correlation_df = pd.read_csv(fr_eng_correlation_csv)
# exclude files that aren't downloaded, and files that have been withdrawn
fr_eng_correlation_df = fr_eng_correlation_df[(fr_eng_correlation_df.filename_en.isin(parsed_files)) | (fr_eng_correlation_df.filename_fr.isin(parsed_files))]
fr_eng_correlation_df = fr_eng_correlation_df[(fr_eng_correlation_df.filename_en != 'WITHDRAWN') & (fr_eng_correlation_df.filename_fr != 'WITHDRAWN')]

# weblinks for previewing / checking results
weblinks_df = fr_eng_correlation_df.copy()
weblinks_df = weblinks_df[['pub_number', 'nom', 'name', 'url_fr', 'url_en', 'file_url_fr', 'file_url_en']]

# data to be used for language classifier
lang_df = fr_eng_correlation_df.copy()
lang_df = lang_df[(lang_df.filename_fr.isin(parsed_files_with_hq_ocr)) & (lang_df.filename_en.isin(parsed_files_with_hq_ocr)) & (lang_df.filename_fr != lang_df.filename_en)]


# helper functions

In [6]:
def preview_publication(pub_number):
    if type(pub_number) is pd.DataFrame and pub_number.shape[0] == 1:
        try:
            pub_number = pub_number['pub_number'].values[0]
        except ValueError:
            return None
    elif type(pub_number) is pd.Series:
        try:
            pub_number = pub_number.values[0]
        except ValueError:
            return None
    
    try:
        output_df = weblinks_df[weblinks_df.pub_number == pub_number].T
    except Exception as e:
        print(e)
        return None
        
    return output_df


def get_filepaths(row, min_year=2023):
    fr_filename, en_filename = row['filename_fr'] + '.json', row['filename_en'] + '.json'
    file_folders = [os.path.join('..', 'ParsedPublications', str(year)) for year in range(min_year, 2024 + 1)]
    
    try:
        fr_path, en_path = ([os.path.join(folder, fr_filename) for folder in file_folders if os.path.exists(os.path.join(folder, fr_filename))][0], 
                            [os.path.join(folder, en_filename) for folder in file_folders if os.path.exists(os.path.join(folder, en_filename))][0])
    except IndexError:
        return None, None
    
    return fr_path, en_path


In [62]:
# # set method (doesn't exclude extremely rare words)
# 
# import re
# import json
# 
# french_words = set()
# english_words = set()
# 
# # Make lists of all French words and all English words
# for i, row in lang_df.iterrows():
#     fr_path, en_path = get_filepaths(row)
#     
#     with open(fr_path, 'r', encoding='utf-8') as file:
#         fr_text = json.load(file).get('text', '')
#         french_words |= set(fr_text.split())
#     
#     with open(en_path, 'r', encoding='utf-8') as file:
#         en_text = json.load(file).get('text', '')
#         english_words |= set(en_text.split())
# 
# # Remove non-words using regex
# non_word_characters = re.compile(r'[^a-zA-ZÀ-ÿ]')
# french_words = {word for word in french_words if not non_word_characters.search(word)}
# english_words = {word for word in english_words if not non_word_characters.search(word)}
# 
# # Remove empty strings
# french_words.discard('')
# english_words.discard('')
# 
# # Remove overlapping words
# overlapping_words = english_words & french_words
# english_words.difference_update(overlapping_words)
# french_words.difference_update(overlapping_words)
# 
# # Remove numeric-only words
# english_words.difference_update({w for w in english_words if w.isnumeric()})
# french_words.difference_update({w for w in french_words if w.isnumeric()})


In [103]:
# TODO: make sure headers and footers aren't bilingual or delete if possible
# header is bilingual - could look for table of content or skip some stuff

lang_df.sample(1).T  

Unnamed: 0,1237
pub_number,PRO 2023/046
year,2023
nom,"Compte rendu de l'examen régional par les pairs de l'évaluation de la fiabilité des procédures de gestion de la morue charbonnière (Anoplopoma fimbria) en Colombie-Britannique, au Canada, 2017-201..."
name,"Proceedings of the regional peer review evaluating the robustness of management procedures for the Sablefish (Anoplopoma fimbria) fishery in British Columbia, Canada for 2017-18; January 10, 2017"
url_fr,https://www.dfo-mpo.gc.ca/csas-sccs/Publications/Pro-Cr/2023/2023_046-fra.html
url_en,https://www.dfo-mpo.gc.ca/csas-sccs/Publications/Pro-Cr/2023/2023_046-eng.html
filename_fr,41221084.pdf
filename_en,41220006.pdf
file_url_fr,https://waves-vagues.dfo-mpo.gc.ca/library-bibliotheque/41221084.pdf
file_url_en,https://waves-vagues.dfo-mpo.gc.ca/library-bibliotheque/41220006.pdf


In [104]:
lang_df.sample(1).T 

Unnamed: 0,6310
pub_number,RES 2024/036
year,2024
nom,Évaluation de la crevette nordique (Pandalus borealis) et de la crevette ésope (Pandalus montagui) dans la zone de pêche à la crevette 4 en 2022
name,Assessment of Northern Shrimp (Pandalus borealis) and Striped Shrimp (Pandalus montagui) in Shrimp Fishing Area 4 in 2022
url_fr,https://www.dfo-mpo.gc.ca/csas-sccs/Publications/ResDocs-DocRech/2024/2024_036-fra.html
url_en,https://www.dfo-mpo.gc.ca/csas-sccs/Publications/ResDocs-DocRech/2024/2024_036-eng.html
filename_fr,41249082.pdf
filename_en,41249070.pdf
file_url_fr,https://waves-vagues.dfo-mpo.gc.ca/library-bibliotheque/41249082.pdf
file_url_en,https://waves-vagues.dfo-mpo.gc.ca/library-bibliotheque/41249070.pdf


In [106]:
# how many times does "RÉFÉRENCES CITÉES" happen in fr docs? always 2? splitting there would work if so
lang_df

In [112]:
toc_info

[(0, 0),
 (3, 3),
 (3, 3),
 (3, 3),
 (0, 0),
 (0, 0),
 (3, 3),
 (3, 3),
 (3, 4),
 (3, 3),
 (0, 0),
 (3, 3),
 (0, 0),
 (0, 0),
 (3, 3),
 (3, 3),
 (3, 3),
 (3, 3),
 (3, 3),
 (0, 0),
 (3, 3),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (3, 4),
 (0, 0),
 (4, 4),
 (4, 4),
 (3, 3),
 (0, 0),
 (3, 3),
 (3, 3),
 (0, 0),
 (0, 0),
 (0, 0),
 (3, 3),
 (0, 0),
 (3, 3),
 (3, 0),
 (0, 0),
 (3, 3),
 (3, 3),
 (3, 3),
 (3, 3),
 (3, 3),
 (3, 3),
 (0, 0),
 (3, 3),
 (3, 3),
 (3, 3),
 (3, 3),
 (3, 3),
 (3, 3),
 (3, 3),
 (0, 0),
 (2, 3),
 (3, 3),
 (0, 0),
 (0, 0),
 (3, 3),
 (3, 3),
 (3, 3),
 (3, 3),
 (3, 3),
 (3, 3),
 (3, 3),
 (3, 3),
 (15, 15),
 (3, 3),
 (3, 3),
 (3, 3),
 (3, 3),
 (3, 3),
 (3, 3),
 (3, 3),
 (3, 3),
 (3, 3),
 (3, 3),
 (3, 0),
 (3, 3),
 (3, 3),
 (3, 3),
 (3, 3),
 (3, 3),
 (3, 3),
 (3, 3),
 (3, 3),
 (0, 3),
 (3, 3),
 (3, 3),
 (3, 3),
 (3, 3),
 (3, 3),
 (3, 3),
 (3, 3),
 (3, 3),
 (3, 3),
 (3, 3),
 (3, 3),
 (3, 3),
 (3, 3),
 (3, 3),
 (3, 3),
 (3, 3),
 (0, 3),
 (3, 3),
 (0, 3),
 (0, 3),
 (3, 3),
 (3, 3),

In [218]:
# check headers and toc calcs by inspection
#  it looks like OCR never gets the biligual stuff on the header, so header may not need trimming
#  references often have other laguage stuff in them, so splitting by toc

def inspect_text(show_split=True):    
    references_fr = r'RÉFÉRENCES CITÉES'.lower()
    references_en = r'REFERENCES CITED'.lower()
    
    for i, row in lang_df.iterrows():
        fr_path, en_path = get_filepaths(row)
        split = False
        
        with open(fr_path, 'r', encoding='utf-8') as file:
            fr_text = json.load(file).get('text', '')
            
            parts = re.split(references_fr, fr_text.lower(), flags=re.IGNORECASE)
            if 2 < len(parts) < 5:
                fr_text = parts[1]
                split = True
        
        with open(en_path, 'r', encoding='utf-8') as file:
            en_text = json.load(file).get('text', '')
            
            parts = re.split(references_en, en_text.lower(), flags=re.IGNORECASE)
            if 2 < len(parts) < 5:  # if 2 or 3 occurences of references text, take the second part (to get the main body text)
                en_text = parts[1]
                split = True
        
        if show_split == split:
            display(pd.DataFrame(row))
            yield fr_text, en_text
            

In [219]:
x = inspect_text(True)

In [220]:
n = 2000
y = next(x)
print(re.sub(r'\s+', ' ', y[0][:n]).strip())

Unnamed: 0,1196
pub_number,PRO 2023/003
year,2023
nom,"Compte rendu de l'examen par les pairs de la région du Pacifique sur le Proposition de cadre de surveillance de la zone de protection marine du mont sous-marin SGaan Kinghlas-Bowie, en Colombie-Br..."
name,"Proceedings of the Pacific regional peer review on the proposed monitoring framework for SG̲aan K̲inghlas-Bowie Seamount Marine Protected Area, British Columbia, Canada; May 3-5, 2022"
url_fr,https://www.dfo-mpo.gc.ca/csas-sccs/Publications/Pro-Cr/2023/2023_003-fra.html
url_en,https://www.dfo-mpo.gc.ca/csas-sccs/Publications/Pro-Cr/2023/2023_003-eng.html
filename_fr,41097117.pdf
filename_en,41097257.pdf
file_url_fr,https://waves-vagues.dfo-mpo.gc.ca/library-bibliotheque/41097117.pdf
file_url_en,https://waves-vagues.dfo-mpo.gc.ca/library-bibliotheque/41097257.pdf


............................................................................................................... 9 annexe a : cadre de référence ..................................................................................... 11 annexe b : résumé du document de travail ............................................................. 14 annexe c : ordre du jour ................................................................................................. 15 annexe d : participants aux réunions ........................................................................ 18 annexe e : révisions majeures convenus du document de travail ............... 20 iv sommaire le présent compte rendu résume les discussions pertinentes et les principales conclusions de la réunion régionale d’examen par les pairs du secrétariat canadien des avis scientifiques (scas) de pêches et océans canada (mpo) qui a eu lieu du 3 au 5 mai 2022 sur la plateforme de réunion virtuelle zoom. le document de travail prés

In [221]:
print(re.sub(r'\s+', ' ', y[1][:n]).strip())

.................................................................................................................. 7 appendix a: terms of reference ..................................................................................... 9 appendix b: working paper abstract ......................................................................... 12 appendix c: agenda ............................................................................................................. 13 appendix d: meeting participants ................................................................................. 15 appendix e: agreed upon major revisions to the working paper .................. 16 iv summary these proceedings summarize the relevant discussions and key conclusions that resulted from a fisheries and oceans canada (dfo) canadian science advisory secretariat (csas) regional peer review meeting on may 3-5, 2022 via the online meeting platform zoom. the working paper presented for peer review focused

In [222]:
print(re.sub(r'\s+', ' ', y[0][-n:]).strip())

le tableau seront incorporés dans la version finale du tableau qui sera incluse dans le document de recherche. 9 recommandations : au cours de l’élaboration de l’as, le groupe a élaboré des recommandations et des conclusions claires de la réunion qui seront également incluses dans le document de recherche. cela permettra de répondre à la demande d’un examinateur qui souhaitait une communication plus claire pour les décideurs. élaboration de l’avis scientifique un participant se retire au cours de l’élaboration de l’as en invoquant des préoccupations concernant le processus casp. les participants sont déçus que ce participant ne se sente pas à l’aise de continuer, et reconnaissent les importantes contributions qu’il a apportées jusqu’à son départ. après une conversation approfondie avec le groupe, on s’entend pour continuer à développer l’as. le groupe convient à l’unanimité que le départ du participant ne compromet pas la validité scientifique ou l’intégrité du document de recherche en

In [223]:
print(re.sub(r'\s+', ' ', y[1][-n:]).strip())

objective. the authors will explore various options for displaying the information in the table. the authors will also include text stating that these current/best options will need to be revisited following baseline and long- term monitoring. since this table will also be included in the sar, participant feedback on the table will be incorporated into the final version of the table for inclusion in the research document. recommendations: during sar development the group developed clear recommendations and conclusions from the meeting that will also be included in the research document. this will address a reviewer's request for clearer communication for decision makers. sar development a participant withdrew themselves during the sar development citing concerns over the csap process. participants were disappointed this participant did not feel comfortable continuing, and acknowledged the important contributions they made right up until their departure. after a thorough conversation wi

In [114]:
r'RÉFÉRENCES CITÉES'.lower()

'références citées'

In [108]:
x = 'this is a thing this this'
x.count('this')

3

In [113]:
# list method (excludes extremely rare words)

valid_word_regex = re.compile(r'^[a-zA-ZÀ-ÿ]+$')
french_word_list = []
english_word_list = []
exclude_words_with_less_than_n = 10

# clean headers and appendices
toc_fr = r'RÉFÉRENCES CITÉES'.lower()
toc_en = r'REFERENCES CITED'.lower()
chop_header_char = 200

# Make lists of all French words and all English words
for i, row in lang_df.iterrows():
    fr_path, en_path = get_filepaths(row)
    
    with open(fr_path, 'r', encoding='utf-8') as file:
        fr_text = json.load(file).get('text', '')
        
        parts = re.split(toc_fr, fr_text.lower(), flags=re.IGNORECASE)
        if 2 < len(parts) < 5:  # if 2 or 3 occurences of toc text, take the second part (main body text)
            fr_text = parts[1]
        else:  # otherwise, chop the header
            fr_text = fr_text[chop_header_char:].lower()
        
        french_word_list.extend(word for word in fr_text.split() if valid_word_regex.match(word))
    
    with open(en_path, 'r', encoding='utf-8') as file:
        en_text = json.load(file).get('text', '')
        
        parts = re.split(toc_en, en_text.lower(), flags=re.IGNORECASE)
        if 2 < len(parts) < 5:  # if 2 or 3 occurences of toc text, take the second part (main body text)
            en_text = parts[1]
        else:  # otherwise, chop the header
            en_text = n_text[chop_header_char:].lower()
        
        english_word_list.extend(word for word in en_text.split() if valid_word_regex.match(word))
        
# For testing
french_word_counts = Counter(french_word_list)
english_word_counts = Counter(english_word_list)
full_french_word_list = french_word_list.copy()
full_english_word_list = english_word_list.copy()

# Remove words with less than 10 occurrences
french_word_list = [word for word, count in french_word_counts.items() if count >= exclude_words_with_less_than_n]
english_word_list = [word for word, count in english_word_counts.items() if count >= exclude_words_with_less_than_n]

# Convert to sets for further processing
french_words = set(french_word_list)
english_words = set(english_word_list)

# Remove overlapping words
overlapping_words = english_words & french_words
english_words.difference_update(overlapping_words)
french_words.difference_update(overlapping_words)

# Remove numeric-only words
english_words.difference_update({w for w in english_words if w.isnumeric()})
french_words.difference_update({w for w in french_words if w.isnumeric()})


In [101]:
test_wordlists('Total mortalities at age, based on survey data, are presented in Table', english_words, french_words)

english words: ['mortalities', 'presented']
french words: []
en_count=2, fr_count=0


In [98]:
# helper functions for word lists

def test_wordlists(text_block, english_words, french_words):
    en_count = sum(1 for word in text_block.split() if word in english_words)
    fr_count = sum(1 for word in text_block.split() if word in french_words)
    
    print('english words:', list(word for word in text_block.split() if word in english_words))
    print('french words:', list(word for word in text_block.split() if word in french_words))
    print(f'{en_count=}, {fr_count=}')    

def calc_percentile(n, word_counts):
    n_least_common = int(len(word_counts) * n)
    less_than_n = word_counts.most_common()[::-1][:n_least_common][::-1]
    print(f"{n*100:0.0f}th percentile: {less_than_n[0][1]}")
    
    expanded_counts = []
    for word, count in word_counts.items():
        expanded_counts.extend([count] * count)
        
    expanded_counts = sorted(expanded_counts, reverse=True)
    weighted_nth_percentile = int(np.percentile(expanded_counts, n))
    print(f"Weighted {n*100:0.0f}th percentile: {weighted_nth_percentile}")
    
    # examples
    print('EXAMPLES')
    n_examples = 5
    
    # examples = [x[0] for x in less_than_n[:n_least_common][::-1][:n_examples]]
    # examples_weighted = [k for k, v in word_counts.items() if v == n_examples][:n_examples]
    
    examples = random.sample([x[0] for x in less_than_n[:n_least_common][::-1]], n_examples)
    examples_weighted = random.sample([k for k, v in word_counts.items() if v == n_examples], n_examples)
    
    print(f"\t{n*100:0.0f}th percentile word examples: \t\t\t\t{', '.join(examples)}")
    print(f"\tWeighted {n*100:0.0f}th percentile word examples: \t{', '.join(examples_weighted)}")
    

In [100]:
print('FRENCH')
calc_percentile(.95, french_word_counts)
print('\nENGLISH')
calc_percentile(.95, english_word_counts)

FRENCH
95th percentile: 172
Weighted 95th percentile: 4
EXAMPLES
	95th percentile word examples: 				creees, maux, aérée, gam, symbiotic
	Weighted 95th percentile word examples: 	divergents, codage, advance, cost, rationalisation

ENGLISH
95th percentile: 242
Weighted 95th percentile: 4
EXAMPLES
	95th percentile word examples: 				pugwash, onwards, radiation, delicate, bony
	Weighted 95th percentile word examples: 	wyeth, stems, wrymouth, expended, suberites


In [46]:
len(french_word_counts), french_word_counts.total()

(37064, 3332369)

In [90]:
block = "This is an example sentence in English Ceci est une phrase d'exemple en français"

en_count = sum(1 for word in block.split() if word.lower() in english_words)
fr_count = sum(1 for word in block.split() if word.lower() in french_words)

print('english words:', list(word for word in block.split() if word in english_words))
print('french words:', list(word for word in block.split() if word in french_words))

english words: ['example', 'français']
french words: ['est', 'une']


In [91]:
max(french_word_counts.values())

275792

In [95]:
counts = sorted(french_word_counts.values(), reverse=True)

p_n = np.percentile(counts, 0.50)
print(f"percentile: {p_n}")

percentile: 1.0


In [107]:
for x in [1, 10, 100, 1000, 10000]:
    print(f'number of words that occur at least {x} times:  {len([word for word, count in french_word_counts.items() if count >= x])}')

number of words that occur at least 1 times:  37064
number of words that occur at least 10 times:  11153
number of words that occur at least 100 times:  2749
number of words that occur at least 1000 times:  362
number of words that occur at least 10000 times:  31


In [None]:
# TODO: figure out the optimal word list with p = 0.95 chance of at least one word occuring in a sentence of 10 words




















In [105]:
len([word for word, count in french_word_counts.items() if count <= 1000])

36702

In [96]:
word_nth = [word for word, count in french_word_counts.items() if count <= p_n]
print(f"Word(s) at nth percentile: {word_nth}")



In [98]:
french_word_counts

Counter({'de': 275792,
         'la': 158761,
         'et': 117853,
         'les': 113834,
         'des': 113144,
         'à': 75995,
         'du': 73397,
         'le': 64900,
         'dans': 52339,
         'en': 46585,
         'pour': 43331,
         'sur': 28408,
         'par': 28035,
         'que': 24558,
         'une': 23866,
         'a': 22910,
         'est': 21948,
         'au': 21063,
         'un': 19744,
         'été': 19459,
         'sont': 17782,
         'plus': 16126,
         'ont': 14776,
         'qui': 14484,
         'pêche': 14093,
         'données': 13991,
         'and': 12613,
         'pas': 11254,
         'ou': 11199,
         'aux': 11095,
         'avec': 10483,
         'of': 9905,
         'modèle': 8816,
         'ce': 8740,
         'il': 8701,
         'région': 8551,
         'être': 8062,
         'on': 7957,
         'comme': 7904,
         'figure': 7689,
         'zone': 7679,
         'biomasse': 7655,
         'relevé': 7636,
   