In [1]:
import json
import os
import random
import re
import pandas as pd

from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>")) # jupyter notebook full-width display
display(HTML("<style>.dataframe td { white-space: nowrap; }</style>")) # no text wrapping

# pandas formatting
pd.set_option('display.float_format', '{:.1f}'.format)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_colwidth', 200)

In [89]:
parsed_docs_folder = os.path.join("..", "ParsedPublications")
fr_eng_correlation_csv = "fr_eng_correlation_data.csv"

fr_eng_correlation_df = pd.read_csv(fr_eng_correlation_csv)

weblinks_df = fr_eng_correlation_df.copy()
weblinks_df = weblinks_df[['pub_number', 'nom', 'name', 'url_fr', 'url_en', 'file_url_fr', 'file_url_en']]

min_year = 2023
lang_df = fr_eng_correlation_df.copy()
lang_df = lang_df[(lang_df.year >= min_year) & (lang_df.filename_fr != lang_df.filename_en)]
# lang_df = lang_df[['year', 'pub_number', 'filename_fr', 'filename_en']]


In [None]:
# TODO: DELETE ONCE DONE TESTING

links_folder = "website_reports"

dataframes = []

for file in os.listdir(links_folder):
    if file.endswith(".xlsx"):
        file_path = os.path.join(links_folder, file)
        df = pd.read_excel(file_path)
        dataframes.append(df)

column_names = ['type', 'year', 'pub_number', '_', 'nom', 'name', 'url_fr', 'url_en', '_', '_', '_', '_']
combined_df = pd.concat(dataframes, ignore_index=True)
combined_df.columns = column_names

types = ['RES', 'SAR', 'PRO', 'SSR', 'SCR', 'ESR', 'HSR']
combined_df = combined_df[combined_df.type.isin(types)]

# create formatted pub number with type and number
combined_df['pub_number'] = combined_df['type'] + " " + combined_df['pub_number']

columns = ['pub_number', 'year', 'nom', 'name', 'url_fr', 'url_en']
combined_df = combined_df[columns].reset_index(drop=True)

In [102]:
# TODO: DELETE ONCE DONE TESTING
  
parsed_docs_folder = os.path.join("..", "ParsedPublications")
data = []


for year_folder in os.listdir(parsed_docs_folder):
    min_year = 1977
    max_year = 2024
    
    if year_folder.isnumeric():
        if min_year <= int(year_folder) <= max_year:
            year_path = os.path.join(parsed_docs_folder, year_folder)
            print(f"Analysing: {year_path}")
            
            if os.path.isdir(year_path):
                for json_file in os.listdir(year_path):
                    if json_file.endswith(".json"):
                        json_path = os.path.join(year_path, json_file)
                        with open(json_path, 'r', encoding='utf-8') as file:
                            json_data = json.load(file)
                            
                            filename = json_data.get('name')
                            url = json_data.get('url')
                            year = json_data.get('publicationYear')
                            
                            data.append({
                                'filename': filename,
                                'url': url,
                                'year': year,
                            })

parsed_docs_df = pd.DataFrame(data)

Analysing: ..\ParsedPublications\1977
Analysing: ..\ParsedPublications\1978
Analysing: ..\ParsedPublications\1979
Analysing: ..\ParsedPublications\1980
Analysing: ..\ParsedPublications\1981
Analysing: ..\ParsedPublications\1982
Analysing: ..\ParsedPublications\1983
Analysing: ..\ParsedPublications\1984
Analysing: ..\ParsedPublications\1985
Analysing: ..\ParsedPublications\1986
Analysing: ..\ParsedPublications\1987
Analysing: ..\ParsedPublications\1988
Analysing: ..\ParsedPublications\1989
Analysing: ..\ParsedPublications\1990
Analysing: ..\ParsedPublications\1991
Analysing: ..\ParsedPublications\1992
Analysing: ..\ParsedPublications\1993
Analysing: ..\ParsedPublications\1994
Analysing: ..\ParsedPublications\1995
Analysing: ..\ParsedPublications\1996
Analysing: ..\ParsedPublications\1997
Analysing: ..\ParsedPublications\1998
Analysing: ..\ParsedPublications\1999
Analysing: ..\ParsedPublications\2000
Analysing: ..\ParsedPublications\2001
Analysing: ..\ParsedPublications\2002
Analysing: .

# helper functions

In [74]:
def preview_publication(pub_number):
    if type(pub_number) is pd.DataFrame and pub_number.shape[0] == 1:
        try:
            pub_number = pub_number['pub_number'].values[0]
        except ValueError:
            return None
    elif type(pub_number) is pd.Series:
        try:
            pub_number = pub_number.values[0]
        except ValueError:
            return None
    
    try:
        output_df = weblinks_df[weblinks_df.pub_number == pub_number].T
    except Exception as e:
        print(e)
        return None
        
    return output_df


def get_filepaths(row, min_year=2023):
    fr_filename, en_filename = row['filename_fr'] + '.json', row['filename_en'] + '.json'
    file_folders = [os.path.join('..', 'ParsedPublications', str(year)) for year in range(min_year, 2024 + 1)]
    
    try:
        fr_path, en_path = ([os.path.join(folder, fr_filename) for folder in file_folders if os.path.exists(os.path.join(folder, fr_filename))][0], 
                            [os.path.join(folder, en_filename) for folder in file_folders if os.path.exists(os.path.join(folder, en_filename))][0])
    except IndexError:
        return None, None
    
    return fr_path, en_path


In [100]:
all_correlated_lang_filenames = set(fr_eng_correlation_df.filename_fr.to_list() + fr_eng_correlation_df.filename_en.to_list())

all_pub_numbers = set(fr_eng_correlation_df.pub_number.to_list())
len(all_pub_numbers)

9058

In [101]:
# all pub_numbers from combined_df are linked fr to en 
#  this makes sense, because combined_df was the starting point for fr_eng_correlation_df
combined_df[~combined_df.pub_number.isin(all_pub_numbers)]

Unnamed: 0,pub_number,year,nom,name,url_fr,url_en


In [103]:
# filenames missing from parsed json data?

parsed_filenames = parsed_docs_df.filename.to_list()
all_lang_filenames = set(fr_eng_correlation_df.filename_fr.to_list() + fr_eng_correlation_df.filename_en.to_list())

In [106]:
not_in_lang = [x for x in parsed_filenames if x not in all_lang_filenames]
not_in_parsed = [x for x in all_lang_filenames if x not in parsed_filenames]

In [160]:
import difflib

closest_matches = dict()
for item in not_in_parsed:
    match = difflib.get_close_matches(item, not_in_lang, n=1, cutoff=0.8)
    closest_matches[item] = match[0] if match else None
    
for k in not_in_parsed:
    match = closest_matches[k]
    if match:
        print(k, match)

41272948.pdf 41020728.pdf
348570.pdf 348569.pdf
41272936.pdf 342736.pdf
41266122.pdf 41260272.pdf
40928743.pdf 41028430.pdf
41265609.pdf 41260260.pdf
41270782.pdf 41260272.pdf
41266079.pdf 41260272.pdf
41272651.pdf 41020765.pdf
40577053.pdf 40543705.pdf
40884296.pdf 41028429.pdf
41273059.pdf 41230796.pdf
40891240.pdf 41029124.pdf
41271142.pdf 41021174.pdf
41266213.pdf 41020613.pdf
41230516.pdf 41230796.pdf
2024_048-eng.pdf 2020_048-inu.pdf
331600.pdf 343100.pdf
40879227.pdf 40892657.pdf
41270678.pdf 41230784.pdf
41270757.pdf 41020765.pdf
41265907.pdf 41260272.pdf
41265920.pdf 41260260.pdf
2023_031-fra.pdf 2023_031-inu.pdf
41266080.pdf 41260260.pdf
41270745.pdf 41230784.pdf
41270794.pdf 41230796.pdf
41272882.pdf 41020728.pdf
41273047.pdf 41230796.pdf
41266067.pdf 41260272.pdf
41005958.pdf 41105588.pdf
2023_030-eng.pdf 2023_031-inu.pdf
41266110.pdf 41260260.pdf
41272961.pdf 41230796.pdf
41266134.pdf 41020613.pdf
41266055.pdf 41260260.pdf
41270630.pdf 41020613.pdf
41272808.pdf 41020728.pd

In [110]:
parsed_docs_df.sample(1).T

Unnamed: 0,6573
filename,341163.pdf
url,https://waves-vagues.dfo-mpo.gc.ca/library-bibliotheque/341163.pdf
year,2010


In [151]:
# many of the correlated files that not it parsed json data are withdrawn
df_missing_from_parsed_json_df = fr_eng_correlation_df[fr_eng_correlation_df.filename_fr.isin(not_in_parsed) | fr_eng_correlation_df.filename_en.isin(not_in_parsed)]

display(pd.DataFrame(df_missing_from_parsed_json_df.drop(['nom', 'name'], axis=1).sample(20)))

Unnamed: 0,pub_number,year,url_fr,url_en,filename_fr,filename_en,file_url_fr,file_url_en
6343,RES 2024/074,2024,https://www.dfo-mpo.gc.ca/csas-sccs/Publications/ResDocs-DocRech/2024/2024_074-fra.html,https://www.dfo-mpo.gc.ca/csas-sccs/Publications/ResDocs-DocRech/2024/2024_074-eng.html,41270885.pdf,41270873.pdf,https://waves-vagues.dfo-mpo.gc.ca/library-bibliotheque/41270885.pdf,https://waves-vagues.dfo-mpo.gc.ca/library-bibliotheque/41270873.pdf
1283,PRO 2024/041,2024,https://www.dfo-mpo.gc.ca/csas-sccs/Publications/Pro-Cr/2024/2024_041-fra.html,https://www.dfo-mpo.gc.ca/csas-sccs/Publications/Pro-Cr/2024/2024_041-eng.html,41265841.pdf,4126583x.pdf,https://waves-vagues.dfo-mpo.gc.ca/library-bibliotheque/41265841.pdf,https://waves-vagues.dfo-mpo.gc.ca/library-bibliotheque/4126583x.pdf
6352,RES 2024/028,2024,https://www.dfo-mpo.gc.ca/csas-sccs/Publications/ResDocs-DocRech/2024/2024_028-fra.html,https://www.dfo-mpo.gc.ca/csas-sccs/Publications/ResDocs-DocRech/2024/2024_028-eng.html,2024_028-fra.pdf,2024_028-eng.pdf,https://www.dfo-mpo.gc.ca/csas-sccs/Publications/ResDocs-DocRech/2024/2024_028-fra.pdf,https://www.dfo-mpo.gc.ca/csas-sccs/Publications/ResDocs-DocRech/2024/2024_028-eng.pdf
7640,SAR 2024/061,2024,https://www.dfo-mpo.gc.ca/csas-sccs/Publications/SAR-AS/2024/2024_061-fra.html,https://www.dfo-mpo.gc.ca/csas-sccs/Publications/SAR-AS/2024/2024_061-eng.html,41265919.pdf,41265907.pdf,https://waves-vagues.dfo-mpo.gc.ca/library-bibliotheque/41265919.pdf,https://waves-vagues.dfo-mpo.gc.ca/library-bibliotheque/41265907.pdf
1282,PRO 2024/040,2024,https://www.dfo-mpo.gc.ca/csas-sccs/Publications/Pro-Cr/2024/2024_040-fra.html,https://www.dfo-mpo.gc.ca/csas-sccs/Publications/Pro-Cr/2024/2024_040-eng.html,41265828.pdf,41265816.pdf,https://waves-vagues.dfo-mpo.gc.ca/library-bibliotheque/41265828.pdf,https://waves-vagues.dfo-mpo.gc.ca/library-bibliotheque/41265816.pdf
3388,RES 1998/040,1998,http://www.dfo-mpo.gc.ca/csas-sccs/publications/withdrawn-retire-fra.htm,http://www.dfo-mpo.gc.ca/csas-sccs/publications/withdrawn-retire-eng.htm,WITHDRAWN,WITHDRAWN,WITHDRAWN,WITHDRAWN
7637,SAR 2024/056,2024,https://www.dfo-mpo.gc.ca/csas-sccs/Publications/SAR-AS/2024/2024_056-fra.html,https://www.dfo-mpo.gc.ca/csas-sccs/Publications/SAR-AS/2024/2024_056-eng.html,41266134.pdf,41266122.pdf,https://waves-vagues.dfo-mpo.gc.ca/library-bibliotheque/41266134.pdf,https://waves-vagues.dfo-mpo.gc.ca/library-bibliotheque/41266122.pdf
6348,RES 2024/069,2024,https://www.dfo-mpo.gc.ca/csas-sccs/Publications/ResDocs-DocRech/2024/2024_069-fra.html,https://www.dfo-mpo.gc.ca/csas-sccs/Publications/ResDocs-DocRech/2024/2024_069-eng.html,41272638.pdf,41272626.pdf,https://waves-vagues.dfo-mpo.gc.ca/library-bibliotheque/41272638.pdf,https://waves-vagues.dfo-mpo.gc.ca/library-bibliotheque/41272626.pdf
3219,RES 1997/039,1997,http://www.dfo-mpo.gc.ca/csas-sccs/publications/withdrawn-retire-fra.htm,http://www.dfo-mpo.gc.ca/csas-sccs/publications/withdrawn-retire-eng.htm,WITHDRAWN,WITHDRAWN,WITHDRAWN,WITHDRAWN
1286,PRO 2024/045,2024,https://www.dfo-mpo.gc.ca/csas-sccs/Publications/Pro-Cr/2024/2024_045-fra.html,https://www.dfo-mpo.gc.ca/csas-sccs/Publications/Pro-Cr/2024/2024_045-eng.html,41270630.pdf,41270599.pdf,https://waves-vagues.dfo-mpo.gc.ca/library-bibliotheque/41270630.pdf,https://waves-vagues.dfo-mpo.gc.ca/library-bibliotheque/41270599.pdf


In [134]:
# many of the correlated files that not it parsed json data are withdrawn
display(pd.DataFrame(df_missing_from_parsed_json_df.drop(['nom', 'name'], axis=1)['filename_fr'].value_counts().head()))
print(f'total number of filenames not in correlation dataset (including withdrawn): {df_missing_from_parsed_json_df.shape[0]}')

Unnamed: 0_level_0,count
filename_fr,Unnamed: 1_level_1
WITHDRAWN,58
41265841.pdf,2
41266213.pdf,1
41266067.pdf,1
41265932.pdf,1


total number of filenames not in correlation dataset (including withdrawn): 129


In [136]:
# most of these seem to be appendices
df_missing_from_lang = parsed_docs_df[parsed_docs_df.filename.isin(not_in_lang)]

df_missing_from_lang.sample(20)

Unnamed: 0,filename,url,year
5739,ToRs_Salmon_E.pdf,https://www.dfo-mpo.gc.ca/csas-sccs/Schedule-Horraire/2007/11/ToRs_Salmon_E.pdf,2007
7335,344588.pdf,https://waves-vagues.dfo-mpo.gc.ca/library-bibliotheque/344588.pdf,2012
4145,358355inuk.pdf,http://waves-vagues.dfo-mpo.gc.ca/Library/358355inuk.pdf,2002
6037,40615443.pdf,http://waves-vagues.dfo-mpo.gc.ca/Library/40615443.pdf,2008
7327,328095.pdf,https://waves-vagues.dfo-mpo.gc.ca/library-bibliotheque/328095.pdf,2012
5725,RedfishZAP_ToRs_E_updated.pdf,https://www.dfo-mpo.gc.ca/csas-sccs/Schedule-Horraire/2007/09/RedfishZAP_ToRs_E_updated.pdf,2007
4668,316077fig5-7.pdf,https://waves-vagues.dfo-mpo.gc.ca/library-bibliotheque/316077fig5-7.pdf,2004
3945,266621-app2.pdf,https://waves-vagues.dfo-mpo.gc.ca/library-bibliotheque/266621-app2.pdf,2002
2826,236288-app3.pdf,https://waves-vagues.dfo-mpo.gc.ca/library-bibliotheque/236288-app3.pdf,1999
5343,Herring4T_agenda_b.pdf,https://www.dfo-mpo.gc.ca/csas-sccs/Schedule-Horraire/2006/03_Mar/Herring4T_agenda_b.pdf,2006


In [140]:
df_missing_from_parsed_json_df.drop(['nom', 'name'], axis=1).sample(10)

Unnamed: 0,pub_number,year,url_fr,url_en,filename_fr,filename_en,file_url_fr,file_url_en
6072,RES 2021/033,2021,http://www.dfo-mpo.gc.ca/csas-sccs/Publications/ResDocs-DocRech/2021/2021_033-fra.html,http://www.dfo-mpo.gc.ca/csas-sccs/Publications/ResDocs-DocRech/2021/2021_033-eng.html,41005958.pdf,41005934.pdf,https://waves-vagues.dfo-mpo.gc.ca/library-bibliotheque/41005958.pdf,https://waves-vagues.dfo-mpo.gc.ca/library-bibliotheque/41005934.pdf
1282,PRO 2024/040,2024,https://www.dfo-mpo.gc.ca/csas-sccs/Publications/Pro-Cr/2024/2024_040-fra.html,https://www.dfo-mpo.gc.ca/csas-sccs/Publications/Pro-Cr/2024/2024_040-eng.html,41265828.pdf,41265816.pdf,https://waves-vagues.dfo-mpo.gc.ca/library-bibliotheque/41265828.pdf,https://waves-vagues.dfo-mpo.gc.ca/library-bibliotheque/41265816.pdf
7642,SAR 2024/062,2024,https://www.dfo-mpo.gc.ca/csas-sccs/Publications/SAR-AS/2024/2024_062-fra.html,https://www.dfo-mpo.gc.ca/csas-sccs/Publications/SAR-AS/2024/2024_062-eng.html,41270757.pdf,41270745.pdf,https://waves-vagues.dfo-mpo.gc.ca/library-bibliotheque/41270757.pdf,https://waves-vagues.dfo-mpo.gc.ca/library-bibliotheque/41270745.pdf
6344,RES 2024/076,2024,https://www.dfo-mpo.gc.ca/csas-sccs/Publications/ResDocs-DocRech/2024/2024_076-fra.html,https://www.dfo-mpo.gc.ca/csas-sccs/Publications/ResDocs-DocRech/2024/2024_076-eng.html,41271038.pdf,41271026.pdf,https://waves-vagues.dfo-mpo.gc.ca/library-bibliotheque/41271038.pdf,https://waves-vagues.dfo-mpo.gc.ca/library-bibliotheque/41271026.pdf
8316,SCR 2024/034,2024,https://www.dfo-mpo.gc.ca/csas-sccs/Publications/ScR-RS/2024/2024_034-fra.html,https://www.dfo-mpo.gc.ca/csas-sccs/Publications/ScR-RS/2024/2024_034-eng.html,41265695.pdf,41265683.pdf,https://waves-vagues.dfo-mpo.gc.ca/library-bibliotheque/41265695.pdf,https://waves-vagues.dfo-mpo.gc.ca/library-bibliotheque/41265683.pdf
1321,RES 1977/005,1977,http://www.dfo-mpo.gc.ca/csas-sccs/publications/resdocs-docrech/1977/1977_005-fra.html,http://www.dfo-mpo.gc.ca/csas-sccs/publications/resdocs-docrech/1977/1977_005-eng.html,WITHDRAWN,WITHDRAWN,WITHDRAWN,WITHDRAWN
404,PRO 2006/021,2006,http://www.dfo-mpo.gc.ca/csas-sccs/publications/withdrawn-retire-fra.htm,http://www.dfo-mpo.gc.ca/csas-sccs/publications/withdrawn-retire-eng.htm,WITHDRAWN,WITHDRAWN,WITHDRAWN,WITHDRAWN
3772,RES 2000/002,2000,http://www.dfo-mpo.gc.ca/csas-sccs/publications/withdrawn-retire-fra.htm,http://www.dfo-mpo.gc.ca/csas-sccs/publications/withdrawn-retire-eng.htm,WITHDRAWN,WITHDRAWN,WITHDRAWN,WITHDRAWN
3776,RES 2000/053,2000,http://www.dfo-mpo.gc.ca/csas-sccs/publications/withdrawn-retire-fra.htm,http://www.dfo-mpo.gc.ca/csas-sccs/publications/withdrawn-retire-eng.htm,WITHDRAWN,WITHDRAWN,WITHDRAWN,WITHDRAWN
7636,SAR 2024/055,2024,https://www.dfo-mpo.gc.ca/csas-sccs/Publications/SAR-AS/2024/2024_055-fra.html,https://www.dfo-mpo.gc.ca/csas-sccs/Publications/SAR-AS/2024/2024_055-eng.html,41266110.pdf,41266109.pdf,https://waves-vagues.dfo-mpo.gc.ca/library-bibliotheque/41266110.pdf,https://waves-vagues.dfo-mpo.gc.ca/library-bibliotheque/41266109.pdf


In [144]:
only_numeric = [re.sub(r'\D', '', item) for item in not_in_parsed if item != 'WITHDRAWN']
only_numeric = [x for x in only_numeric if x]

In [146]:
only_numeric

['41272948',
 '277195',
 '228257',
 '348570',
 '40702285',
 '41265567',
 '41272936',
 '41266122',
 '2025001',
 '41270800',
 '40928743',
 '41266109',
 '41265592',
 '41265609',
 '41270782',
 '41271129',
 '226300',
 '41266079',
 '41272651',
 '40577053',
 '40884296',
 '41273059',
 '41273023',
 '2024041',
 '40687740',
 '40891240',
 '41271142',
 '41266213',
 '41230516',
 '2024048',
 '2023030',
 '331600',
 '40879227',
 '41270678',
 '41270757',
 '41265749',
 '41265828',
 '41265907',
 '41265920',
 '2023031',
 '41266158',
 '2024063',
 '41266080',
 '41270745',
 '40878314',
 '41270599',
 '41270411',
 '41270794',
 '4127068',
 '41272882',
 '41273047',
 '41266067',
 '41266201',
 '7052012156',
 '4127264',
 '41005958',
 '41271087',
 '2023030',
 '41266110',
 '41265841',
 '2024048',
 '2023032',
 '41272961',
 '41266134',
 '41266055',
 '41270630',
 '41270885',
 '2023033',
 '41272808',
 '41265427',
 '41272626',
 '41265737',
 '41271038',
 '41272870',
 '4126583',
 '4127054',
 '4087574',
 '41265610',
 '4126568

In [147]:
parsed_docs_df[parsed_docs_df['filename'].str.contains('|'.join(only_numeric), na=False)]

Unnamed: 0,filename,url,year


In [148]:
# missing files, files with no correlation data
len(not_in_parsed), len(not_in_lang)

(125, 302)

In [51]:
row = lang_df.iloc[0]
get_filepaths(row)

('..\\ParsedPublications\\2023\\41097178.pdf.json',
 '..\\ParsedPublications\\2023\\41098365.pdf.json')

In [59]:
row

year                   2023
pub_number     PRO 2023/002
filename_fr    41097154.pdf
filename_en    41098377.pdf
Name: 1195, dtype: object

In [60]:
get_filepaths(row)

(None, None)

In [76]:
# create word lists for fr / en

en_words = set()
fr_words = set()

n_error = 0
n_good = 0
errors = list()


for i, row in lang_df.iterrows():
    fr_path, en_path = get_filepaths(row, 1977)
    
    if None in [fr_path, en_path]:
        # print('no results in range')
        n_error += 1
        errors.append(row)
        continue
    else:
        if not os.path.exists(fr_path):
            # print(f'path does not exist: {fr_path}')
            continue
        if not os.path.exists(en_path):
            # print(f'path does not exist: {en_path}')
            continue
    
    n_good += 1


# how is this not getting all of them with 1977?
n_error, n_good

(54, 401)

In [88]:
nom = "Compte rendu de l'examen régional par les pairs de l'application du cadre national d'évaluation de la vulnérabilité dans la région des Maritimes; du 22 au 24 novembre 2021"

fr_eng_correlation_df[fr_eng_correlation_df.nom == nom].T

Unnamed: 0,1281
pub_number,PRO 2024/043
year,2024
nom,Compte rendu de l'examen régional par les pairs de l'application du cadre national d'évaluation de la vulnérabilité dans la région des Maritimes; du 22 au 24 novembre 2021
name,"Proceedings of the Regional Peer Review on the Application of the National Vulnerability Framework in the Maritimes Region; November 22-24, 2021"
url_fr,https://www.dfo-mpo.gc.ca/csas-sccs/Publications/Pro-Cr/2024/2024_043-fra.html
url_en,https://www.dfo-mpo.gc.ca/csas-sccs/Publications/Pro-Cr/2024/2024_043-eng.html
filename_fr,41265749.pdf
filename_en,41265737.pdf
file_url_fr,https://waves-vagues.dfo-mpo.gc.ca/library-bibliotheque/41265749.pdf
file_url_en,https://waves-vagues.dfo-mpo.gc.ca/library-bibliotheque/41265737.pdf


In [None]:
# TODO: do not seem to have everything downloaded as json - check if they are misnamed or missing



fr_eng_correlation_df


In [85]:
# display(pd.DataFrame(errors).drop(['nom', 'name'], axis=1))
display(pd.DataFrame(errors))


Unnamed: 0,pub_number,year,nom,name,url_fr,url_en,filename_fr,filename_en,file_url_fr,file_url_en
1281,PRO 2024/043,2024,Compte rendu de l'examen régional par les pairs de l'application du cadre national d'évaluation de la vulnérabilité dans la région des Maritimes; du 22 au 24 novembre 2021,"Proceedings of the Regional Peer Review on the Application of the National Vulnerability Framework in the Maritimes Region; November 22-24, 2021",https://www.dfo-mpo.gc.ca/csas-sccs/Publications/Pro-Cr/2024/2024_043-fra.html,https://www.dfo-mpo.gc.ca/csas-sccs/Publications/Pro-Cr/2024/2024_043-eng.html,41265749.pdf,41265737.pdf,https://waves-vagues.dfo-mpo.gc.ca/library-bibliotheque/41265749.pdf,https://waves-vagues.dfo-mpo.gc.ca/library-bibliotheque/41265737.pdf
1282,PRO 2024/040,2024,Compte rendu de l'examen régional par les pairs du cadre de travail sur le hareng du sud-ouest de la Nouvelle-Écosse et de la baie de Fundy : Partie 2 - Examen du modèle opérationnel de conditionn...,"Proceedings of the Regional Peer Review of the Southwest Nova Scotia/Bay of Fundy Herring Framework: Part 2 - Management Strategy Evaluation Conditioning Operating Model Review; January 20-21, 2020",https://www.dfo-mpo.gc.ca/csas-sccs/Publications/Pro-Cr/2024/2024_040-fra.html,https://www.dfo-mpo.gc.ca/csas-sccs/Publications/Pro-Cr/2024/2024_040-eng.html,41265828.pdf,41265816.pdf,https://waves-vagues.dfo-mpo.gc.ca/library-bibliotheque/41265828.pdf,https://waves-vagues.dfo-mpo.gc.ca/library-bibliotheque/41265816.pdf
1283,PRO 2024/041,2024,Compte rendu de l'examen régional par les pairs de la continuité du cadre de travail sur le hareng du sud-ouest de la Nouvelle-Écosse et de la baie de Fundy : Partie 2 - Examen du modèle opération...,Proceedings of the Regional Peer Review of the Continuation of Southwest Nova Scotia/Bay of Fundy Herring Framework: Part 2 - Management Strategy Evaluation Conditioning Operating Model Developmen...,https://www.dfo-mpo.gc.ca/csas-sccs/Publications/Pro-Cr/2024/2024_041-fra.html,https://www.dfo-mpo.gc.ca/csas-sccs/Publications/Pro-Cr/2024/2024_041-eng.html,41265841.pdf,4126583x.pdf,https://waves-vagues.dfo-mpo.gc.ca/library-bibliotheque/41265841.pdf,https://waves-vagues.dfo-mpo.gc.ca/library-bibliotheque/4126583x.pdf
1284,PRO 2024/041,2024,Compte rendu de l'examen régional par les pairs de la continuité du cadre de travail sur le hareng du sud-ouest de la Nouvelle-Écosse et de la baie de Fundy : Partie 2 - Examen du modèle opération...,Proceedings of the Regional Peer Review of the Continuation of Southwest Nova Scotia/Bay of Fundy Herring Framework: Part 2 - Management Strategy Evaluation Conditioning Operating Model Developmen...,https://www.dfo-mpo.gc.ca/csas-sccs/Publications/Pro-Cr/2024/2024_041-fra.html,https://www.dfo-mpo.gc.ca/csas-sccs/Publications/Pro-Cr/2024/2024_041-eng.html,41265841.pdf,4126583x.pdf,https://waves-vagues.dfo-mpo.gc.ca/library-bibliotheque/41265841.pdf,https://waves-vagues.dfo-mpo.gc.ca/library-bibliotheque/4126583x.pdf
1285,PRO 2024/044,2024,Compte rendu de l'examen par les pairs régional sur l'évaluation du crabe des neiges dans la région des Maritimes ; du 9 au 10 et le 20 mars 2023,"Proceedings of the Regional Peer Review of the Stock Assessment of Snow Crab in Maritimes Region; March 9-10 and 20, 2023",https://www.dfo-mpo.gc.ca/csas-sccs/Publications/Pro-Cr/2024/2024_044-fra.html,https://www.dfo-mpo.gc.ca/csas-sccs/Publications/Pro-Cr/2024/2024_044-eng.html,41265877.pdf,41265853.pdf,https://waves-vagues.dfo-mpo.gc.ca/library-bibliotheque/41265877.pdf,https://waves-vagues.dfo-mpo.gc.ca/library-bibliotheque/41265853.pdf
1286,PRO 2024/045,2024,Compte rendu de l'examen régional par les pairs pour l'évaluation du capelan des divisions 2J+3KL et l'évaluation des points de référence limites proposés; du 6 au 10 mars 2023,"Proceedings of the Regional Peer Review for the Assessment of Divisions 2J+3KL Capelin and Evaluation of Proposed Limit References Points; March 6-10, 2023",https://www.dfo-mpo.gc.ca/csas-sccs/Publications/Pro-Cr/2024/2024_045-fra.html,https://www.dfo-mpo.gc.ca/csas-sccs/Publications/Pro-Cr/2024/2024_045-eng.html,41270630.pdf,41270599.pdf,https://waves-vagues.dfo-mpo.gc.ca/library-bibliotheque/41270630.pdf,https://waves-vagues.dfo-mpo.gc.ca/library-bibliotheque/41270599.pdf
1287,PRO 2024/046,2024,Compte rendu des examens par les pairs régional sur l'évaluation de la morue franche (Gadus morhua) et de la plie canadienne (Hippoglossoides platessoides) de la sous-division 3Ps de l'Organisatio...,Proceedings of the Regional Peer Reviews of the Assessment of Northwest Atlantic Fisheries Organization (NAFO) Subdivision 3Ps Atlantic Cod (Gadus morhua) and Subdivision 3Ps American Plaice (Hipp...,https://www.dfo-mpo.gc.ca/csas-sccs/Publications/Pro-Cr/2024/2024_046-fra.html,https://www.dfo-mpo.gc.ca/csas-sccs/Publications/Pro-Cr/2024/2024_046-eng.html,41270551.pdf,4127054x.pdf,https://waves-vagues.dfo-mpo.gc.ca/library-bibliotheque/41270551.pdf,https://waves-vagues.dfo-mpo.gc.ca/library-bibliotheque/4127054x.pdf
1288,PRO 2024/047,2024,"Compte rendu de la réunion sur les avis scientifique zonale sur l'examen préalable à l'évaluation du COSEPAC concernant le saumon de l'Atlantique; du 26 au 29 octobre 2020, du 1er au 4 février 202...","Proceedings of the Zonal Advisory Meeting on the Pre-COSEWIC Assessment for Atlantic Salmon; October 26-29, 2020, February 1-4, 2021, December 15-16, 2020, and January 18-22, 2021",https://www.dfo-mpo.gc.ca/csas-sccs/Publications/Pro-Cr/2024/2024_047-fra.html,https://www.dfo-mpo.gc.ca/csas-sccs/Publications/Pro-Cr/2024/2024_047-eng.html,41273035.pdf,41273023.pdf,https://waves-vagues.dfo-mpo.gc.ca/library-bibliotheque/41273035.pdf,https://waves-vagues.dfo-mpo.gc.ca/library-bibliotheque/41273023.pdf
1289,PRO 2024/048,2024,Compte rendu de l'examen régional par les pairs sur la détermination des sites de référence et d'une approche de surveillance pour la zone de protection marine du chenal Laurentien; du 22 au 24 ju...,"Proceedings of the Regional Peer Review on the Identification of Reference Sites and a Scientific Monitoring Approach for the Laurentian Channel Marine Protected Area; June 22-24, 2022",https://www.dfo-mpo.gc.ca/csas-sccs/Publications/Pro-Cr/2024/2024_048-fra.html,https://www.dfo-mpo.gc.ca/csas-sccs/Publications/Pro-Cr/2024/2024_048-eng.html,41273059.pdf,41273047.pdf,https://waves-vagues.dfo-mpo.gc.ca/library-bibliotheque/41273059.pdf,https://waves-vagues.dfo-mpo.gc.ca/library-bibliotheque/41273047.pdf
6278,RES 2023/086,2023,Ajustement d'un modèle bayésien de production excédentaire pour la crevette nordique des stocks du golfe du Saint-Laurent,Adjustment of a Bayesian Surplus Production Model for Northern Shrimp for Stocks in Gulf of St. Lawrence,https://www.dfo-mpo.gc.ca/csas-sccs/Publications/ResDocs-DocRech/2023/2023_086-fra.html,https://www.dfo-mpo.gc.ca/csas-sccs/Publications/ResDocs-DocRech/2023/2023_086-eng.html,41265592.pdf,41265579.pdf,https://waves-vagues.dfo-mpo.gc.ca/library-bibliotheque/41265592.pdf,https://waves-vagues.dfo-mpo.gc.ca/library-bibliotheque/41265579.pdf


# testing

# more dictionary options

In [29]:
# !pip install wordlist

In [32]:
# this looks like an AI hallucination, the module doesn't do that 

# from wordlist import words
# 
# english_words = set(words.get_words('en'))
# french_words = set(words.get_words('fr'))

In [34]:
# !pip install pyenchant

In [63]:
# this also looks like an AI hallucination, but the docs do say 'fr' as an example... it just doesn't work 

import enchant

english_dict = enchant.Dict("en_US")
# english_words = {word for word in english_dict}
french_dict = enchant.Dict("fr")
# french_words = {word for word in french_dict}

DictNotFoundError: Dictionary for language 'fr' could not be found
Please check https://pyenchant.github.io/pyenchant/ for details

In [45]:
help(enchant)

Help on package enchant:

NAME
    enchant

DESCRIPTION
    enchant:  Access to the enchant spellchecking library

    This module provides several classes for performing spell checking
    via the Enchant spellchecking library.  For more details on Enchant,
    visit the project website:

        https://abiword.github.io/enchant/

    Spellchecking is performed using 'Dict' objects, which represent
    a language dictionary.  Their use is best demonstrated by a quick
    example::

        >>> import enchant
        >>> d = enchant.Dict("en_US")   # create dictionary for US English
        >>> d.check("enchant")
        True
        >>> d.check("enchnt")
        False
        >>> d.suggest("enchnt")
        ['enchant', 'enchants', 'enchanter', 'penchant', 'incant', 'enchain', 'enchanted']

    Languages are identified by standard string tags such as "en" (English)
    and "fr" (French).  Specific language dialects can be specified by
    including an additional code - for example, "e

In [62]:
english_dict.check('plaice')

True

In [61]:
from enchant.checker import SpellChecker

en_chkr = SpellChecker("en_US")
fr_chkr = SpellChecker("fr")

DefaultLanguageNotFoundError: fr

In [58]:
block = 'Total mortalities at age, based on survey data, are presented in Table'

chkr = SpellChecker("en_US")
chkr.set_text("This is sme sample txt with erors.")

ERROR: sme
ERROR: txt
ERROR: erors


In [59]:
chkr = SpellChecker("en_US")
chkr.set_text("This is text without errors.")
for err in chkr:
    print("ERROR:", err.word)

In [72]:
# https://github.com/dwyl/english-words
en_words_filename = os.path.join("word_lists", "en_words.txt")
with open(en_words_filename, 'r', encoding='utf-8') as f:
    english_words = set(f.read().splitlines())

# https://github.com/51413resu/full-list-of-french-words
fr_words_filename = os.path.join("word_lists", "fr_words.txt")
with open(fr_words_filename, 'r', encoding='utf-8') as f:
    french_words = set(f.read().splitlines())
    
# remove overlapping words
english_words.difference_update(french_words)
french_words.difference_update(english_words)

# remove numeric and "words" with "." in them
english_words.difference_update({w for w in english_words if w.isnumeric() or '.' in w})
french_words.difference_update({w for w in french_words if w.isnumeric() or '.' in w})

In [73]:
block = 'Total mortalities at age, based on survey data, are presented in Table'  # this is french :(

en_count = sum(1 for word in block.split() if word in english_words)
fr_count = sum(1 for word in block.split() if word in french_words)

print('english words:', list(word for word in block.split() if word in english_words))
print('french words:', list(word for word in block.split() if word in french_words))

en_count, fr_count

english words: ['mortalities', 'based', 'survey', 'presented']
french words: ['on', 'are', 'in']


(4, 3)

set()