In [81]:
import json
import os
import random
import re
import tempfile
import pandas as pd

from IPython.display import display, HTML
# Notebook-wide CSS tweaks, injected as one <style> element each
for style_tag in (
    "<style>.container { width:100% !important; }</style>",  # jupyter notebook full-width display
    "<style>.dataframe td { white-space: nowrap; }</style>",  # no text wrapping
):
    display(HTML(style_tag))

# pandas formatting
for option_name, option_value in {
    'display.float_format': '{:.1f}'.format,
    'display.max_columns': None,
    'display.max_rows': 100,
    'display.max_colwidth': 200,
}.items():
    pd.set_option(option_name, option_value)

# import url data

In [82]:
links_folder = "website_reports"

# Read every Excel export in the folder.
#  - sorted(): os.listdir() returns names in arbitrary order, which would make the
#    row order (and therefore the index) differ between machines/runs.
#  - "~$" prefix: lock files Excel creates next to an open workbook also end in
#    ".xlsx" but are not readable workbooks.
dataframes = [
    pd.read_excel(os.path.join(links_folder, file))
    for file in sorted(os.listdir(links_folder))
    if file.endswith(".xlsx") and not file.startswith("~$")
]
if not dataframes:
    raise ValueError(f"no .xlsx files found in {links_folder!r}")

# Positional column names for the exports; '_' marks columns that are dropped below.
column_names = ['type', 'year', 'pub_number', '_', 'nom', 'name', 'url_fr', 'url_en', '_', '_', '_', '_']
combined_df = pd.concat(dataframes, ignore_index=True)
combined_df.columns = column_names

# Keep only the publication types of interest and discard the placeholder columns.
# .copy() makes the result an independent frame, so adding columns to it in later
# cells does not trigger SettingWithCopyWarning.
types = ['RES', 'SAR', 'PRO', 'SSR', 'SCR', 'ESR', 'HSR']
combined_df = combined_df.loc[combined_df['type'].isin(types), combined_df.columns != '_'].copy()

combined_df.sample(10)

Unnamed: 0,type,year,pub_number,nom,name,url_fr,url_en
12984,RES,2014,2014/013,"Comparaison de l'âge, de la croissance et de la mortalité d'ombles de fontaine (Salvelinus fontinalis) échantillonnées dans sept lacs de la péninsule Northern sur l'île de Terre-euve afin d'évalue...","Comparing Age, Growth, and Mortality of Brook Trout (Salvelinus fontinalis) sampled from Seven Lakes on the Northern Peninsula, Newfoundland to assess the efficacy of established regulations for T...",http://www.dfo-mpo.gc.ca/csas-sccs/publications/resdocs-docrech/2014/2014_013-fra.html,http://www.dfo-mpo.gc.ca/csas-sccs/publications/resdocs-docrech/2014/2014_013-eng.html
4973,RES,1995,1995/114,,"Correspondence analysis of length frequencies of cod samples from 4T, 4Vn, and 4Vs (9 p.)",http://www.dfo-mpo.gc.ca/csas-sccs/publications/resdocs-docrech/1995/1995_114-fra.html,http://www.dfo-mpo.gc.ca/csas-sccs/publications/resdocs-docrech/1995/1995_114-eng.html
18993,SCR,2018,2018/053,Évaluation de l'approche des conditions de référence pour la surveillance des activités d'exploitation des placers du Yukon,Evaluation of the reference condition approach for Yukon placer mining monitoring,http://www.dfo-mpo.gc.ca/csas-sccs/Publications/ScR-RS/2018/2018_053-fra.html,http://www.dfo-mpo.gc.ca/csas-sccs/Publications/ScR-RS/2018/2018_053-eng.html
694,PRO,2010,2010/017,Processus consultatif régional sur les possibilités de pêches de la capucette et du balaou dans la partie néo-écossaise du sud du golfe du Saint-Laurent; les 15 et 16 décembre 2009,"Regional Advisory Process on Opportunity for Atlantic silverside and Atlantic saury fisheries in the Nova Scotia portion of the southern Gulf of St. Lawrence; December 15-16, 2009",http://www.dfo-mpo.gc.ca/csas-sccs/publications/pro-cr/2010/2010_017-fra.htm,http://www.dfo-mpo.gc.ca/csas-sccs/publications/pro-cr/2010/2010_017-eng.htm
19046,SCR,2020,2020/008,"Mise à jour des indices d'abondance jusqu'en 2019 pour les stocks de Plie Rouge de la Div. 4T de l'OPANO, de Plie Grise des Divs. 4RST de l'OPANO et de Merluche Blanche de la Div. 4T de l'OPANO","Updated indices of abundance to 2019 for Winter Flounder from NAFO Div. 4T, Witch Flounder from NAFO Divs. 4RST and White Hake from NAFO Div. 4T",http://www.dfo-mpo.gc.ca/csas-sccs/Publications/ScR-RS/2020/2020_008-fra.html,http://www.dfo-mpo.gc.ca/csas-sccs/Publications/ScR-RS/2020/2020_008-eng.html
11045,RES,2010,2010/091,"Évaluation de stock de crabe des neiges, (Chionoecetes opilio), dans le sud du golfe du Saint-Laurent (zones 12, 19, 12E et 12F) en 2009","The 2009 assessment of snow crab, (Chionoecetes opilio), stock in the southern Gulf of St. Lawrence (Areas 12, 19, 12E and 12F)",http://www.dfo-mpo.gc.ca/csas-sccs/Publications/ResDocs-DocRech/2010/2010_091-fra.html,http://www.dfo-mpo.gc.ca/csas-sccs/Publications/ResDocs-DocRech/2010/2010_091-eng.html
9826,RES,2007,2007/005,Zones de production de pétoncle de la baie de Fundy : état des stocks pour 2006 et prévisions pour 2007,Scallop Production Areas in the Bay of Fundy: Stock Status for 2006 and Forecast for 2007,http://www.dfo-mpo.gc.ca/csas-sccs/publications/resdocs-docrech/2007/2007_005-fra.htm,http://www.dfo-mpo.gc.ca/csas-sccs/publications/resdocs-docrech/2007/2007_005-eng.htm
1095,PRO,2016,2016/010,"Compte rendu de l'examen zonal par les pairs préalable à l'évaluation du COSEPAC concernant l'esturgeon jaune (Acipenser fulvescens), unités désignables 7-8, au Canada; les 3 et 4 novembre 2015","Proceedings of the zonal peer review of the Pre-COSEWIC Assessment for Lake Sturgeon, Acipenser fulvescens, Designatable Units 7-8, in Canada; November 3-4, 2015",http://www.dfo-mpo.gc.ca/csas-sccs/Publications/Pro-Cr/2016/2016_010-fra.html,http://www.dfo-mpo.gc.ca/csas-sccs/Publications/Pro-Cr/2016/2016_010-eng.html
1684,RES,1980,1980/001,,Assessment of 4VWX-5-6 pollock,http://www.dfo-mpo.gc.ca/csas-sccs/publications/resdocs-docrech/1980/1980_001-fra.html,http://www.dfo-mpo.gc.ca/csas-sccs/publications/resdocs-docrech/1980/1980_001-eng.html
18604,SCR,2008,2008/006,Programme de levés géophysiques dans l'ouest de T. N. L. de NWest Energy Inc. Examen de la section sur les relevés relatifs aux pêches commerciales du rapport d'évaluation environnementale,NWest Energy Inc. Western NL Geophysical survey program: review of The EA report,http://www.dfo-mpo.gc.ca/csas-sccs/publications/scr-rs/2008/2008_006-fra.htm,http://www.dfo-mpo.gc.ca/csas-sccs/publications/scr-rs/2008/2008_006-eng.htm


In [83]:
# Rows where the French and English URLs are identical seem to be mostly
# documents that contain their own translation (both languages in one file).
# DO NOT REMOVE THESE

same_url_mask = combined_df['url_fr'] == combined_df['url_en']
combined_df[same_url_mask].sample(10)

Unnamed: 0,type,year,pub_number,nom,name,url_fr,url_en
19688,SSR,1999,1999/A2-02,,Subdivision 3Ps cod,http://waves-vagues.dfo-mpo.gc.ca/Library/234379.pdf,http://waves-vagues.dfo-mpo.gc.ca/Library/234379.pdf
359,PRO,2003,2003/017,Procès-verbal de la réunion du souscomité du poisson de fond du CEESP; le 20 mai 2003.,"Proceedings of the PSARC Groundfish Subcommittee Meeting; May 20, 2003.",http://waves-vagues.dfo-mpo.gc.ca/Library/274209.pdf,http://waves-vagues.dfo-mpo.gc.ca/Library/274209.pdf
32,PRO,1996,1996/001,,Proceedings of Peer Review and Client Consultations for Diadromous Fish Stocks in the Maritime Provinces in 1995,http://waves-vagues.dfo-mpo.gc.ca/Library/197005.pdf,http://waves-vagues.dfo-mpo.gc.ca/Library/197005.pdf
506,PRO,2006,2006/046,Compte rendu de la réunion du Processus consultatif régional concernant la crevette de l'est du plateau néo-écossais; le 5 décembre 2006,Proceedings of the Maritimes Regional Advisory Process on Eastern Scotian Shelf Shrimp; 5 December 2006,http://waves-vagues.dfo-mpo.gc.ca/Library/40511571.pdf,http://waves-vagues.dfo-mpo.gc.ca/Library/40511571.pdf
213,PRO,2001,2001/041,Procès-verbale des séances d'examen par les pairs pour le crabe des neiges du Sud du golfe du Saint-Laurent Région de geation des pêches du Golfe; du 13 au 16 février 2001.,"Proceedings of the Peer Review of Snow Crab Stocks in the Gulf of St. Lawrence Gulf Fisheries Management Region; February 13-16, 2001.",http://waves-vagues.dfo-mpo.gc.ca/Library/264840.pdf,http://waves-vagues.dfo-mpo.gc.ca/Library/264840.pdf
422,PRO,2004,2004/049,"Réunion du Processus consultatif régional des provinces Maritimes Mise à jour des évaluations des stocks de pétoncle des APP 1,3,4,5 et 6; 8-9 décembre 2004.","Proceedings of the Maritimes Regional Advisory Process Stock Assessment Update of SPA 1,3,4,5 and 6 Scallop Stocks; 8-9 December 2004.",http://waves-vagues.dfo-mpo.gc.ca/Library/319426.pdf,http://waves-vagues.dfo-mpo.gc.ca/Library/319426.pdf
456,PRO,2005,2005/029,Compte rendu des réunions du Processus consultatif régional sur les pétoncles de la ZPP 29; le 12 mai 2005.,Proceedings of the Maritime Provinces Regional Advisory Process on Scallops in SFA 29; 12 May 2005.,http://waves-vagues.dfo-mpo.gc.ca/Library/322811.pdf,http://waves-vagues.dfo-mpo.gc.ca/Library/322811.pdf
554,PRO,2007,2007/033,Évaluations des pêches dans la rivière Saguenay en 2006; 24 octobre 2006,Assessments of fishery in the Saguenay River in 2006; October 24 2006,http://waves-vagues.dfo-mpo.gc.ca/Library/330790.pdf,http://waves-vagues.dfo-mpo.gc.ca/Library/330790.pdf
197,PRO,2000,2000/014,,Proceedings of a Workshop on the Ecosystem Considerations for the Eastern Scotian Shelf Integrated Management (ESSIM) Area; 19-23 June 2000.,http://waves-vagues.dfo-mpo.gc.ca/Library/251122.pdf,http://waves-vagues.dfo-mpo.gc.ca/Library/251122.pdf
19858,SSR,2001,2001/C2-07,,Iceland Scallop in Newfoundland and Labrador,http://waves-vagues.dfo-mpo.gc.ca/Library/254392.pdf,http://waves-vagues.dfo-mpo.gc.ca/Library/254392.pdf


In [84]:
# how many pdf links are missing?
# (counts, per publication type, the rows whose English URL ends in html/htm)

is_html_link = combined_df['url_en'].str.endswith(('html', 'htm'))
combined_df.loc[is_html_link, 'type'].value_counts()

type
RES    5064
SAR    1128
PRO     843
SCR     656
ESR       1
Name: count, dtype: int64

In [85]:
# Add "<type> <pub_number>" (e.g. "RES 1992/061") as the key used to match parsed documents
combined_df = combined_df.assign(
    formatted_pub_number=combined_df['type'] + " " + combined_df['pub_number']
)

combined_df.sample(1).T

Unnamed: 0,4016
type,RES
year,1992
pub_number,1992/061
nom,
name,Why was inshore capelin (Mallotus villosus) spawning delayed during 1991?
url_fr,http://www.dfo-mpo.gc.ca/csas-sccs/publications/resdocs-docrech/1992/1992_061-fra.html
url_en,http://www.dfo-mpo.gc.ca/csas-sccs/publications/resdocs-docrech/1992/1992_061-eng.html
formatted_pub_number,RES 1992/061


In [86]:
# how many are html links vs pdf links?
url_en = combined_df['url_en']
for suffixes in (('html', 'htm'), 'pdf'):
    display(url_en.str.endswith(suffixes).value_counts())

url_en
True     7692
False    1369
Name: count, dtype: int64

url_en
False    7692
True     1369
Name: count, dtype: int64

# import ParsedPublication data

In [87]:
parsed_docs_folder = os.path.join("..", "ParsedPublications")

# Publication-type code -> header phrases (English and French) that announce that type.
type_dict = {
    'RES': ['research document', 'res. doc. no.', 'res. doc.', 'res doc', 'res.doc.', 'res no.', 'resdoc', 'res.', 'res', 'documents de recherche', 'document de recherche'],
    'SCR': ['science response', 'science response report', 'réponse des sciences', 'réponses des sciences', 'réponse scientifique'],
    'PRO': ['proceedings series', 'compte rendu', 'compte rendus', 'comptes rendu', 'comptes rendus'],
    'SAR': ['science advisory report', 'avis scientifique'],
}
data = []
errors = dict()  # filename -> cleaned header snippet, for documents where no pub number was found

# One substitution regex per type code, compiled once instead of on every call.
# Regex alternation takes the first alternative that matches at a position, so the
# phrases are ordered longest-first; otherwise 'science response' shadows
# 'science response report' and 'compte rendu' shadows 'compte rendus'.
_phrase_patterns = {
    key: re.compile(
        "(" + "|".join(re.escape(phrase.lower()) for phrase in sorted(phrases, key=len, reverse=True)) + ")",
        flags=re.IGNORECASE,
    )
    for key, phrases in type_dict.items()
}

# One extraction regex per type code: "<KEY>[ no.] <year> / <number>", where the
# 1-3 digits of the number may be separated by stray whitespace (e.g. "99/ 1 7").
# Group 1 is the year, group 2 the (possibly spaced-out) number.
# NOTE(review): the two '.' are unescaped, so they match any single character;
# tightening them to r'\.' would change which headers match, so they are kept as-is.
_pub_number_patterns = {
    key: re.compile(
        rf"{key}.?(?:\s+no.)?\s+(\d{{2,4}})\s*/\s*(\d(?:\s*\d){{0,2}})",
        flags=re.IGNORECASE,
    )
    for key in type_dict
}


def extract_pub_number(text, filename):
    """Extract a canonical publication number such as "RES 1999/017" from document text.

    Only the first 2000 characters are inspected. Known type phrases are replaced
    by their type code, then the first type code (in ``type_dict`` order) that is
    followed by a "<year>/<number>" pattern wins.

    Parameters
    ----------
    text : str or None
        Full document text; ``None`` is treated as empty text.
    filename : str
        Key under which the cleaned snippet is stored in the module-level
        ``errors`` dict when nothing is found.

    Returns
    -------
    str or None
        "<KEY> <year>/<number>", with a 2-digit year prefixed by "19" and the number
        zero-padded to 3 digits; ``None`` when no publication number is found
        (in which case ``errors[filename]`` is set).
    """
    # Collapse all whitespace runs so the patterns only have to deal with single spaces.
    text_snippet = re.sub(r"\s+", " ", (text or "")[:2000].lower())

    for key, phrase_pattern in _phrase_patterns.items():
        text_snippet = phrase_pattern.sub(key, text_snippet)

    for key, number_pattern in _pub_number_patterns.items():
        match = number_pattern.search(text_snippet)
        if match is None:
            continue

        year = match.group(1)
        number = re.sub(r"\s+", "", match.group(2))  # "1 7" -> "17"

        # Normalize 2-digit years to 4 digits (e.g., 77 -> 1977).
        # NOTE(review): assumes every 2-digit year is 19xx — confirm no 2-digit
        # years from 2000 onward occur in the corpus.
        if len(year) == 2:
            year = f"19{year}"

        # Build the result from the canonical key rather than from the matched text,
        # so prefixes such as "RES.", "PROs" or "RES no." cannot leak into the output.
        # Zero-pad publication numbers (e.g., 15 -> 015).
        return f"{key} {year}/{number.zfill(3)}"

    errors[filename] = " ".join(text_snippet.split())
    return None

# Only year folders within this inclusive range are scanned.
min_year = 1977
max_year = 2024

# sorted(): os.listdir() order is arbitrary; sorting keeps the row order of
# parsed_docs_df reproducible across machines and runs.
for year_folder in sorted(os.listdir(parsed_docs_folder)):
    if not year_folder.isnumeric() or not (min_year <= int(year_folder) <= max_year):
        continue

    year_path = os.path.join(parsed_docs_folder, year_folder)
    if not os.path.isdir(year_path):
        continue
    print(f"Analysing: {year_path}")

    for json_file in sorted(os.listdir(year_path)):
        if not json_file.endswith(".json"):
            continue

        json_path = os.path.join(year_path, json_file)
        with open(json_path, 'r', encoding='utf-8') as file:
            json_data = json.load(file)

        filename = json_data.get('name')
        # `or ''` also covers a JSON null, which .get('text', '') would pass through as None
        text = json_data.get('text') or ''

        data.append({
            'filename': filename,
            'url': json_data.get('url'),
            'year': json_data.get('publicationYear'),
            'pub_number': extract_pub_number(text, filename),
        })

# Explicit columns so the frame has the expected schema even when no file was found.
parsed_docs_df = pd.DataFrame(data, columns=['filename', 'url', 'year', 'pub_number'])

# `errors` is keyed by filename, so documents sharing a filename collapse into one
# entry there; the row count below is the number of documents actually affected.
print('\nnumber of missing pub numbers:', len(errors))  # was 1806 (chatgpt with bug), 1990 (copilot fixing the bug)
print('number of rows without a pub number:', parsed_docs_df['pub_number'].isna().sum())

Analysing: ..\ParsedPublications\1977
Analysing: ..\ParsedPublications\1978
Analysing: ..\ParsedPublications\1979
Analysing: ..\ParsedPublications\1980
Analysing: ..\ParsedPublications\1981
Analysing: ..\ParsedPublications\1982
Analysing: ..\ParsedPublications\1983
Analysing: ..\ParsedPublications\1984
Analysing: ..\ParsedPublications\1985
Analysing: ..\ParsedPublications\1986
Analysing: ..\ParsedPublications\1987
Analysing: ..\ParsedPublications\1988
Analysing: ..\ParsedPublications\1989
Analysing: ..\ParsedPublications\1990
Analysing: ..\ParsedPublications\1991
Analysing: ..\ParsedPublications\1992
Analysing: ..\ParsedPublications\1993
Analysing: ..\ParsedPublications\1994
Analysing: ..\ParsedPublications\1995
Analysing: ..\ParsedPublications\1996
Analysing: ..\ParsedPublications\1997
Analysing: ..\ParsedPublications\1998
Analysing: ..\ParsedPublications\1999
Analysing: ..\ParsedPublications\2000
Analysing: ..\ParsedPublications\2001
Analysing: ..\ParsedPublications\2002
Analysing: .

In [88]:
# eyeball a few random parsed documents and their extracted pub numbers
parsed_docs_df.sample(n=10)

Unnamed: 0,filename,url,year,pub_number
9415,40610573.pdf,https://waves-vagues.dfo-mpo.gc.ca/library-bibliotheque/40610573.pdf,2016,SCR 2016/031
12475,41240509.pdf,https://waves-vagues.dfo-mpo.gc.ca/library-bibliotheque/41240509.pdf,2023,SCR 2024/009
3558,256631.pdf,http://waves-vagues.dfo-mpo.gc.ca/Library/256631.pdf,2001,PRO 2001/018
698,91385.pdf,https://waves-vagues.dfo-mpo.gc.ca/library-bibliotheque/91385.pdf,1986,RES 1986/043
4494,281862.pdf,https://waves-vagues.dfo-mpo.gc.ca/library-bibliotheque/281862.pdf,2004,RES 2004/003
5669,343519.pdf,http://waves-vagues.dfo-mpo.gc.ca/Library/343519.pdf,2007,PRO 2007/059
12340,4120671x.pdf,https://waves-vagues.dfo-mpo.gc.ca/library-bibliotheque/4120671x.pdf,2023,SAR 2023/037
1028,115912.pdf,https://waves-vagues.dfo-mpo.gc.ca/library-bibliotheque/115912.pdf,1990,RES 1990/007
1089,116594.pdf,https://waves-vagues.dfo-mpo.gc.ca/library-bibliotheque/116594.pdf,1990,RES 1990/069
6321,340252.pdf,https://waves-vagues.dfo-mpo.gc.ca/library-bibliotheque/340252.pdf,2009,RES 2009/110


In [89]:
# Spot-check one randomly chosen document for which no pub number was found
# (most of these look like stock status reports)

random_error = random.choice(list(errors))

print(random_error, errors[random_error], sep="\n")

211673.pdf
fisheries pêches and oceans et océans dfo science maritimes region stock status report d3-10 april 1997 atlantic salmon eastern shore nova scotia sfa 20 the fishery the 1996 angling fishery was limited to a hook-and-release fishery except on east river, sheet harbour, where a small salmon harvest was permitted. the angling seasons were extended at the request of client groups on east river sheet harbour by one month until september 30; on the liscomb river by 17 days until september 15; and on the st. mary’s river, by 46 days from an earlier opening date of may 10 to a later closing date of september 30. the seasons on the remaining rivers in the area have not changed for several years and were open from june 1 to august 29 except those in the eastern portion of the area which were open from june 24 to september 22. the atlantic salmon sport catch on these rivers, as estimated from license stubs, was 21 small salmon retained (east river), 862 small salmon released and 335 la

In [90]:
display_in_notepad = False

def display_errors_in_notepad(errors, n_char=400, editor="notepad"):
    """Write the failed-extraction snippets to a temporary text file and open it in an editor.

    Snippets containing "stock status report" in their first ``n_char`` characters
    are left out. The call blocks until the editor process exits; the temporary
    file is removed afterwards, even if writing or launching fails.

    Parameters
    ----------
    errors : dict
        Mapping of filename -> text snippet.
    n_char : int, default 400
        Number of leading characters of each snippet to write.
    editor : str, default "notepad"
        Executable used to open the file.
    """
    import subprocess  # local import: only this optional debugging helper needs it

    temp_file = tempfile.NamedTemporaryFile(delete=False, mode='w', suffix='.txt', encoding='utf-8')
    try:
        with temp_file:
            for k, v in errors.items():
                v = v[:n_char].lower()

                if "stock status report" not in v:
                    temp_file.write(f'++++++++++++++++++++ {k} ++++++++++++++++++++\n')
                    temp_file.write(f'{v}\n\n')

        # Argument list instead of a shell string: temp paths containing spaces
        # (common in Windows user directories) are passed through intact.
        try:
            subprocess.run([editor, temp_file.name], check=False)
        except FileNotFoundError:
            print(f"could not launch {editor!r}: executable not found")
    finally:
        os.unlink(temp_file.name)

if display_in_notepad:
    display_errors_in_notepad(errors)

In [91]:
# Why this can differ from len(errors): `errors` is a dict keyed by filename, so
# documents without a pub number that share a filename (or have no 'name' at all)
# overwrite each other there, while every document keeps its own row here.
parsed_docs_df['pub_number'].isna().value_counts()

pub_number
False    10930
True      1822
Name: count, dtype: int64

# How many documents from the website reports have no matching ParsedPublications entry?

In [92]:
# Build the lookup set once: the previous form rebuilt the full pub_number list and
# scanned it linearly for every element of combined_df.
parsed_pub_numbers = set(parsed_docs_df.pub_number.to_list())
missing_pub_numbers = [x for x in combined_df.formatted_pub_number.to_list() if x not in parsed_pub_numbers]

len(missing_pub_numbers)

1076

In [93]:
# Unmatched documents whose English URL is still an HTML page (i.e. no direct PDF link)
is_missing = combined_df['formatted_pub_number'].isin(missing_pub_numbers)
is_html_link = combined_df['url_en'].str.endswith(('html', 'htm'))

(
    combined_df[is_missing & is_html_link]
    .drop(columns=['pub_number', 'name', 'nom'])
    .sample(20)
)

Unnamed: 0,type,year,url_fr,url_en,formatted_pub_number
469,PRO,2005,http://www.dfo-mpo.gc.ca/csas-sccs/publications/withdrawn-retire-fra.htm,http://www.dfo-mpo.gc.ca/csas-sccs/publications/withdrawn-retire-eng.htm,PRO 2005/021
2110,RES,1983,http://www.dfo-mpo.gc.ca/csas-sccs/publications/resdocs-docrech/1983/1983_017-fra.html,http://www.dfo-mpo.gc.ca/csas-sccs/publications/resdocs-docrech/1983/1983_017-eng.html,RES 1983/017
1899,RES,1981,http://www.dfo-mpo.gc.ca/csas-sccs/publications/resdocs-docrech/1981/1981_041-fra.html,http://www.dfo-mpo.gc.ca/csas-sccs/publications/resdocs-docrech/1981/1981_041-eng.html,RES 1981/041
6405,RES,1999,http://www.dfo-mpo.gc.ca/csas-sccs/publications/resdocs-docrech/1999/1999_045-fra.htm,http://www.dfo-mpo.gc.ca/csas-sccs/publications/resdocs-docrech/1999/1999_045-eng.htm,RES 1999/045
6431,RES,1999,http://www.dfo-mpo.gc.ca/csas-sccs/publications/resdocs-docrech/1999/1999_055-fra.htm,http://www.dfo-mpo.gc.ca/csas-sccs/publications/resdocs-docrech/1999/1999_055-eng.htm,RES 1999/055
4329,RES,1993,http://www.dfo-mpo.gc.ca/csas-sccs/Publications/ResDocs-DocRech/1993/1993_067-fra.html,http://www.dfo-mpo.gc.ca/csas-sccs/Publications/ResDocs-DocRech/1993/1993_067-eng.html,RES 1993/067
17051,RES,2024,https://www.dfo-mpo.gc.ca/csas-sccs/Publications/ResDocs-DocRech/2024/2024_070-fra.html,https://www.dfo-mpo.gc.ca/csas-sccs/Publications/ResDocs-DocRech/2024/2024_070-eng.html,RES 2024/070
5066,RES,1996,http://www.dfo-mpo.gc.ca/csas-sccs/publications/resdocs-docrech/1996/1996_004-fra.htm,http://www.dfo-mpo.gc.ca/csas-sccs/publications/resdocs-docrech/1996/1996_004-eng.htm,RES 1996/004
1730,RES,1980,http://www.dfo-mpo.gc.ca/csas-sccs/publications/resdocs-docrech/1980/1980_031-fra.html,http://www.dfo-mpo.gc.ca/csas-sccs/publications/resdocs-docrech/1980/1980_031-eng.html,RES 1980/031
7169,RES,2000,http://www.dfo-mpo.gc.ca/csas-sccs/publications/resdocs-docrech/2000/2000_109-fra.htm,http://www.dfo-mpo.gc.ca/csas-sccs/publications/resdocs-docrech/2000/2000_109-eng.htm,RES 2000/109


In [100]:
# spot check: website-report row for RES 1999/017
combined_df.loc[combined_df['formatted_pub_number'] == "RES 1999/017"].T

Unnamed: 0,6312
type,RES
year,1999
pub_number,1999/017
nom,
name,Flatfish Stock Assessment for the West Coast of Canada for 1998 and recommended yield options for 1999.
url_fr,http://www.dfo-mpo.gc.ca/csas-sccs/publications/resdocs-docrech/1999/1999_017-fra.htm
url_en,http://www.dfo-mpo.gc.ca/csas-sccs/publications/resdocs-docrech/1999/1999_017-eng.htm
formatted_pub_number,RES 1999/017


In [101]:
# NOTE: works as intended -> "Research Document 99/ 1 7" is converted to "RES 1999/017"
parsed_docs_df.loc[parsed_docs_df['pub_number'] == "RES 1999/017"].T

Unnamed: 0,2756
filename,233840.pdf
url,https://waves-vagues.dfo-mpo.gc.ca/library-bibliotheque/233840.pdf
year,1999
pub_number,RES 1999/017


In [94]:
# manually check:

# errors['314807.pdf']  # the only ESR without PDF data; it was not included in the original script, but could be added manually: (id, fr, en) = (ESR 2004/006, 314807.pdf, 314806.pdf)
# a bunch have been withdrawn
# errors['102275.pdf']  # not an error: the translation is inside the same document, single pdf (also 186156.pdf, 79442.pdf, and more)

# TODO: 277.pdf used to fail to match because "Res. Doc. 81/ 51" was converted to "RES 1981/ 51" instead of "RES 1981/051".
# NOTE(review): the parsed_docs_df lookup for 277.pdf now shows "RES 1981/051", so this looks resolved — confirm and drop the TODO.
errors.get('277.pdf')

In [95]:
# TODO: a pub_number of "RES 1981/ 51" was previously written here instead of "RES 1981/051".
# NOTE(review): the output of this cell now shows "RES 1981/051" — confirm the issue is fixed and remove this TODO.

parsed_docs_df[parsed_docs_df.filename == "277.pdf"]

Unnamed: 0,filename,url,year,pub_number
253,277.pdf,https://waves-vagues.dfo-mpo.gc.ca/library-bibliotheque/277.pdf,1981,RES 1981/051


In [96]:
# TODO: fix these mistakes
# List every extracted pub_number that is not in the canonical "<TYPE> YYYY/NNN" form.
# This is a superset of the earlier "contains two whitespace characters" check: it also
# catches single-space malformations such as "RES. 1981/051" or "RES 1981/51".
canonical_pub_number = rf"(?:{'|'.join(type_dict)}) \d{{4}}/\d{{3}}"
pub_numbers = parsed_docs_df['pub_number']
parsed_docs_df[pub_numbers.notna() & ~pub_numbers.str.fullmatch(canonical_pub_number, na=False)]

Unnamed: 0,filename,url,year,pub_number


# Check single-link documents (url_fr == url_en) for translations contained in the same PDF

In [97]:
# how many files are just a single link (same URL for French and English)?
single_link_df = combined_df[combined_df['url_fr'] == combined_df['url_en']]

print(len(single_link_df))

single_link_df.sample(10)

561


Unnamed: 0,type,year,pub_number,nom,name,url_fr,url_en,formatted_pub_number
334,PRO,2003,2003/039,Compte rendu du Processus consultatif régional des Maritimes concernant le crabe des neiges de l'est du plateau néo-écossais; du 26 au 27 février 2003.,Proceedings of the Maritimes Regional Advisory Process of the Eastern Scotian Shelf Snow Crab; 26-27 February 2003.,http://waves-vagues.dfo-mpo.gc.ca/Library/277996.pdf,http://waves-vagues.dfo-mpo.gc.ca/Library/277996.pdf,PRO 2003/039
288,PRO,2002,2002/012,,"Proceedings of the National Marine Mammal Review Committee, Quebec, Feb. 18-20, 2002",http://waves-vagues.dfo-mpo.gc.ca/Library/264851.pdf,http://waves-vagues.dfo-mpo.gc.ca/Library/264851.pdf,PRO 2002/012
19715,SSR,1999,1999/A6-13,,Redstripe Rockfish British Columbia Coast,http://waves-vagues.dfo-mpo.gc.ca/Library/331718.pdf,http://waves-vagues.dfo-mpo.gc.ca/Library/331718.pdf,SSR 1999/A6-13
502,PRO,2006,2006/039,"Compte rendu de la réunion du Processus consultatif régional concernant l'évaluation du potentiel de rétablissement du requin taupe bleu, du requin blanc et de la carette de l'Atlantique; 28-30 no...","Proceedings of the Maritime Provinces Recovery Potential Assessment of Atlantic Shortfin Mako, White Shark, and Loggerhead Turtle; 28-30 November 2006",http://waves-vagues.dfo-mpo.gc.ca/Library/326922.pdf,http://waves-vagues.dfo-mpo.gc.ca/Library/326922.pdf,PRO 2006/039
181,PRO,2000,2000/015,,"Report of the PSARC Habitat Subcommittee Meeting; August 22, 2000.",http://waves-vagues.dfo-mpo.gc.ca/Library/331484.pdf,http://waves-vagues.dfo-mpo.gc.ca/Library/331484.pdf,PRO 2000/015
351,PRO,2003,2003/040,Procès-verbal de la réunion du Sous-comité sur le poisson de fond du CEESP; 8 et 9 décembre 2003.,"Proceedings of the PSARC Groundfish Subcommittee Meeting; December 8-9, 2003.",http://waves-vagues.dfo-mpo.gc.ca/Library/345261.pdf,http://waves-vagues.dfo-mpo.gc.ca/Library/345261.pdf,PRO 2003/040
486,PRO,2006,2006/026,"Ateliers d'évaluation des dommages acceptables pour les espèces en péril d'eau douce dans la Région du Centre et de l'Arctique; les 18 et 19 octobre 2005, les 8 et 9 février 2006 et les 13 et 14 f...","Allowable Harm Analysis Workshops for Freshwater Species at Risk in Central and Arctic Region; October 18-19, 2005; February 8-9, 2006; and February 13-14, 2006.",http://waves-vagues.dfo-mpo.gc.ca/Library/325726.pdf,http://waves-vagues.dfo-mpo.gc.ca/Library/325726.pdf,PRO 2006/026
85,PRO,1998,1998/004,,"Report of the PSARC Invertebrate Subcommittee Meeting January 26, and 28-30, 1998 and the Steering Committee Meeting February 18, 1998",http://waves-vagues.dfo-mpo.gc.ca/Library/359491.pdf,http://waves-vagues.dfo-mpo.gc.ca/Library/359491.pdf,PRO 1998/004
379,PRO,2004,2004/020,Compte rendu d'une réunion du Processus consultatif régional au sujet des dommages acceptables au saumon atlantique de l'arrière-baie de Fundy en tant qu'espèce en péril; le 6 avril 2004.,Proceedings of a Regional Advisory Process Meeting on the Level of Allowable Harm for Inner Bay of Fundy Atlantic Salmon in Support of Species at Risk; 6 April 2004.,http://waves-vagues.dfo-mpo.gc.ca/Library/283267.pdf,http://waves-vagues.dfo-mpo.gc.ca/Library/283267.pdf,PRO 2004/020
19656,SSR,1999,1999/A2-19,,Newfoundland Region Groundfish Overview,http://waves-vagues.dfo-mpo.gc.ca/Library/317695.pdf,http://waves-vagues.dfo-mpo.gc.ca/Library/317695.pdf,SSR 1999/A2-19


# break into paragraph chunks

In [98]:
# TODO: break into paragraph chunks for better correlation 
#  TODO (OPTIONAL): clean excess characters
#  TODO (OPTIONAL): make sure french-friendly encoding is used (at least check if that makes a difference)
