In [121]:
import json
import os
import random
import re
import tempfile
import pandas as pd

from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>")) # jupyter notebook full-width display
display(HTML("<style>.dataframe td { white-space: nowrap; }</style>")) # no text wrapping

# pandas formatting
pd.set_option('display.float_format', '{:.1f}'.format)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_colwidth', 200)

# import url data

In [122]:
links_folder = "website_reports"

dataframes = []

for file in os.listdir(links_folder):
    if file.endswith(".xlsx"):
        file_path = os.path.join(links_folder, file)
        df = pd.read_excel(file_path)
        dataframes.append(df)

column_names = ['type', 'year', 'pub_number', '_', 'nom', 'name', 'url_fr', 'url_en', '_', '_', '_', '_']
combined_df = pd.concat(dataframes, ignore_index=True)
combined_df.columns = column_names

types = ['RES', 'SAR', 'PRO', 'SSR', 'SCR', 'ESR', 'HSR']
combined_df = combined_df[combined_df.type.isin(types)]
combined_df = combined_df.loc[:, combined_df.columns != '_']

combined_df.sample(10)

Unnamed: 0,type,year,pub_number,nom,name,url_fr,url_en
18381,SAR,2022,2022/005,Avis de récolte 2020 pour le narval du nord de la baie d'Hudson,2020 Harvest advice for Northern Hudson Bay narwhal,https://www.dfo-mpo.gc.ca/csas-sccs/Publications/SAR-AS/2022/2022_005-fra.html,https://www.dfo-mpo.gc.ca/csas-sccs/Publications/SAR-AS/2022/2022_005-eng.html
721,PRO,2010,2010/028,Réunion d'examen national par les pairs (par téléconférence) : Examen de la « Proposition de réduction du risque de collision entre les navires et les baleines noires dans la région du bassin Rose...,"National Peer Review Meeting (by teleconference): Review of ""A proposal to reduce risk of vessel and right whale collision in the Roseway Basin region of the SW Scotian Shelf""",http://www.dfo-mpo.gc.ca/csas-sccs/Publications/Pro-Cr/2010/2010_028-fra.html,http://www.dfo-mpo.gc.ca/csas-sccs/Publications/Pro-Cr/2010/2010_028-eng.html
18985,SCR,2018,2018/047,Sommaire des emplacements en colombie britannique (canada) confirmant la présence d'espèces de tuniciers envahissants et du crabe européen en 2017,"Summary of Locations in British Columbia, Canada Supporting Invasive Tunicate Species and European Green Crab as of 2017",http://www.dfo-mpo.gc.ca/csas-sccs/Publications/ScR-RS/2018/2018_047-fra.html,http://www.dfo-mpo.gc.ca/csas-sccs/Publications/ScR-RS/2018/2018_047-eng.html
15229,RES,2019,2019/019,Évaluation du risque pour le saumon rouge du fleuve Fraser attribuable au transfert de la bactérie Renibacterium salmoninarum à partir des fermes de saumon atlantique situées dans la région des îl...,"Assessment of the risk to Fraser River Sockeye Salmon due to Renibacterium salmoninarum transfer from Atlantic Salmon farms in the Discovery Islands area, British Columbia",http://www.dfo-mpo.gc.ca/csas-sccs/Publications/ResDocs-DocRech/2019/2019_019-fra.html,http://www.dfo-mpo.gc.ca/csas-sccs/Publications/ResDocs-DocRech/2019/2019_019-eng.html
6169,RES,1998,1998/126,,Juvenile Atlantic salmon (Salmo Salar L.) abundance in the Experimental Ponds Area relative to adult returns to the Gander River as an index of marine survival: evidence for increased marine morta...,http://www.dfo-mpo.gc.ca/csas-sccs/publications/resdocs-docrech/1998/1998_126-fra.htm,http://www.dfo-mpo.gc.ca/csas-sccs/publications/resdocs-docrech/1998/1998_126-eng.htm
18830,SCR,2015,2015/037,"Évaluation de la présence, de la répartition et des incidences potentielles du réovirus pisciaire sur la côte ouest de l'Amérique du Nord","Assessment of the Occurrence, Distribution and Potential Impacts of Piscine Reovirus on the West Coast of North America",http://www.dfo-mpo.gc.ca/csas-sccs/publications/scr-rs/2015/2015_037-fra.html,http://www.dfo-mpo.gc.ca/csas-sccs/publications/scr-rs/2015/2015_037-eng.html
19476,SSR,1997,1997/G3-01,État de l' océan : plateau néo-écossais - baie de Fundy et golfe du Maine,"State of the Ocean: Scotian Shelf, Bay of Fundy, and Gulf of Maine",http://waves-vagues.dfo-mpo.gc.ca/Library/216816.pdf,http://waves-vagues.dfo-mpo.gc.ca/Library/216805.pdf
88,PRO,1998,1998/001,,Proceeding of the 1997 Newfoundland Region Salmonid Stock Assessment Meeting,http://waves-vagues.dfo-mpo.gc.ca/Library/258827.pdf,http://waves-vagues.dfo-mpo.gc.ca/Library/258827.pdf
19626,SSR,1998,1998/B3-01,Hareng du sud du golfe du Saint-Laurent,Southern Gulf of St. Lawrence Herring,http://waves-vagues.dfo-mpo.gc.ca/Library/223904.pdf,http://waves-vagues.dfo-mpo.gc.ca/Library/40634632.pdf
13999,RES,2016,2016/068,Examen pré-COSEPAC concernant la lompe (Cyclopterus lumpus Linnaeus 1758) dans les eaux canadiennes de l'Atlantique et de l'Arctique,A pre-COSEWIC assessment of the Common Lumpfish (Cyclopterus lumpus) in Canadian Atlantic and Arctic waters,http://www.dfo-mpo.gc.ca/csas-sccs/Publications/ResDocs-DocRech/2016/2016_068-fra.html,http://www.dfo-mpo.gc.ca/csas-sccs/Publications/ResDocs-DocRech/2016/2016_068-eng.html


In [123]:
# most docs with url_fr == url_en seem to be documents with translations in the same doc 
# DO NOT REMOVE THESE

combined_df[combined_df.url_fr == combined_df.url_en].sample(10)

Unnamed: 0,type,year,pub_number,nom,name,url_fr,url_en
19676,SSR,1999,1999/D6-07,,Coho Salmon in the Coastal Waters of the Georgian Basin,http://waves-vagues.dfo-mpo.gc.ca/Library/331685.pdf,http://waves-vagues.dfo-mpo.gc.ca/Library/331685.pdf
563,PRO,2007,2007/041,Compte rendu de l'examen par le CEESP de l'évaluation du potentiel de rétablissement du naseux de Nooksack et de l'habitat essentiel potentiel du naseux de Nooksack et du meunier de Salish; 25 oct...,"Proceedings of the PSARC review on the recovery potential assessment on Nooksack Dace and potential critical habitat for Nooksack Dace and Salish Sucker; October 25, 2007.",http://waves-vagues.dfo-mpo.gc.ca/Library/331644.pdf,http://waves-vagues.dfo-mpo.gc.ca/Library/331644.pdf
352,PRO,2003,2003/002,,"Proceedings of the PSARC Groundfish Subcommittee Meeting; January 14-15, 2003",http://waves-vagues.dfo-mpo.gc.ca/Library/40600063.pdf,http://waves-vagues.dfo-mpo.gc.ca/Library/40600063.pdf
42,PRO,1997,1997/023,,Report of the PSARC Groundfish Subcommittee Meeting on Pacific Hake and the Steering Committee Meeting,http://waves-vagues.dfo-mpo.gc.ca/Library/226304.pdf,http://waves-vagues.dfo-mpo.gc.ca/Library/226304.pdf
31,PRO,1996,1996/003,,Spring Meeting 15-19 April 1996 Regional Advisory Process (RAP) of the Maritimes Region,http://waves-vagues.dfo-mpo.gc.ca/Library/251072.pdf,http://waves-vagues.dfo-mpo.gc.ca/Library/251072.pdf
67,PRO,1998,1998/018,,Proceedings of a Workshop on Implementing the Precautionary Approach in Canada; 5-9 October 1998,http://waves-vagues.dfo-mpo.gc.ca/Library/317096.pdf,http://waves-vagues.dfo-mpo.gc.ca/Library/317096.pdf
412,PRO,2004,2004/032,,"Proceedings of the PSARC Salmon Subcommittee Meeting; October 19-20, 2004.",http://waves-vagues.dfo-mpo.gc.ca/Library/315898.pdf,http://waves-vagues.dfo-mpo.gc.ca/Library/315898.pdf
584,PRO,2007,2007/056,Réunion annuelle de Comité national d'examen par les pairs sur les mammifères marins (CNEPMM); du 29 octobre au 1er novembre 2007,"Proceedings of the Annual Meeting of the National Marine Mammal Peer Review Committee (NMMPRC); October 29 - November 1, 2007",http://waves-vagues.dfo-mpo.gc.ca/Library/335756.pdf,http://waves-vagues.dfo-mpo.gc.ca/Library/335756.pdf
345,PRO,2003,2003/035,Compte rendu de l'évaluation des stocks de salmonidés de la région de Terre-Neuve et du Labrador; novembre 2003.,Proceedings of the Newfoundland and Labrador Region Salmonid Stock Assessment Meeting; November 2003.,http://waves-vagues.dfo-mpo.gc.ca/Library/345258.pdf,http://waves-vagues.dfo-mpo.gc.ca/Library/345258.pdf
486,PRO,2006,2006/026,"Ateliers d'évaluation des dommages acceptables pour les espèces en péril d'eau douce dans la Région du Centre et de l'Arctique; les 18 et 19 octobre 2005, les 8 et 9 février 2006 et les 13 et 14 f...","Allowable Harm Analysis Workshops for Freshwater Species at Risk in Central and Arctic Region; October 18-19, 2005; February 8-9, 2006; and February 13-14, 2006.",http://waves-vagues.dfo-mpo.gc.ca/Library/325726.pdf,http://waves-vagues.dfo-mpo.gc.ca/Library/325726.pdf


In [124]:
# how many pdf links are missing?

combined_df[combined_df['url_en'].str.endswith(('html', 'htm'))]['type'].value_counts()

type
RES    5064
SAR    1128
PRO     843
SCR     656
ESR       1
Name: count, dtype: int64

In [125]:
# create formatted pub number with type and number
combined_df['formatted_pub_number'] = combined_df['type'] + " " + combined_df['pub_number']

combined_df.sample(1).T

Unnamed: 0,12252
type,RES
year,2012
pub_number,2012/051
nom,Une classification côtière physiographique pour la biorégion du plateau néo-écossais et des environs : la côte néo-écossaise et la côte du Nouveau-Brunswick de la baie de Fundy
name,A Physiographic Coastline Classification of the Scotian Shelf Bioregion and Environs: The Nova Scotia Coastline and the New Brunswick Fundy Shore
url_fr,http://www.dfo-mpo.gc.ca/csas-sccs/Publications/ResDocs-DocRech/2012/2012_051-fra.html
url_en,http://www.dfo-mpo.gc.ca/csas-sccs/Publications/ResDocs-DocRech/2012/2012_051-eng.html
formatted_pub_number,RES 2012/051


In [126]:
# how many are html links vs pdf links?
display(combined_df['url_en'].str.endswith(('html', 'htm')).value_counts())
display(combined_df['url_en'].str.endswith('pdf').value_counts())

url_en
True     7692
False    1369
Name: count, dtype: int64

url_en
False    7692
True     1369
Name: count, dtype: int64

# import ParsedPublication data

In [127]:
parsed_docs_folder = os.path.join("..", "ParsedPublications")

type_dict = {
    'RES': ['research document', 'res. doc. no.', 'res. doc.', 'res doc', 'res.doc.', 'res no.', 'resdoc', 'res.', 'res', 'documents de recherche', 'document de recherche'],
    'SCR': ['science response', 'science response report', 'réponse des sciences', 'réponses des sciences', 'réponse scientifique'],
    'PRO': ['proceedings series', 'compte rendu', 'compte rendus', 'comptes rendu', 'comptes rendus'],
    'SAR': ['science advisory report', 'avis scientifique'],
}
data = []
errors = dict()

def extract_pub_number(text, filename):
    text_snippet = re.sub(r"\s+", " ", text[:2000].lower())
    
    for key, phrases in type_dict.items():
        pattern = re.compile(r"(" + "|".join(re.escape(phrase.lower()) for phrase in phrases) + r")", flags=re.IGNORECASE)
        text_snippet = pattern.sub(key, text_snippet)

    # # original chatgpt regex with issue (leaves in spaces, but that prevents normaize and zero padding below)
    # patterns = [rf"{key}\.?(?:\s+no\.)?\s+\d{{2,4}}\s*/\s*\d{{1,3}}" for key in type_dict.keys()]  # 1806
    
    # # copilot regex prompted to fix the issue (got worse, asked for help, same exact code)
    # text_snippet = re.sub(r"(\d)\s+(\d)", r"\1\2", text_snippet)
    # patterns = [rf"{key}\.?(?:\s+no\.)?\s+\d{{2,4}}/\d{{1,3}}" for key in type_dict.keys()]
    
    # deepseek-coder-v2:latest
    # patterns = [rf"{key}\.?(?:\s+no\.)?\s+\d{{2,4}}\s*/\s*(?:\s*\d{1,3})" for key in type_dict.keys()]  # 12680 missing pub numbers
    patterns = [rf"{key}.?(?:\s+no.)?\s+\d{{2,4}}\s*/\s*\d(?:\s*\d){{0,2}}" for key in type_dict.keys()] # 1791 copilot version of the same question
    
    for pattern in patterns:
        match = re.search(pattern, text_snippet, re.IGNORECASE)
        if match:
            pub_number = match.group()
            pub_number = f"{pub_number.split()[0]} {"".join(pub_number.split()[1:])}"  # kevin solution 
            
            # Normalize 2-digit years to 4 digits (e.g., 77 -> 1977)
            pub_number = re.sub(r" (\d{2})/", lambda m: f" 19{m.group(1)}/" if int(m.group(1)) <= 99 else f" {m.group(1)}/", pub_number)
            
            # Zero-pad publication numbers (e.g., 15 -> 015)
            pub_number = re.sub(r"/(\d{1,2})(?!\d)", lambda m: f"/{int(m.group(1)):03}", pub_number)
            return pub_number
    
    errors[filename] = " ".join(text_snippet.split())
    return None

for year_folder in os.listdir(parsed_docs_folder):
    min_year = 1977
    max_year = 2024
    
    if year_folder.isnumeric():
        if min_year <= int(year_folder) <= max_year:
            year_path = os.path.join(parsed_docs_folder, year_folder)
            print(f"Analysing: {year_path}")
            
            if os.path.isdir(year_path):
                for json_file in os.listdir(year_path):
                    if json_file.endswith(".json"):
                        json_path = os.path.join(year_path, json_file)
                        with open(json_path, 'r', encoding='utf-8') as file:
                            json_data = json.load(file)
                            
                            filename = json_data.get('name')
                            url = json_data.get('url')
                            year = json_data.get('publicationYear')
                            text = json_data.get('text', '')
                            pub_number = extract_pub_number(text, filename)
                            
                            data.append({
                                'filename': filename,
                                'url': url,
                                'year': year,
                                'pub_number': pub_number
                            })

parsed_docs_df = pd.DataFrame(data)

print('\nnumber of missing pub numbers:', len(errors))  # was 1806 (chatgpt with bug), 1990 (copilot fixing the bug) 

Analysing: ..\ParsedPublications\1977
Analysing: ..\ParsedPublications\1978
Analysing: ..\ParsedPublications\1979
Analysing: ..\ParsedPublications\1980
Analysing: ..\ParsedPublications\1981
Analysing: ..\ParsedPublications\1982
Analysing: ..\ParsedPublications\1983
Analysing: ..\ParsedPublications\1984
Analysing: ..\ParsedPublications\1985
Analysing: ..\ParsedPublications\1986
Analysing: ..\ParsedPublications\1987
Analysing: ..\ParsedPublications\1988
Analysing: ..\ParsedPublications\1989
Analysing: ..\ParsedPublications\1990
Analysing: ..\ParsedPublications\1991
Analysing: ..\ParsedPublications\1992
Analysing: ..\ParsedPublications\1993
Analysing: ..\ParsedPublications\1994
Analysing: ..\ParsedPublications\1995
Analysing: ..\ParsedPublications\1996
Analysing: ..\ParsedPublications\1997
Analysing: ..\ParsedPublications\1998
Analysing: ..\ParsedPublications\1999
Analysing: ..\ParsedPublications\2000
Analysing: ..\ParsedPublications\2001
Analysing: ..\ParsedPublications\2002
Analysing: .

In [128]:
parsed_docs_df.sample(10)

Unnamed: 0,filename,url,year,pub_number
7455,347213.pdf,https://waves-vagues.dfo-mpo.gc.ca/library-bibliotheque/347213.pdf,2012,RES 2012/052
12567,41235630.pdf,https://waves-vagues.dfo-mpo.gc.ca/library-bibliotheque/41235630.pdf,2024,PRO 2024/017
1897,199563.pdf,https://waves-vagues.dfo-mpo.gc.ca/library-bibliotheque/199563.pdf,1996,RES 1996/129
3058,247919.pdf,https://waves-vagues.dfo-mpo.gc.ca/library-bibliotheque/247919.pdf,1999,RES 1999/109
5568,332624.pdf,https://waves-vagues.dfo-mpo.gc.ca/library-bibliotheque/332624.pdf,2007,PRO 2008/002
5012,326137.pdf,https://waves-vagues.dfo-mpo.gc.ca/library-bibliotheque/326137.pdf,2005,RES 2005/070
6489,340691.pdf,https://waves-vagues.dfo-mpo.gc.ca/library-bibliotheque/340691.pdf,2010,SCR 2010/002
10745,40937963.pdf,https://waves-vagues.dfo-mpo.gc.ca/library-bibliotheque/40937963.pdf,2019,SAR 2019/018
3852,264037.pdf,http://waves-vagues.dfo-mpo.gc.ca/Library/264037.pdf,2002,
5549,332049.pdf,http://waves-vagues.dfo-mpo.gc.ca/Library/332049.pdf,2007,PRO 2007/039


In [129]:
# check some random errors (mostly look like stock status reports)

random_error = random.choice(list(errors.keys()))

print(random_error)
print(errors[random_error])

283727.pdf
national capital region habitat status report 2004/002 september 2004 review of scientific information on impacts of seismic sound on fish, invertebrates, marine turtles and marine mammals background a workshop to develop a “decision framework for seismic survey referrals” held in march 2003 produced an inventory of ecological factors that dfo should consider when dealing with referrals for seismic surveys in canadian waters. the workshop also discussed the sources of uncertainty about effects of seismic sounds on those ecological factors, and ways that the uncertainty could be pRESented in science documents evaluating possible impacts. the workshop did not attempt to review critically the scientific literature on impacts of seismic sounds or effectiveness of mitigation options. consequently the meeting did not addRESs tolerances for ecological impacts, if any, or operational standards for RESpecting such tolerances. following that workshop, teams of scientists prepared majo

In [130]:
display_in_notepad = False

def display_errors_in_notepad(errors):
    n_char = 400
    
    with tempfile.NamedTemporaryFile(delete=False, mode='w', suffix='.txt', encoding='utf-8') as temp_file:
        for k, v in errors.items():
            v = v[:n_char].lower()
            
            if "stock status report" not in v:
                temp_file.write(f'++++++++++++++++++++ {k} ++++++++++++++++++++\n')
                temp_file.write(f'{v}\n\n')
        temp_file_name = temp_file.name
    
    os.system(f'notepad {temp_file_name}')
    os.unlink(temp_file_name)

if display_in_notepad:
    display_errors_in_notepad(errors)

In [131]:
parsed_docs_df['pub_number'].isna().value_counts()  # why doesn't this match len(errors)

pub_number
False    10930
True      1822
Name: count, dtype: int64

# how many parsedpublications matches are missing for documents with translations?

In [132]:
missing_pub_numbers = [x for x in combined_df.formatted_pub_number.to_list() if x not in parsed_docs_df.pub_number.to_list()]

len(missing_pub_numbers)

1076

In [133]:
combined_df[combined_df.formatted_pub_number.isin(missing_pub_numbers) & combined_df['url_en'].str.endswith(('html', 'htm'))].drop(['pub_number', 'name', 'nom'], axis=1).sample(20)

Unnamed: 0,type,year,url_fr,url_en,formatted_pub_number
18567,SAR,2024,https://www.dfo-mpo.gc.ca/csas-sccs/Publications/SAR-AS/2024/2024_058-fra.html,https://www.dfo-mpo.gc.ca/csas-sccs/Publications/SAR-AS/2024/2024_058-eng.html,SAR 2024/058
2012,RES,1982,http://www.dfo-mpo.gc.ca/csas-sccs/publications/resdocs-docrech/1982/1982_024-fra.html,http://www.dfo-mpo.gc.ca/csas-sccs/publications/resdocs-docrech/1982/1982_024-eng.html,RES 1982/024
14928,RES,2019,http://www.dfo-mpo.gc.ca/csas-sccs/Publications/ResDocs-DocRech/2019/2019_002-fra.html,http://www.dfo-mpo.gc.ca/csas-sccs/Publications/ResDocs-DocRech/2019/2019_002-eng.html,RES 2019/002
7469,RES,2001,http://www.dfo-mpo.gc.ca/csas-sccs/publications/resdocs-docrech/2001/2001_044-fra.htm,http://www.dfo-mpo.gc.ca/csas-sccs/publications/resdocs-docrech/2001/2001_044-eng.htm,RES 2001/044
1961,RES,1981,http://www.dfo-mpo.gc.ca/csas-sccs/publications/resdocs-docrech/1981/1981_078-fra.html,http://www.dfo-mpo.gc.ca/csas-sccs/publications/resdocs-docrech/1981/1981_078-eng.html,RES 1981/078
7353,RES,2000,http://www.dfo-mpo.gc.ca/csas-sccs/publications/withdrawn-retire-fra.htm,http://www.dfo-mpo.gc.ca/csas-sccs/publications/withdrawn-retire-eng.htm,RES 2000/002
1513,RES,1977,http://www.dfo-mpo.gc.ca/csas-sccs/publications/resdocs-docrech/1977/1977_013-fra.html,http://www.dfo-mpo.gc.ca/csas-sccs/publications/resdocs-docrech/1977/1977_013-eng.html,RES 1977/013
19269,SCR,2024,https://www.dfo-mpo.gc.ca/csas-sccs/Publications/ScR-RS/2024/2024_035-fra.html,https://www.dfo-mpo.gc.ca/csas-sccs/Publications/ScR-RS/2024/2024_035-eng.html,SCR 2024/035
1797,RES,1980,http://www.dfo-mpo.gc.ca/csas-sccs/publications/resdocs-docrech/1980/1980_070-fra.html,http://www.dfo-mpo.gc.ca/csas-sccs/publications/resdocs-docrech/1980/1980_070-eng.html,RES 1980/070
19274,SCR,2024,https://www.dfo-mpo.gc.ca/csas-sccs/Publications/ScR-RS/2024/2024_040-fra.html,https://www.dfo-mpo.gc.ca/csas-sccs/Publications/ScR-RS/2024/2024_040-eng.html,SCR 2024/040


In [134]:
combined_df[combined_df.formatted_pub_number == "RES 1999/017"].T

Unnamed: 0,6312
type,RES
year,1999
pub_number,1999/017
nom,
name,Flatfish Stock Assessment for the West Coast of Canada for 1998 and recommended yield options for 1999.
url_fr,http://www.dfo-mpo.gc.ca/csas-sccs/publications/resdocs-docrech/1999/1999_017-fra.htm
url_en,http://www.dfo-mpo.gc.ca/csas-sccs/publications/resdocs-docrech/1999/1999_017-eng.htm
formatted_pub_number,RES 1999/017


In [135]:
parsed_docs_df[parsed_docs_df.pub_number == "RES 1999/017"].T  

# TODO (NOTE) IT WORKS!!! -> "Research Document 99/ 1 7" is correctly converted to"RES 1999/017"

Unnamed: 0,2756
filename,233840.pdf
url,https://waves-vagues.dfo-mpo.gc.ca/library-bibliotheque/233840.pdf
year,1999
pub_number,RES 1999/017


In [136]:
# manually check:

# errors['314807.pdf']  # this one is literally the only ESR with out PDF data, I didn't include it in the original script, but I could add manually: (id, fr, en) = (ESR 2004/006, 314807.pdf, 314806.pdf)
# a bunch have been withdrawn 
# errors['102275.pdf'] # not an error: translation is right inside the same document, single pdf (also 186156.pdf, 79442.pdf, and more)

errors.get('277.pdf')  # TODO: this doesn't match because the generated pub_number formatting was incorrect "Res. Doc. 81/ 51" was converted to "RES 1981/ 51", should be "RES 1981/051" 

In [137]:
# TODO: somehow a pub_number of "RES 1981/ 51" was written instead of "RES 1981/051" 

parsed_docs_df[parsed_docs_df.filename == "277.pdf"]

Unnamed: 0,filename,url,year,pub_number
253,277.pdf,https://waves-vagues.dfo-mpo.gc.ca/library-bibliotheque/277.pdf,1981,RES 1981/051


In [138]:
# TODO: fix these mistakes
parsed_docs_df[parsed_docs_df['pub_number'].str.contains(r'\s.*\s', na=False)]

Unnamed: 0,filename,url,year,pub_number


# check single non-matching pdf for translations inside of the single pdf

In [139]:
# how many files are just a single link?

print(combined_df[combined_df.url_fr == combined_df.url_en].shape[0])

combined_df[(combined_df.url_fr == combined_df.url_en) & (combined_df.nom.isna())].drop(['pub_number', 'name', 'year'], axis=1).sample(10)

561


Unnamed: 0,type,nom,url_fr,url_en,formatted_pub_number
134,PRO,,http://waves-vagues.dfo-mpo.gc.ca/Library/40618432.pdf,http://waves-vagues.dfo-mpo.gc.ca/Library/40618432.pdf,PRO 1999/026
19649,SSR,,http://waves-vagues.dfo-mpo.gc.ca/Library/345641.pdf,http://waves-vagues.dfo-mpo.gc.ca/Library/345641.pdf,SSR 1999/C6-02
34,PRO,,http://waves-vagues.dfo-mpo.gc.ca/Library/214851.pdf,http://waves-vagues.dfo-mpo.gc.ca/Library/214851.pdf,PRO 1997/002
19679,SSR,,http://waves-vagues.dfo-mpo.gc.ca/Library/331684.pdf,http://waves-vagues.dfo-mpo.gc.ca/Library/331684.pdf,SSR 1999/D6-03
35,PRO,,http://waves-vagues.dfo-mpo.gc.ca/Library/227077.pdf,http://waves-vagues.dfo-mpo.gc.ca/Library/227077.pdf,PRO 1997/009
19671,SSR,,http://waves-vagues.dfo-mpo.gc.ca/Library/331787.pdf,http://waves-vagues.dfo-mpo.gc.ca/Library/331787.pdf,SSR 1999/C6-09
19657,SSR,,http://waves-vagues.dfo-mpo.gc.ca/Library/331710.pdf,http://waves-vagues.dfo-mpo.gc.ca/Library/331710.pdf,SSR 1999/A6-08
19669,SSR,,http://waves-vagues.dfo-mpo.gc.ca/Library/331632.pdf,http://waves-vagues.dfo-mpo.gc.ca/Library/331632.pdf,SSR 1999/D6-10
48,PRO,,http://waves-vagues.dfo-mpo.gc.ca/Library/40618808.pdf,http://waves-vagues.dfo-mpo.gc.ca/Library/40618808.pdf,PRO 1997/016
19681,SSR,,http://waves-vagues.dfo-mpo.gc.ca/Library/318227.pdf,http://waves-vagues.dfo-mpo.gc.ca/Library/318227.pdf,SSR 1999/D6-01


# combine df based on pub_number 

In [140]:
columns = ['pub_number', 'url', 'url_fr', 'url_en', 'nom', 'name', 'filename', 'year_x', 'year_y']

pd.merge(
    combined_df.drop(['pub_number'], axis=1), 
    parsed_docs_df, 
    left_on="formatted_pub_number", 
    right_on="pub_number"
).drop(['formatted_pub_number'], axis=1)[columns].sample(1).T

Unnamed: 0,1505
pub_number,PRO 2024/010
url,https://waves-vagues.dfo-mpo.gc.ca/library-bibliotheque/41230565.pdf
url_fr,https://www.dfo-mpo.gc.ca/csas-sccs/Publications/Pro-Cr/2024/2024_010-fra.html
url_en,https://www.dfo-mpo.gc.ca/csas-sccs/Publications/Pro-Cr/2024/2024_010-eng.html
nom,Compte rendu de l'examen régional par les pairs de l'évaluation de la crevette nordique (Pandalus borealis) et de la crevette ésope (P. montagui) dans la zone d'évaluation Est et la zone d'évaluat...
name,Proceedings of the Regional Peer Review on the Stock Assessment of Northern Shrimp (Pandalus borealis) and Striped Shrimp (P. montagui) in the Eastern Assessment Zone and Western Assessment Zone f...
filename,41230565.pdf
year_x,2024
year_y,2023


In [141]:
# how do we know which is fr or en? 
#  why so many duplicates (3+)? 
#  why so many singletons?
parsed_docs_df['pub_number'].value_counts().value_counts()

count
1    5198
2    2798
3      20
4      19
Name: count, dtype: int64

# maybe we need bs4 instead...
* fr and en have the same pub number, so we need to use nlp to check language with this method
* probably easier to just check all of the fr and en websites for pdf links and correlate that way

In [166]:
pd.DataFrame(parsed_docs_df['url'].apply(lambda x: x.rsplit('/', 1)[1].rsplit('.', 1)[1]).value_counts())

Unnamed: 0_level_0,count
url,Unnamed: 1_level_1
pdf,12751
PDF,1


In [169]:
pd.DataFrame(parsed_docs_df['url'].apply(lambda x: x.rsplit('/', 1)[1].rsplit('.', 1)[0].isnumeric()).value_counts())

Unnamed: 0_level_0,count
url,Unnamed: 1_level_1
True,12192
False,560


In [171]:
nonnumeric_df = parsed_docs_df[
    ~parsed_docs_df['url'].apply(lambda x: x.rsplit('/', 1)[1].rsplit('.', 1)[0].isnumeric())
]

In [175]:
# Define a regex pattern to match filenames with the format: numbers + one letter + ".pdf"
pattern = r'^\d+[a-zA-Z]\.pdf$'

# Filter rows where the filename is non-numeric and create a deep copy
non_numeric_filenames = parsed_docs_df[
    ~parsed_docs_df['url'].apply(lambda x: x.rsplit('/', 1)[1].rsplit('.', 1)[0].isnumeric())
].copy()

# Add the new column
non_numeric_filenames['matches_pattern'] = non_numeric_filenames['url'].apply(
    lambda x: re.match(pattern, x.rsplit('/', 1)[1]) is not None
)

# Display rows and whether they match the pattern
print(non_numeric_filenames[['url', 'matches_pattern']])

                                                                        url  \
291      https://waves-vagues.dfo-mpo.gc.ca/library-bibliotheque/13959f.pdf   
308      https://waves-vagues.dfo-mpo.gc.ca/library-bibliotheque/33204f.pdf   
328      https://waves-vagues.dfo-mpo.gc.ca/library-bibliotheque/59302f.pdf   
345      https://waves-vagues.dfo-mpo.gc.ca/library-bibliotheque/17261f.pdf   
355      https://waves-vagues.dfo-mpo.gc.ca/library-bibliotheque/21536f.pdf   
...                                                                     ...   
12718  https://waves-vagues.dfo-mpo.gc.ca/library-bibliotheque/4125725x.pdf   
12730  https://waves-vagues.dfo-mpo.gc.ca/library-bibliotheque/4126034x.pdf   
12738  https://waves-vagues.dfo-mpo.gc.ca/library-bibliotheque/4126051x.pdf   
12743  https://waves-vagues.dfo-mpo.gc.ca/library-bibliotheque/4126082x.pdf   
12750  https://waves-vagues.dfo-mpo.gc.ca/library-bibliotheque/4126096x.pdf   

       matches_pattern  
291               True  
3

In [178]:
# how is this data so ugly?
non_numeric_filenames[~non_numeric_filenames['matches_pattern']]

Unnamed: 0,filename,url,year,pub_number,matches_pattern
1978,215951fre.pdf,http://waves-vagues.dfo-mpo.gc.ca/Library/215951fre.pdf,1996,,False
2824,236288-app1.pdf,https://waves-vagues.dfo-mpo.gc.ca/library-bibliotheque/236288-app1.pdf,1999,,False
2825,236288-app2.pdf,https://waves-vagues.dfo-mpo.gc.ca/library-bibliotheque/236288-app2.pdf,1999,,False
2826,236288-app3.pdf,https://waves-vagues.dfo-mpo.gc.ca/library-bibliotheque/236288-app3.pdf,1999,,False
2833,236420-pt.1.pdf,https://waves-vagues.dfo-mpo.gc.ca/library-bibliotheque/236420-pt.1.pdf,1999,RES 1999/113,False
...,...,...,...,...,...
11739,Grass-Carp-SE-Risk-Assessment-FR.pdf,https://dfo-mpo.gc.ca/species-especes/documents/profiles-profils/grasscarp-carperoseau/Grass-Carp-SE-Risk-Assessment-FR.pdf,2021,,False
11740,2022_013-inu.pdf,https://www.dfo-mpo.gc.ca/csas-sccs/Publications/ScR-RS/2022/2022_013-inu.pdf,2022,,False
11741,2022_024-inu.pdf,https://www.dfo-mpo.gc.ca/csas-sccs/Publications/SAR-AS/2022/2022_024-inu.pdf,2022,SAR 2022/024,False
12140,2023_026-inu.pdf,https://www.dfo-mpo.gc.ca/csas-sccs/Publications/SAR-AS/2023/2023_026-inu.pdf,2023,,False


In [167]:
pd.DataFrame(parsed_docs_df['url'].apply(lambda x: x.rsplit('/', 1)[0]).value_counts())

Unnamed: 0_level_0,count
url,Unnamed: 1_level_1
https://waves-vagues.dfo-mpo.gc.ca/library-bibliotheque,10436
http://waves-vagues.dfo-mpo.gc.ca/Library,2175
https://waves-vagues.dfo-mpo.gc.ca/library-bibliotheque,52
https://www.dfo-mpo.gc.ca/csas-sccs/Schedule-Horraire/2007/11,9
https://www.dfo-mpo.gc.ca/csas-sccs/Schedule-Horraire/2007/05,8
https://www.dfo-mpo.gc.ca/csas-sccs/Schedule-Horraire/2007/10,8
https://waves-vagues.dfo-mpo.gc.ca/Library,5
https://www.dfo-mpo.gc.ca/csas-sccs/Schedule-Horraire/2007/09,5
https://www.dfo-mpo.gc.ca/csas-sccs/Schedule-Horraire/2006/12_Dec,4
http://www.dfo-mpo.gc.ca/csas-sccs/Schedule-Horraire/2006/02_Feb-Fev,4


In [163]:
combined_df


Unnamed: 0,type,year,pub_number,nom,name,url_fr,url_en,formatted_pub_number
1,ESR,2003,2003/002,"État de l'océan en 2001 : Conditions océanographiques physiques sur le plateau néo-écossais, dans la baie de Fundy et dans le golfe du Maine.","2001 State of the Ocean: Physical Oceanographic Conditions on the Scotian Shelf, Bay of Fundy and Gulf of Maine.",http://waves-vagues.dfo-mpo.gc.ca/Library/279108.pdf,http://waves-vagues.dfo-mpo.gc.ca/Library/276322.pdf,ESR 2003/002
2,ESR,2003,2003/005,"État de l'océan en 2002 : Conditions océanographiques physiques sur le plateau néo-écossais, dans la baie de Fundy et dans legolfe du Maine.","2002 State of the Ocean: Physical Oceanographic Conditions on the Scotian Shelf, Bay of Fundy and Gulf of Maine.",http://waves-vagues.dfo-mpo.gc.ca/Library/280817.pdf,http://waves-vagues.dfo-mpo.gc.ca/Library/281583.pdf,ESR 2003/005
3,ESR,2003,2003/004,État de l'écosystème de l'est du plateau néo-écossais.,State of the Eastern Scotian Shelf Ecosystem.,http://waves-vagues.dfo-mpo.gc.ca/Library/277606.pdf,http://waves-vagues.dfo-mpo.gc.ca/Library/276323.pdf,ESR 2003/004
4,ESR,2003,2003/003,État de l'océan en 2002 : Conditions océanographiques physiques dans la Région de Terre-Neuve et du Labrador.,2002 State of the Ocean: Physical oceanographic conditions in the Newfoundland and Labrador Region.,http://waves-vagues.dfo-mpo.gc.ca/Library/336493.pdf,http://waves-vagues.dfo-mpo.gc.ca/Library/336490.pdf,ESR 2003/003
5,ESR,2003,2003/001,État de l'océan en 2002 : conditions océanographiques chimiques et biologiques dans la Région de Terre-Neuve et du Labrador,2002 State of the Ocean: Chemical and Biological Oceanographic Conditions in the Newfoundland and Labrador Region.,http://waves-vagues.dfo-mpo.gc.ca/Library/336487.pdf,http://waves-vagues.dfo-mpo.gc.ca/Library/276124.pdf,ESR 2003/001
...,...,...,...,...,...,...,...,...
20062,SSR,2004,2004/024,"Crabe des neiges de l'estuaire et du nord du golfe du Saint-Laurent (zones 13 à 17 et 12A, 12B et 12C) en 2003.","Snow Crab of the Estuary and Northern Gulf of St. Lawrence (areas 13 to 17 and 12A, 12B and 12C) in 2003.",http://waves-vagues.dfo-mpo.gc.ca/Library/280837.pdf,http://waves-vagues.dfo-mpo.gc.ca/Library/281603.pdf,SSR 2004/024
20063,SSR,2004,2004/048,Cadre révisé pour l'évaluation de l'étendue des dommages admissibles en vertu de l'article 73 de la Loi sur les espèces en péril.,Revised Framework for Evaluation of Scope for Harm under Section 73 of the Species at Risk Act.,http://waves-vagues.dfo-mpo.gc.ca/Library/314616.pdf,http://waves-vagues.dfo-mpo.gc.ca/Library/286840.pdf,SSR 2004/048
20064,SSR,2004,2004/043,Évaluation des dommages admissibles pour le bocaccio.,Allowable Harm Assessment for Bocaccio.,http://waves-vagues.dfo-mpo.gc.ca/Library/345849.pdf,http://waves-vagues.dfo-mpo.gc.ca/Library/316488.pdf,SSR 2004/043
20065,SSR,2004,2004/025,Morue charbonnière.,Sablefish - Stock Status Report 2004.,http://waves-vagues.dfo-mpo.gc.ca/Library/344778.pdf,http://waves-vagues.dfo-mpo.gc.ca/Library/344777.pdf,SSR 2004/025


# break into paragraph chunks

In [143]:
# TODO: break into paragraph chunks for better correlation 
#  TODO (OPTIONAL): clean excess characters
#  TODO (OPTIONAL): make sure french-friendly encoding is used (at least check if that makes a difference)
