In [1]:
import json
import os
import random
import re
import tempfile
import pandas as pd

from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>")) # jupyter notebook full-width display
display(HTML("<style>.dataframe td { white-space: nowrap; }</style>")) # no text wrapping

# pandas formatting
pd.set_option('display.float_format', '{:.1f}'.format)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_colwidth', 200)

# import url data

In [2]:
links_folder = "website_reports"

dataframes = []

for file in os.listdir(links_folder):
    if file.endswith(".xlsx"):
        file_path = os.path.join(links_folder, file)
        df = pd.read_excel(file_path)
        dataframes.append(df)

column_names = ['type', 'year', 'pub_number', '_', 'nom', 'name', 'url_fr', 'url_en', '_', '_', '_', '_']
combined_df = pd.concat(dataframes, ignore_index=True)
combined_df.columns = column_names
display(combined_df.head())

types = ['RES', 'SAR', 'PRO', 'SSR', 'SCR', 'ESR', 'HSR']
combined_df = combined_df[combined_df.type.isin(types)]
combined_df = combined_df.loc[:, combined_df.columns != '_']

Unnamed: 0,type,year,pub_number,_,nom,name,url_fr,url_en,_.1,_.2,_.3,_.4
0,SERIES,YEAR,PUBLICATION NUMBER,AUTHORS,TITLE FRENCH,TITLE ENGLISH,URL FRENCH,URL ENGLISH,URL INUKTITUT,REGION ENGLISH,REGION FRENCH,DATE PUBLISHED
1,ESR,2003,2003/002,DFO-MPO,"État de l'océan en 2001 : Conditions océanographiques physiques sur le plateau néo-écossais, dans la baie de Fundy et dans le golfe du Maine.","2001 State of the Ocean: Physical Oceanographic Conditions on the Scotian Shelf, Bay of Fundy and Gulf of Maine.",http://waves-vagues.dfo-mpo.gc.ca/Library/279108.pdf,http://waves-vagues.dfo-mpo.gc.ca/Library/276322.pdf,,Maritimes,Maritimes,2003-05-02 00:00:00
2,ESR,2003,2003/005,DFO-MPO,"État de l'océan en 2002 : Conditions océanographiques physiques sur le plateau néo-écossais, dans la baie de Fundy et dans legolfe du Maine.","2002 State of the Ocean: Physical Oceanographic Conditions on the Scotian Shelf, Bay of Fundy and Gulf of Maine.",http://waves-vagues.dfo-mpo.gc.ca/Library/280817.pdf,http://waves-vagues.dfo-mpo.gc.ca/Library/281583.pdf,,Maritimes,Maritimes,
3,ESR,2003,2003/004,DFO-MPO,État de l'écosystème de l'est du plateau néo-écossais.,State of the Eastern Scotian Shelf Ecosystem.,http://waves-vagues.dfo-mpo.gc.ca/Library/277606.pdf,http://waves-vagues.dfo-mpo.gc.ca/Library/276323.pdf,,Maritimes,Maritimes,
4,ESR,2003,2003/003,DFO-MPO,État de l'océan en 2002 : Conditions océanographiques physiques dans la Région de Terre-Neuve et du Labrador.,2002 State of the Ocean: Physical oceanographic conditions in the Newfoundland and Labrador Region.,http://waves-vagues.dfo-mpo.gc.ca/Library/336493.pdf,http://waves-vagues.dfo-mpo.gc.ca/Library/336490.pdf,,Newfoundland & Labrador,Terre-Neuve et Labrador,2004-01-12 00:00:00


In [3]:
combined_df.sample(10)

Unnamed: 0,type,year,pub_number,nom,name,url_fr,url_en
7476,RES,2001,2001/045,Évaluation des stocks de hareng de la zone 4T de l'OPANO dans le sud du golfe du Saint-Laurent en 2000,Assessment of the 4T southern Gulf of St. Lawrence herring stocks in 2000,http://www.dfo-mpo.gc.ca/csas-sccs/publications/resdocs-docrech/2001/2001_045-fra.htm,http://www.dfo-mpo.gc.ca/csas-sccs/publications/resdocs-docrech/2001/2001_045-eng.htm
5793,RES,1997,1997/141,,A retrospective analysis of escapement model performance using different adult survival rate estimate.,http://www.dfo-mpo.gc.ca/csas-sccs/publications/resdocs-docrech/1997/1997_141-fra.htm,http://www.dfo-mpo.gc.ca/csas-sccs/publications/resdocs-docrech/1997/1997_141-eng.htm
9191,RES,2005,2005/032,"Évaluation de la sensibilité de l'habitat marin : une étude de cas sur la zostère marine (Zostera marina L.) et des laminaires (Laminaria, Macrocystis).","Assessing marine habitat sensitivity: a case study with eelgrass (Zostera marina L.) and kelps (Laminaria, Macrocystis).",http://www.dfo-mpo.gc.ca/csas-sccs/publications/resdocs-docrech/2005/2005_032-fra.htm,http://www.dfo-mpo.gc.ca/csas-sccs/publications/resdocs-docrech/2005/2005_032-eng.htm
17396,SAR,2009,2009/008,Évaluation du stock de morue de la sous-division 3Ps,Stock Assessment of Subdivision 3Ps Cod,http://www.dfo-mpo.gc.ca/csas-sccs/Publications/SAR-AS/2009/2009_008-fra.htm,http://www.dfo-mpo.gc.ca/csas-sccs/Publications/SAR-AS/2009/2009_008-eng.htm
12082,RES,2012,2012/092,Distribution de l'effort de pêche à la crevette nordique dans l'estuaire et le golfe du Saint-Laurent,Distribution of Northern shrimp fishing effort in the Estuary and Gulf of St. Lawrence,http://www.dfo-mpo.gc.ca/csas-sccs/Publications/ResDocs-DocRech/2012/2012_092-fra.html,http://www.dfo-mpo.gc.ca/csas-sccs/Publications/ResDocs-DocRech/2012/2012_092-eng.html
1769,RES,1980,1980/054,,An evaluation of the current status of southern Gulf herring,http://www.dfo-mpo.gc.ca/csas-sccs/publications/resdocs-docrech/1980/1980_054-fra.html,http://www.dfo-mpo.gc.ca/csas-sccs/publications/resdocs-docrech/1980/1980_054-eng.html
6470,RES,1999,1999/069,,"Assessment of the Banquereau Bank Artic Surfclam, 1999.",http://www.dfo-mpo.gc.ca/csas-sccs/publications/resdocs-docrech/1999/1999_069-fra.htm,http://www.dfo-mpo.gc.ca/csas-sccs/publications/resdocs-docrech/1999/1999_069-eng.htm
6399,RES,1999,1999/043,,"Changes in the timing and location of cod spawning in Placentia Bay (NAFO sub-division 3Ps), 1997-1998.",http://www.dfo-mpo.gc.ca/csas-sccs/publications/resdocs-docrech/1999/1999_043-fra.htm,http://www.dfo-mpo.gc.ca/csas-sccs/publications/resdocs-docrech/1999/1999_043-eng.htm
12401,RES,2012,2012/151,Prises accessoires de la pêche à la crevette nordique dans l'estuaire et le golfe du Saint-Laurent,Bycatch in the Estuary and Gulf of St. Lawrence Northern shrimp fishery,http://www.dfo-mpo.gc.ca/csas-sccs/Publications/ResDocs-DocRech/2012/2012_151-fra.html,http://www.dfo-mpo.gc.ca/csas-sccs/Publications/ResDocs-DocRech/2012/2012_151-eng.html
2598,RES,1985,1985/078,,Assessment of the 1984 4WX herring fishery,http://www.dfo-mpo.gc.ca/csas-sccs/publications/resdocs-docrech/1985/1985_078-fra.html,http://www.dfo-mpo.gc.ca/csas-sccs/publications/resdocs-docrech/1985/1985_078-eng.html


### remove untranslated documents

In [19]:
# # first take a look at value counts
# display(combined_df.loc[combined_df.url_fr == combined_df.url_en, 'year'].value_counts())
# display(combined_df.loc[combined_df.url_fr != combined_df.url_en, 'year'].value_counts())
# display(combined_df.loc[combined_df.url_fr == combined_df.url_en, 'type'].value_counts())
# display(combined_df.loc[combined_df.url_fr != combined_df.url_en, 'type'].value_counts())

combined_df = combined_df[combined_df.url_fr != combined_df.url_en]

combined_df[combined_df['url_en'].str.endswith(('html', 'htm'))]['type'].value_counts()

type
RES    5063
SAR    1126
PRO     840
SCR     655
ESR       1
Name: count, dtype: int64

### create formatted pub number with type and number

In [24]:
combined_df['formatted_pub_number'] = combined_df['type'] + " " + combined_df['pub_number'] 

In [30]:
combined_df.sample(1).T

Unnamed: 0,5936
type,RES
year,1998
pub_number,1998/046
nom,
name,"Analyses of trends in returns of Atlantic salmon (Salmo salar) to rivers in Nova Scotia and Bay of Fundy, New Brunswick, and status of 1997 returns relative to forecasts."
url_fr,http://www.dfo-mpo.gc.ca/csas-sccs/publications/resdocs-docrech/1998/1998_046-fra.htm
url_en,http://www.dfo-mpo.gc.ca/csas-sccs/publications/resdocs-docrech/1998/1998_046-eng.htm
formatted_pub_number,RES 1998/046


# import ParsedPublication data

In [147]:
parsed_docs_folder = os.path.join("..", "ParsedPublications")

type_dict = {
    'RES': ['research document', 'documents de recherche', 'document de recherche', 'res doc', 'res no', 'resdoc', 'res'],
    'SCR': ['science response report', 'science response', 'reponses des sciences', 'reponses des science', 'reponse scientifique'],
    'PRO': ['proceedings series', 'comptes rendus', 'compte rendus', 'comptes rendu', 'compte rendu'],
    'SAR': ['science advisory report', 'avis scientifique'],
}
data = []
errors = dict()

def extract_pub_number(text, filename):
    text_snippet = re.sub(r"\s+", " ", text[:2000].replace(".", "").replace('é', 'e').lower())
    
    # TODO = REMOVE: old versions for testing = this is the old borderless bug version, but has more results
    # for key, phrases in type_dict.items():  
    #     pattern = re.compile("|".join(re.escape(phrase.lower()) for phrase in phrases), flags=re.IGNORECASE)
    #     text_snippet = pattern.sub(key, text_snippet)
    for key, phrases in type_dict.items():
        pattern = re.compile(r"\b(" + "|".join(re.escape(phrase.lower()) for phrase in phrases) + r")\b", flags=re.IGNORECASE)
        text_snippet = pattern.sub(key, text_snippet)
    
    # TODO = REMOVE: old versions for testing = this is the old borderless bug version, but has more results
    # patterns = [rf"{key}(?:\s+no\.)?\s+\d{{2,4}}/\d{{1,3}}" for key in type_dict.keys()]  
    patterns = [rf"{key}(?:\s+no\.)?\s+\d{{2,4}}\s*/\s*\d{{1,3}}" for key in type_dict.keys()]
    # patterns = [rf"{key}\.?(?:\s+no\.)?\s+\d{{2,4}}\s*/\s*\d{{1,3}}" for key in type_dict.keys()]
    # patterns = [rf"{key}(?:\s+doc)?(?:\s+no)?\s+\d{{2,4}}\s*/\s*\d{{1,3}}" for key in type_dict.keys()]
    # patterns = [rf"{key}\.?(\s+doc\.?)?(?:\s+no\.?)?\s+\d{{2,4}}\s*/\s*\d{{1,3}}" for key in type_dict.keys()]
    
    for pattern in patterns:
        match = re.search(pattern, text_snippet, re.IGNORECASE)
        if match:
            pub_number = match.group()
            
            # Normalize 2-digit years to 4 digits (e.g., 77 -> 1977)
            pub_number = re.sub(r" (\d{2})/", lambda m: f" 19{m.group(1)}/" if int(m.group(1)) <= 99 else f" {m.group(1)}/", pub_number)
            
            # Zero-pad publication numbers (e.g., 15 -> 015)
            pub_number = re.sub(r"/(\d{1,2})(?!\d)", lambda m: f"/{int(m.group(1)):03}", pub_number)
            return pub_number
    
    errors[filename] = " ".join(text_snippet.split())
    return None

for year_folder in os.listdir(parsed_docs_folder):
    min_year = 1977
    max_year = 2024
    
    if year_folder.isnumeric():
        if min_year <= int(year_folder) <= max_year:
            year_path = os.path.join(parsed_docs_folder, year_folder)
            print(f"Analysing: {year_path}")
            
            if os.path.isdir(year_path):
                for json_file in os.listdir(year_path):
                    if json_file.endswith(".json"):
                        json_path = os.path.join(year_path, json_file)
                        with open(json_path, 'r', encoding='utf-8') as file:
                            json_data = json.load(file)
                            
                            filename = json_data.get('name')
                            url = json_data.get('url')
                            year = json_data.get('publicationYear')
                            text = json_data.get('text', '')
                            pub_number = extract_pub_number(text, filename)
                            
                            data.append({
                                'filename': filename,
                                'url': url,
                                'year': year,
                                'pub_number': pub_number
                            })

parsed_docs_df = pd.DataFrame(data)

Analysing: ..\ParsedPublications\1977
Analysing: ..\ParsedPublications\1978
Analysing: ..\ParsedPublications\1979
Analysing: ..\ParsedPublications\1980
Analysing: ..\ParsedPublications\1981
Analysing: ..\ParsedPublications\1982
Analysing: ..\ParsedPublications\1983
Analysing: ..\ParsedPublications\1984
Analysing: ..\ParsedPublications\1985
Analysing: ..\ParsedPublications\1986
Analysing: ..\ParsedPublications\1987
Analysing: ..\ParsedPublications\1988
Analysing: ..\ParsedPublications\1989
Analysing: ..\ParsedPublications\1990
Analysing: ..\ParsedPublications\1991
Analysing: ..\ParsedPublications\1992
Analysing: ..\ParsedPublications\1993
Analysing: ..\ParsedPublications\1994
Analysing: ..\ParsedPublications\1995
Analysing: ..\ParsedPublications\1996
Analysing: ..\ParsedPublications\1997
Analysing: ..\ParsedPublications\1998
Analysing: ..\ParsedPublications\1999
Analysing: ..\ParsedPublications\2000
Analysing: ..\ParsedPublications\2001
Analysing: ..\ParsedPublications\2002
Analysing: .

In [149]:
# missing "Compte rendus 2020/004" - need borders (files: 40874424.pdf, 40876032.pdf)
# # 3521 for 400char at 150s
# # 1991 for 2000char at 13s / 12s / 3min (wut)
# # 1963 for 3000char at 14s / 14s
# # 1944 for 5000char at 120s    
# # 1820 for 10000char at 31s / 108s / 212s (what is happening?)
# len(errors)

# UPDATED WITH BOUNDARIES
# 1900 for 2000char at 3m7s 
len(errors)

2459

In [158]:
# # the only one after 1982 - new keyword added
# errors['4072296x.pdf']

In [155]:
# older version before the version above was "FIXED"

parsed_docs_folder = os.path.join("..", "ParsedPublications")

type_dict = {
    'RES': ['research document', 'res. doc.', 'res doc', 'res.doc.', 'res no.', 'resdoc', 'res.', 'res', 'documents de recherche', 'document de recherche'],
    'SCR': ['science response', 'science response report', 'réponse des sciences', 'réponses des sciences', 'réponse scientifique'],
    'PRO': ['proceedings series', 'compte rendu', 'compte rendus', 'comptes rendu', 'comptes rendus'],
    'SAR': ['science advisory report', 'avis scientifique'],
}
data = []
errors = dict()

def extract_pub_number(text, filename):
    text_snippet = re.sub(r"\s+", " ", text[:2000].lower())
    

    for key, phrases in type_dict.items():
        pattern = re.compile(r"\b(" + "|".join(re.escape(phrase.lower()) for phrase in phrases) + r")\b", flags=re.IGNORECASE)
        text_snippet = pattern.sub(key, text_snippet)
    

    patterns = [rf"{key}\.?(?:\s+no\.)?\s+\d{{2,4}}\s*/\s*\d{{1,3}}" for key in type_dict.keys()]

    
    for pattern in patterns:
        match = re.search(pattern, text_snippet, re.IGNORECASE)
        if match:
            pub_number = match.group()
            
            # Normalize 2-digit years to 4 digits (e.g., 77 -> 1977)
            pub_number = re.sub(r" (\d{2})/", lambda m: f" 19{m.group(1)}/" if int(m.group(1)) <= 99 else f" {m.group(1)}/", pub_number)
            
            # Zero-pad publication numbers (e.g., 15 -> 015)
            pub_number = re.sub(r"/(\d{1,2})(?!\d)", lambda m: f"/{int(m.group(1)):03}", pub_number)
            return pub_number
    
    errors[filename] = " ".join(text_snippet.split())
    return None

for year_folder in os.listdir(parsed_docs_folder):
    min_year = 1977
    max_year = 2024
    
    if year_folder.isnumeric():
        if min_year <= int(year_folder) <= max_year:
            year_path = os.path.join(parsed_docs_folder, year_folder)
            print(f"Analysing: {year_path}")
            
            if os.path.isdir(year_path):
                for json_file in os.listdir(year_path):
                    if json_file.endswith(".json"):
                        json_path = os.path.join(year_path, json_file)
                        with open(json_path, 'r', encoding='utf-8') as file:
                            json_data = json.load(file)
                            
                            filename = json_data.get('name')
                            url = json_data.get('url')
                            year = json_data.get('publicationYear')
                            text = json_data.get('text', '')
                            pub_number = extract_pub_number(text, filename)
                            
                            data.append({
                                'filename': filename,
                                'url': url,
                                'year': year,
                                'pub_number': pub_number
                            })

parsed_docs_df = pd.DataFrame(data)

len(errors)

Analysing: ..\ParsedPublications\1977
Analysing: ..\ParsedPublications\1978
Analysing: ..\ParsedPublications\1979
Analysing: ..\ParsedPublications\1980
Analysing: ..\ParsedPublications\1981
Analysing: ..\ParsedPublications\1982
Analysing: ..\ParsedPublications\1983
Analysing: ..\ParsedPublications\1984
Analysing: ..\ParsedPublications\1985
Analysing: ..\ParsedPublications\1986
Analysing: ..\ParsedPublications\1987
Analysing: ..\ParsedPublications\1988
Analysing: ..\ParsedPublications\1989
Analysing: ..\ParsedPublications\1990
Analysing: ..\ParsedPublications\1991
Analysing: ..\ParsedPublications\1992
Analysing: ..\ParsedPublications\1993
Analysing: ..\ParsedPublications\1994
Analysing: ..\ParsedPublications\1995
Analysing: ..\ParsedPublications\1996
Analysing: ..\ParsedPublications\1997
Analysing: ..\ParsedPublications\1998
Analysing: ..\ParsedPublications\1999
Analysing: ..\ParsedPublications\2000
Analysing: ..\ParsedPublications\2001
Analysing: ..\ParsedPublications\2002
Analysing: .

1900

In [193]:
# wish.com regex

parsed_docs_folder = os.path.join("..", "ParsedPublications")

type_dict = {
    'RES': ['research document', 'res. doc.', 'res doc', 'res.doc.', 'res no.', 'resdoc', 'res.', 'res', 'documents de recherche', 'document de recherche'],
    'SCR': ['science response', 'science response report', 'réponse des sciences', 'réponses des sciences', 'réponse scientifique'],
    'PRO': ['proceedings series', 'compte rendu', 'compte rendus', 'comptes rendu', 'comptes rendus'],
    'SAR': ['science advisory report', 'avis scientifique'],
}
data = []
errors = dict()

def extract_pub_number(text, filename):
    text_snippet = re.sub(r"\s+", " ", text[:200].replace("\xa0", " ").lower())
    
    for key, phrases in type_dict.items():
        for phrase in phrases:
            pattern = re.compile(rf"\b{re.escape(phrase.lower())}\b", flags=re.IGNORECASE)
            print(pattern)
            text_snippet = pattern.sub(key, text_snippet)
            print(text_snippet)
    
    patterns = [rf"{key}\.?(\s+doc)?(\s+no)?\s+\d{{2,4}}\s*/\s*\d{{1,3}}" for key in type_dict.keys()]
    
    
    for pattern in patterns:
        match = re.search(pattern, text_snippet, re.IGNORECASE)
        if match:
            pub_number = match.group()
            
            # Normalize 2-digit years to 4 digits (e.g., 77 -> 1977)
            pub_number = re.sub(r" (\d{2})/", lambda m: f" 19{m.group(1)}/" if int(m.group(1)) <= 99 else f" {m.group(1)}/", pub_number)
            
            # Zero-pad publication numbers (e.g., 15 -> 015)
            pub_number = re.sub(r"/(\d{1,2})(?!\d)", lambda m: f"/{int(m.group(1)):03}", pub_number)
            return pub_number
    
    errors[filename] = " ".join(text_snippet.split())
    return None

for year_folder in os.listdir(parsed_docs_folder):
    min_year = 1977
    max_year = 1977
    
    if year_folder.isnumeric():
        if min_year <= int(year_folder) <= max_year:
            year_path = os.path.join(parsed_docs_folder, year_folder)
            print(f"Analysing: {year_path}")
            
            if os.path.isdir(year_path):
                for json_file in os.listdir(year_path):
                    
                    if json_file.endswith(".json"):
                        json_path = os.path.join(year_path, json_file)
                        with open(json_path, 'r', encoding='utf-8') as file:
                            json_data = json.load(file)
                            
                            filename = json_data.get('name')
                            url = json_data.get('url')
                            year = json_data.get('publicationYear')
                            text = json_data.get('text', '')
                            pub_number = extract_pub_number(text, filename)
                            
                            data.append({
                                'filename': filename,
                                'url': url,
                                'year': year,
                                'pub_number': pub_number
                            })
                            break

parsed_docs_df = pd.DataFrame(data)

len(errors)

Analysing: ..\ParsedPublications\1977
re.compile('\\bresearch\\ document\\b', re.IGNORECASE)
 fr'• r res. doc. 77/1 this report not to be cited without prior reference to the author . tue grouping of herring populations in the southern gulf of st. lawrence by d.m. ware fine ecology labo
re.compile('\\bres\\.\\ doc\\.\\b', re.IGNORECASE)
 fr'• r res. doc. 77/1 this report not to be cited without prior reference to the author . tue grouping of herring populations in the southern gulf of st. lawrence by d.m. ware fine ecology labo
re.compile('\\bres\\ doc\\b', re.IGNORECASE)
 fr'• r res. doc. 77/1 this report not to be cited without prior reference to the author . tue grouping of herring populations in the southern gulf of st. lawrence by d.m. ware fine ecology labo
re.compile('\\bres\\.doc\\.\\b', re.IGNORECASE)
 fr'• r res. doc. 77/1 this report not to be cited without prior reference to the author . tue grouping of herring populations in the southern gulf of st. lawrence by d.m. ware f

1

In [200]:
# get rid of border because wrecks periods, sorted lists should be good enough
# TODO: clean up. I think this thing actually works, clean up then update replacements list 

parsed_docs_folder = os.path.join("..", "ParsedPublications")

type_dict = {
    'RES': ['research document', 'res. doc.', 'res doc', 'res.doc.', 'res no.', 'resdoc', 'res.', 'res', 'documents de recherche', 'document de recherche'],
    'SCR': ['science response', 'science response report', 'réponse des sciences', 'réponses des sciences', 'réponse scientifique'],
    'PRO': ['proceedings series', 'compte rendu', 'compte rendus', 'comptes rendu', 'comptes rendus'],
    'SAR': ['science advisory report', 'avis scientifique'],
}
data = []
errors = dict()

def extract_pub_number(text, filename):
    text_snippet = re.sub(r"\s+", " ", text[:2000].lower())
    

    for key, phrases in type_dict.items():
        pattern = re.compile(r"(" + "|".join(re.escape(phrase.lower()) for phrase in phrases) + r")", flags=re.IGNORECASE)
        text_snippet = pattern.sub(key, text_snippet)
    

    patterns = [rf"{key}\.?(?:\s+no\.)?\s+\d{{2,4}}\s*/\s*\d{{1,3}}" for key in type_dict.keys()]

    
    for pattern in patterns:
        match = re.search(pattern, text_snippet, re.IGNORECASE)
        if match:
            pub_number = match.group()
            
            # Normalize 2-digit years to 4 digits (e.g., 77 -> 1977)
            pub_number = re.sub(r" (\d{2})/", lambda m: f" 19{m.group(1)}/" if int(m.group(1)) <= 99 else f" {m.group(1)}/", pub_number)
            
            # Zero-pad publication numbers (e.g., 15 -> 015)
            pub_number = re.sub(r"/(\d{1,2})(?!\d)", lambda m: f"/{int(m.group(1)):03}", pub_number)
            return pub_number
    
    errors[filename] = " ".join(text_snippet.split())
    return None

for year_folder in os.listdir(parsed_docs_folder):
    min_year = 1977
    max_year = 2024
    
    if year_folder.isnumeric():
        if min_year <= int(year_folder) <= max_year:
            year_path = os.path.join(parsed_docs_folder, year_folder)
            print(f"Analysing: {year_path}")
            
            if os.path.isdir(year_path):
                for json_file in os.listdir(year_path):
                    if json_file.endswith(".json"):
                        json_path = os.path.join(year_path, json_file)
                        with open(json_path, 'r', encoding='utf-8') as file:
                            json_data = json.load(file)
                            
                            filename = json_data.get('name')
                            url = json_data.get('url')
                            year = json_data.get('publicationYear')
                            text = json_data.get('text', '')
                            pub_number = extract_pub_number(text, filename)
                            
                            data.append({
                                'filename': filename,
                                'url': url,
                                'year': year,
                                'pub_number': pub_number
                            })

parsed_docs_df = pd.DataFrame(data)

len(errors)

Analysing: ..\ParsedPublications\1977
Analysing: ..\ParsedPublications\1978
Analysing: ..\ParsedPublications\1979
Analysing: ..\ParsedPublications\1980
Analysing: ..\ParsedPublications\1981
Analysing: ..\ParsedPublications\1982
Analysing: ..\ParsedPublications\1983
Analysing: ..\ParsedPublications\1984
Analysing: ..\ParsedPublications\1985
Analysing: ..\ParsedPublications\1986
Analysing: ..\ParsedPublications\1987
Analysing: ..\ParsedPublications\1988
Analysing: ..\ParsedPublications\1989
Analysing: ..\ParsedPublications\1990
Analysing: ..\ParsedPublications\1991
Analysing: ..\ParsedPublications\1992
Analysing: ..\ParsedPublications\1993
Analysing: ..\ParsedPublications\1994
Analysing: ..\ParsedPublications\1995
Analysing: ..\ParsedPublications\1996
Analysing: ..\ParsedPublications\1997
Analysing: ..\ParsedPublications\1998
Analysing: ..\ParsedPublications\1999
Analysing: ..\ParsedPublications\2000
Analysing: ..\ParsedPublications\2001
Analysing: ..\ParsedPublications\2002
Analysing: .

1806

In [198]:
test_phrase = "fr'• r res. doc. 77/1 this report not to be cited without prior reference..."

# does not work
print(re.compile(rf"\b{re.escape("res.".lower())}\b", flags=re.IGNORECASE).sub('RES', test_phrase))

# works
print(re.compile(rf"\b{re.escape("res".lower())}\b", flags=re.IGNORECASE).sub('RES', test_phrase))


fr'• r res. doc. 77/1 this report not to be cited without prior reference...
fr'• r RES. doc. 77/1 this report not to be cited without prior reference...


In [199]:
test_phrase = "fr'• r res. doc. 77/1 this report not to be cited without prior reference..."

print(re.compile(rf"{re.escape("res.".lower())}", flags=re.IGNORECASE).sub('RES', test_phrase))
print(re.compile(rf"{re.escape("res".lower())}", flags=re.IGNORECASE).sub('RES', test_phrase))

fr'• r RES doc. 77/1 this report not to be cited without prior reference...
fr'• r RES. doc. 77/1 this report not to be cited without prior reference...


In [185]:
random_error = random.choice(list(errors.keys()))

print(random_error)
print(errors[random_error])

212967.pdf
fisheries pêches and oceans et océans mpo science région des maritimes rapport sur l’état des stocks a3-02 juin 1997 morue de sydney bight renseignements de base la sous-division 4vn est une zone ou viennent se mélanger les morues résidantes et les grands stocks des zones avoisinantes, le stock de 4t à l’ouest et le stock de 4vsw au sud. de plus, la morue de 4t passe l’hiver le long de la limite supérieure du talus continental, de sydney bight jusque dans la région du banquereau, quittant le golfe à la fin de l’automne pour y revenir au printemps. pendant cette période, les captures de morue dans la sous-division 4vn incluent des morues résidantes et des morues du golfe, bien que la morue de 4t compose la plus grande partie des prises, puisqu’il s’agit d’un stock beaucoup plus important. ainsi, des quantités inconnues de morue de 4vn sont capturées pendant la période d’hivernage. le mélange des morues du golfe du saint-laurent (4t) avec le stock résidant et l’incapacité de r

In [186]:
# errors_10k = errors.copy()
# errors_3k = errors.copy()
# errors_2k = errors.copy()
# errors_2k_v2 = errors.copy()
# error_diff_10k = [k for k in errors_2k if k not in errors_10k]
# errors_2k_v3 = errors.copy() 

In [188]:
errors_2k_v2['75586.pdf']

'fr\'• r RES. doc. 77/1 this report not to be cited without prior reference to the author . tue grouping of herring populations in the southern gulf of st. lawrence by d.m. ware fine ecology laboratory, bedford institute of oceanography, dartmouth, n.s. introduction in the course of analysing the growth of herring larvae in st. georges bay, nova scotia, i was puzzled by the fact that the observed growth rates, which agreed favourably with other estimates, seemed to be inconsistent with the back-calculated lengths-at-age 1. this report summarizes the problem, as i see it, that might exist with our current age determination and grouping of herring\'pbpulations in the southern gulf of st. lawrence. although the interpretation i suggest reconciles the larval growth rates with the back-calculated lengths-at-age 1, several points are quite speculative and require verification, or revision, on the basis of new evidence. however (from a management point of view), the implications of the existi

In [187]:
error_diff = [k for k in errors_2k_v3 if k not in errors_2k]
# these are from references, 10k is too long, 
#  even 3k is too long for some of them, some are correct...
#   96/87F, 96/92F - what is the F? 
parsed_docs_df[parsed_docs_df.filename.isin(error_diff)]

Unnamed: 0,filename,url,year,pub_number
0,75586.pdf,https://waves-vagues.dfo-mpo.gc.ca/library-bibliotheque/75586.pdf,1977,
2,75588.pdf,https://waves-vagues.dfo-mpo.gc.ca/library-bibliotheque/75588.pdf,1977,
3,75589.pdf,https://waves-vagues.dfo-mpo.gc.ca/library-bibliotheque/75589.pdf,1977,
4,75590.pdf,https://waves-vagues.dfo-mpo.gc.ca/library-bibliotheque/75590.pdf,1977,
5,75591.pdf,https://waves-vagues.dfo-mpo.gc.ca/library-bibliotheque/75591.pdf,1977,
6,75593.pdf,https://waves-vagues.dfo-mpo.gc.ca/library-bibliotheque/75593.pdf,1977,
7,75594.pdf,https://waves-vagues.dfo-mpo.gc.ca/library-bibliotheque/75594.pdf,1977,
8,75595.pdf,https://waves-vagues.dfo-mpo.gc.ca/library-bibliotheque/75595.pdf,1977,
14,75601.pdf,https://waves-vagues.dfo-mpo.gc.ca/library-bibliotheque/75601.pdf,1977,
15,75603.pdf,https://waves-vagues.dfo-mpo.gc.ca/library-bibliotheque/75603.pdf,1977,


In [83]:
parsed_docs_df.head()

Unnamed: 0,filename,url,year,pub_number
0,75586.pdf,https://waves-vagues.dfo-mpo.gc.ca/library-bibliotheque/75586.pdf,1977,RES 1977/001
1,75587.pdf,https://waves-vagues.dfo-mpo.gc.ca/library-bibliotheque/75587.pdf,1977,
2,75588.pdf,https://waves-vagues.dfo-mpo.gc.ca/library-bibliotheque/75588.pdf,1977,RES 1977/003
3,75589.pdf,https://waves-vagues.dfo-mpo.gc.ca/library-bibliotheque/75589.pdf,1977,RES 1977/004
4,75590.pdf,https://waves-vagues.dfo-mpo.gc.ca/library-bibliotheque/75590.pdf,1977,RES 1977/006


In [73]:
error_diff = {k: v for k, v in errors_2k.items() if k not in errors_10k}

In [74]:
def display_errors_in_notepad(errors):
    with tempfile.NamedTemporaryFile(delete=False, mode='w', suffix='.txt', encoding='utf-8') as temp_file:
        for k, v in errors.items():
            v = v.lower()
            
            if "stock status report" not in v:
                temp_file.write(f'++++++++++++++++++++ {k} ++++++++++++++++++++\n')
                temp_file.write(f'{v}\n\n')
        temp_file_name = temp_file.name
    
    os.system(f'notepad {temp_file_name}')
    os.unlink(temp_file_name)


display_errors_in_notepad(error_diff)

In [8]:
parsed_docs_df['pub_number'].isna().value_counts()

pub_number
False    6742
True     6010
Name: count, dtype: int64

In [18]:
parsed_docs_df.sample(10)

Unnamed: 0,filename,url,year,pub_number
4069,276692.pdf,https://waves-vagues.dfo-mpo.gc.ca/library-bibliotheque/276692.pdf,2002,
1426,167904.pdf,https://waves-vagues.dfo-mpo.gc.ca/library-bibliotheque/167904.pdf,1994,
1809,197630.pdf,https://waves-vagues.dfo-mpo.gc.ca/library-bibliotheque/197630.pdf,1996,
8660,364202.pdf,https://waves-vagues.dfo-mpo.gc.ca/library-bibliotheque/364202.pdf,2014,Science Advisory Report 2014/040
4797,315755.pdf,http://waves-vagues.dfo-mpo.gc.ca/Library/315755.pdf,2005,
12396,41220031.pdf,https://waves-vagues.dfo-mpo.gc.ca/library-bibliotheque/41220031.pdf,2023,Réponse des Sciences 2023/044
5606,335277.pdf,https://waves-vagues.dfo-mpo.gc.ca/library-bibliotheque/335277.pdf,2007,Proceedings Series 2008/001
8945,364478.pdf,https://waves-vagues.dfo-mpo.gc.ca/library-bibliotheque/364478.pdf,2015,Science Advisory Report 2015/003
4385,282442.pdf,http://waves-vagues.dfo-mpo.gc.ca/Library/282442.pdf,2003,
12626,41243924.pdf,https://waves-vagues.dfo-mpo.gc.ca/library-bibliotheque/41243924.pdf,2024,Avis scientifique 2024/027


# create pub_number for combined_df

# check all json for matches by year, type, pub_number

# Plan B: get json filename from url using BS4


In [14]:
# how many are html links vs pdf links?
display(combined_df['url_en'].str.endswith(('html', 'htm')).value_counts())
display(combined_df['url_en'].str.endswith('pdf').value_counts())


url_en
True     7685
False     815
Name: count, dtype: int64

url_en
False    7685
True      815
Name: count, dtype: int64

In [15]:
# TODO: check ParsedPublications (these seem to have the pdf url, hopefully names match) <== NOPE. just the direct link to the pdf. 
#  TODO: try creating a pub number and using that
#   TODO: use beautiful soup to look for href="*.pdf"?
 

# break into paragraph chunks

In [16]:
# TODO: break into paragraph chunks for better correlation 
#  TODO (OPTIONAL): clean excess characters
#  TODO (OPTIONAL): make sure french-friendly encoding is used (at least check if that makes a difference)
