In [1]:
import json
import os
import random
import re
import tempfile
import pandas as pd

from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>")) # jupyter notebook full-width display
display(HTML("<style>.dataframe td { white-space: nowrap; }</style>")) # no text wrapping

# pandas formatting
pd.set_option('display.float_format', '{:.1f}'.format)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_colwidth', 200)

# import url data

In [2]:
links_folder = "website_reports"

dataframes = []

for file in os.listdir(links_folder):
    if file.endswith(".xlsx"):
        file_path = os.path.join(links_folder, file)
        df = pd.read_excel(file_path)
        dataframes.append(df)

column_names = ['type', 'year', 'pub_number', '_', 'nom', 'name', 'url_fr', 'url_en', '_', '_', '_', '_']
combined_df = pd.concat(dataframes, ignore_index=True)
combined_df.columns = column_names

types = ['RES', 'SAR', 'PRO', 'SSR', 'SCR', 'ESR', 'HSR']
combined_df = combined_df[combined_df.type.isin(types)]
combined_df = combined_df.loc[:, combined_df.columns != '_']

combined_df.sample(10)

Unnamed: 0,type,year,pub_number,nom,name,url_fr,url_en
2162,RES,1983,1983/047,,Eastern Scotian Shelf haddock (4VW): Stock status in 1982 and projections to 1984,http://www.dfo-mpo.gc.ca/csas-sccs/publications/resdocs-docrech/1983/1983_047-fra.html,http://www.dfo-mpo.gc.ca/csas-sccs/publications/resdocs-docrech/1983/1983_047-eng.html
18039,SAR,2016,2016/037,"État de la population de béluga (Delphinapterus leucas) de la baie Cumberland, au Nunavut","Status of beluga (Delphinapterus leucas) in Cumberland Sound, Nunavut",http://www.dfo-mpo.gc.ca/csas-sccs/Publications/SAR-AS/2016/2016_037-fra.html,http://www.dfo-mpo.gc.ca/csas-sccs/Publications/SAR-AS/2016/2016_037-eng.html
2188,RES,1983,1983/059,,Preliminary analysis of A.T. Cameron - Lady Hammond comparative fishing experiments 1979-81,http://www.dfo-mpo.gc.ca/csas-sccs/publications/resdocs-docrech/1983/1983_059-fra.html,http://www.dfo-mpo.gc.ca/csas-sccs/publications/resdocs-docrech/1983/1983_059-eng.html
4554,RES,1994,1994/076,,Survey update for selected Scotia-Fundy groundfish stocks 12 September 1994,http://www.dfo-mpo.gc.ca/csas-sccs/publications/resdocs-docrech/1994/1994_076-fra.html,http://www.dfo-mpo.gc.ca/csas-sccs/publications/resdocs-docrech/1994/1994_076-eng.html
5447,RES,1997,1997/004,,Oceanographic conditions in the Newfoundland region during 1996 with comparisons to the 1961-1990 average.,http://www.dfo-mpo.gc.ca/csas-sccs/publications/resdocs-docrech/1997/1997_004-fra.htm,http://www.dfo-mpo.gc.ca/csas-sccs/publications/resdocs-docrech/1997/1997_004-eng.htm
1377,PRO,2022,2022/031,"Compte rendu de la réunion sur les avis scientifiques régional sur la surveillance de la zone de protection marine du Gully : Examen des activités de recherche, des indicateurs et des orientations...","Proceedings of the Regional Advisory Meeting on the Gully Marine Protected Area Monitoring: Review of Research Activities, Indicators, and Guidance on Next Steps; January 18-22 and October 12, 2021",https://www.dfo-mpo.gc.ca/csas-sccs/Publications/Pro-Cr/2022/2022_031-fra.html,https://www.dfo-mpo.gc.ca/csas-sccs/Publications/Pro-Cr/2022/2022_031-eng.html
13782,RES,2016,2016/017,Modèle de croissance de Von Bertalanffy d'une série chronologique de la cohorte de morues du Nord (Gadus morhua) et estimation de l'âge des morues marquées,"A cohort time-series Von Bertalanffy growth model for Northern cod (Gadus morhua), and estimation of the age of tagged cod",http://www.dfo-mpo.gc.ca/csas-sccs/Publications/ResDocs-DocRech/2016/2016_017-fra.html,http://www.dfo-mpo.gc.ca/csas-sccs/Publications/ResDocs-DocRech/2016/2016_017-eng.html
1179,PRO,2017,2017/040,Compte rendu de la réunion nationale d'examen par les pairs sur les Lignes directrices scientifiques relatives à la politique sur la protection des pêches : avis sur les techniques de compensation...,"Proceedings on the National Peer Review of Science guidance for Fisheries Protection Policy: Advice on Offsetting Techniques Managing the Productivity of Freshwater Fisheries; June 4-6, 2013",http://www.dfo-mpo.gc.ca/csas-sccs/Publications/Pro-Cr/2017/2017_040-fra.html,http://www.dfo-mpo.gc.ca/csas-sccs/Publications/Pro-Cr/2017/2017_040-eng.html
925,PRO,2013,2013/041,Compte rendu de la réunion du Processus consultatif scientifique de la Région des Maritimes au sujet de l'évaluation du stock de pétoncle du banc de Georges et du banc de Browns; 1 mai 2013,Proceedings of the Maritimes Region Science Advisory Process on the Assessment of Georges and Browns Bank Scallop; 1 May 2013,http://www.dfo-mpo.gc.ca/csas-sccs/Publications/Pro-Cr/2013/2013_041-fra.html,http://www.dfo-mpo.gc.ca/csas-sccs/Publications/Pro-Cr/2013/2013_041-eng.html
6580,RES,1999,1999/105,,Catch-at-Age of Northwest Atlantic Harp Seals 1952 - 1998.,http://www.dfo-mpo.gc.ca/csas-sccs/publications/resdocs-docrech/1999/1999_105-fra.htm,http://www.dfo-mpo.gc.ca/csas-sccs/publications/resdocs-docrech/1999/1999_105-eng.htm


In [3]:
# most docs with url_fr == url_en seem to be documents with translations in the same doc 
# DO NOT REMOVE THESE

combined_df[combined_df.url_fr == combined_df.url_en].sample(10)

Unnamed: 0,type,year,pub_number,nom,name,url_fr,url_en
44,PRO,1997,1997/019,,Fall Meeting: Marine Fisheries Subcommittee - Regional Advisory Process (RAP) of the Maritimes Region; 6-10 Octoer 1997,http://waves-vagues.dfo-mpo.gc.ca/Library/227009.pdf,http://waves-vagues.dfo-mpo.gc.ca/Library/227009.pdf
120,PRO,1999,1999/007,"Procès-verbaux des séances d'examen par les pairs pour le crabe des neiges du sud du Golfe St-Laurent, du 22 au 23 janvier 1998 et du 23 avril 1998",Proceedings of the Peer Review of Snow Crab in the Southern Gulf of St. Lawrence;22-23 January 1998 and 23 April 1998,http://waves-vagues.dfo-mpo.gc.ca/Library/234918.pdf,http://waves-vagues.dfo-mpo.gc.ca/Library/234918.pdf
41,PRO,1997,1997/024,,"Report of the PSARC Herring Subcommittee Meeting September 3-5, 1997 and the Steering Committee Meeting September 24, 1997",http://waves-vagues.dfo-mpo.gc.ca/Library/226303.pdf,http://waves-vagues.dfo-mpo.gc.ca/Library/226303.pdf
72,PRO,1998,1998/019,,"Report of the PSARC Groundfish Subcommittee Meeting November 23-26, 1998 and the Steering Committee Meeting December 16, 1998",http://waves-vagues.dfo-mpo.gc.ca/Library/234940.pdf,http://waves-vagues.dfo-mpo.gc.ca/Library/234940.pdf
29,PRO,1996,1996/005,,Proceedings of the 1996 Newfoundland Regional Shellfish Assessment,http://waves-vagues.dfo-mpo.gc.ca/Library/237125.pdf,http://waves-vagues.dfo-mpo.gc.ca/Library/237125.pdf
335,PRO,2003,2003/027,Compte rendu d'une réunion du Processus consultatif régional des provinces Maritimes sur le stock de crevette nordique de l'est du plateau néo-écossais (ZPC 13-15); le 25 novembre 2002.,Proceedings of a Maritimes Regional Advisory Process Meeting on Northern Shrimp on the Eastern Scotian Shelf (SFA 13-15); 25 November 2002.,http://waves-vagues.dfo-mpo.gc.ca/Library/277193.pdf,http://waves-vagues.dfo-mpo.gc.ca/Library/277193.pdf
579,PRO,2007,2007/040,"Compte rendu de l'atelier zonal, Examen de l'information dont dispose le secteur des Sciences du MPO sur le sébaste (Sebaste sp.) pour l'évaluation du COSEPAC, Discrimination des stocks de sébaste...","Proceedings of the Zonal Workshop Review of the Information on Redfish (Sebastes sp.) available from DFO Science Sector for the COSEWIC Assessment, Discrimination of Redfish Stocks of Units 1 and ...",http://waves-vagues.dfo-mpo.gc.ca/Library/343518.pdf,http://waves-vagues.dfo-mpo.gc.ca/Library/343518.pdf
19689,SSR,1999,1999/B2-02,,Stock Status Report Capelin in Subarea 2 + Div. 3KL,http://waves-vagues.dfo-mpo.gc.ca/Library/238443.pdf,http://waves-vagues.dfo-mpo.gc.ca/Library/238443.pdf
248,PRO,2001,2001/002,,"Newfoundland 2000 Shellfish RAP Proceedings; February 29 to March 2, 2000.",http://waves-vagues.dfo-mpo.gc.ca/Library/253405.pdf,http://waves-vagues.dfo-mpo.gc.ca/Library/253405.pdf
19515,SSR,1997,1997/C2-05,,Northern Shrimp off Newfoundland and Labrador,http://waves-vagues.dfo-mpo.gc.ca/Library/211713.pdf,http://waves-vagues.dfo-mpo.gc.ca/Library/211713.pdf


In [4]:
# how many pdf links are missing?

combined_df[combined_df['url_en'].str.endswith(('html', 'htm'))]['type'].value_counts()

type
RES    5064
SAR    1128
PRO     843
SCR     656
ESR       1
Name: count, dtype: int64

In [5]:
# create formatted pub number with type and number
combined_df['formatted_pub_number'] = combined_df['type'] + " " + combined_df['pub_number']

combined_df.sample(1).T

Unnamed: 0,1862
type,RES
year,1981
pub_number,1981/020
nom,
name,An update of the inshore cod stock in Subdivision 4Vn (May-Dec) for 1980
url_fr,http://www.dfo-mpo.gc.ca/csas-sccs/publications/resdocs-docrech/1981/1981_020-fra.html
url_en,http://www.dfo-mpo.gc.ca/csas-sccs/publications/resdocs-docrech/1981/1981_020-eng.html
formatted_pub_number,RES 1981/020


In [26]:
# how many are html links vs pdf links?
display(combined_df['url_en'].str.endswith(('html', 'htm')).value_counts())
display(combined_df['url_en'].str.endswith('pdf').value_counts())

url_en
True     7692
False    1369
Name: count, dtype: int64

url_en
False    7692
True     1369
Name: count, dtype: int64

# import ParsedPublication data

In [6]:
parsed_docs_folder = os.path.join("..", "ParsedPublications")

type_dict = {
    'RES': ['research document', 'res. doc.', 'res doc', 'res.doc.', 'res no.', 'resdoc', 'res.', 'res', 'documents de recherche', 'document de recherche'],
    'SCR': ['science response', 'science response report', 'réponse des sciences', 'réponses des sciences', 'réponse scientifique'],
    'PRO': ['proceedings series', 'compte rendu', 'compte rendus', 'comptes rendu', 'comptes rendus'],
    'SAR': ['science advisory report', 'avis scientifique'],
}
data = []
errors = dict()

def extract_pub_number(text, filename):
    text_snippet = re.sub(r"\s+", " ", text[:2000].lower())
    

    for key, phrases in type_dict.items():
        pattern = re.compile(r"(" + "|".join(re.escape(phrase.lower()) for phrase in phrases) + r")", flags=re.IGNORECASE)
        text_snippet = pattern.sub(key, text_snippet)
    

    patterns = [rf"{key}\.?(?:\s+no\.)?\s+\d{{2,4}}\s*/\s*\d{{1,3}}" for key in type_dict.keys()]

    
    for pattern in patterns:
        match = re.search(pattern, text_snippet, re.IGNORECASE)
        if match:
            pub_number = match.group()
            
            # Normalize 2-digit years to 4 digits (e.g., 77 -> 1977)
            pub_number = re.sub(r" (\d{2})/", lambda m: f" 19{m.group(1)}/" if int(m.group(1)) <= 99 else f" {m.group(1)}/", pub_number)
            
            # Zero-pad publication numbers (e.g., 15 -> 015)
            pub_number = re.sub(r"/(\d{1,2})(?!\d)", lambda m: f"/{int(m.group(1)):03}", pub_number)
            return pub_number
    
    errors[filename] = " ".join(text_snippet.split())
    return None

for year_folder in os.listdir(parsed_docs_folder):
    min_year = 1977
    max_year = 2024
    
    if year_folder.isnumeric():
        if min_year <= int(year_folder) <= max_year:
            year_path = os.path.join(parsed_docs_folder, year_folder)
            print(f"Analysing: {year_path}")
            
            if os.path.isdir(year_path):
                for json_file in os.listdir(year_path):
                    if json_file.endswith(".json"):
                        json_path = os.path.join(year_path, json_file)
                        with open(json_path, 'r', encoding='utf-8') as file:
                            json_data = json.load(file)
                            
                            filename = json_data.get('name')
                            url = json_data.get('url')
                            year = json_data.get('publicationYear')
                            text = json_data.get('text', '')
                            pub_number = extract_pub_number(text, filename)
                            
                            data.append({
                                'filename': filename,
                                'url': url,
                                'year': year,
                                'pub_number': pub_number
                            })

parsed_docs_df = pd.DataFrame(data)

print('\nnumber of missing pub numbers:', len(errors))

Analysing: ..\ParsedPublications\1977
Analysing: ..\ParsedPublications\1978
Analysing: ..\ParsedPublications\1979
Analysing: ..\ParsedPublications\1980
Analysing: ..\ParsedPublications\1981
Analysing: ..\ParsedPublications\1982
Analysing: ..\ParsedPublications\1983
Analysing: ..\ParsedPublications\1984
Analysing: ..\ParsedPublications\1985
Analysing: ..\ParsedPublications\1986
Analysing: ..\ParsedPublications\1987
Analysing: ..\ParsedPublications\1988
Analysing: ..\ParsedPublications\1989
Analysing: ..\ParsedPublications\1990
Analysing: ..\ParsedPublications\1991
Analysing: ..\ParsedPublications\1992
Analysing: ..\ParsedPublications\1993
Analysing: ..\ParsedPublications\1994
Analysing: ..\ParsedPublications\1995
Analysing: ..\ParsedPublications\1996
Analysing: ..\ParsedPublications\1997
Analysing: ..\ParsedPublications\1998
Analysing: ..\ParsedPublications\1999
Analysing: ..\ParsedPublications\2000
Analysing: ..\ParsedPublications\2001
Analysing: ..\ParsedPublications\2002
Analysing: .

In [7]:
parsed_docs_df.sample(10)

Unnamed: 0,filename,url,year,pub_number
7953,348870.pdf,https://waves-vagues.dfo-mpo.gc.ca/library-bibliotheque/348870.pdf,2013,SCR 2013/006
4190,272427.pdf,http://waves-vagues.dfo-mpo.gc.ca/Library/272427.pdf,2003,
1739,196229.pdf,http://waves-vagues.dfo-mpo.gc.ca/Library/196229.pdf,1996,
950,110727.pdf,https://waves-vagues.dfo-mpo.gc.ca/library-bibliotheque/110727.pdf,1989,RES 1989/019
5965,337722.pdf,https://waves-vagues.dfo-mpo.gc.ca/library-bibliotheque/337722.pdf,2008,PRO 2009/006
479,326165.pdf,https://waves-vagues.dfo-mpo.gc.ca/library-bibliotheque/326165.pdf,1984,RES 1984/061
7746,348762.pdf,https://waves-vagues.dfo-mpo.gc.ca/library-bibliotheque/348762.pdf,2012,SAR 2012/071
2851,238336-app3.pdf,https://waves-vagues.dfo-mpo.gc.ca/library-bibliotheque/238336-app3.pdf,1999,
2560,229660.pdf,http://waves-vagues.dfo-mpo.gc.ca/Library/229660.pdf,1998,
4316,279334.pdf,https://waves-vagues.dfo-mpo.gc.ca/library-bibliotheque/279334.pdf,2003,RES 2003/081


In [8]:
# check some random errors (mostly look like stock status reports)

random_error = random.choice(list(errors.keys()))

print(random_error)
print(errors[random_error])

270405.pdf
région du golfe rapport sur l’état des stocks c3-04(2002) août 2002 crabe commun du sud du golfe du saint-laurent (zones de pêche du homard 23, 24, 25, 26a et 26b) renseignements de base l’aire de distribution du crabe commun (cancer irroratus) s’étend de la zone intertidale jusqu’à une profondeur de 40 mètRES la croissance s’effectue au moyen de mues périodiques où le crabe se défait de sa carapace externe dure. la mue ralentit à mesure que le crabe commun vieillit et atteint sa maturité sexuelle. en moyenne, les crabes communs femelles et mâles parviennent à maturité à 57 et 75 mm RESpectivement. la femelle transporte les œufs sous son abdomen jusqu’à l’éclosion et les larves sont libérées dans la colonne d’eau où elles demeurent de la mi-juin à la mi-septembre avant de se fixer. les cinq zones de pêche du crabe commun sont les mêmes que les zones de pêche du homard (zph) : 23, 24, 25, 26a, et 26b. ces zones ne sont pas établies en fonction de la biologie du crabe commun, 

In [9]:
def display_errors_in_notepad(errors):
    n_char = 400
    
    with tempfile.NamedTemporaryFile(delete=False, mode='w', suffix='.txt', encoding='utf-8') as temp_file:
        for k, v in errors.items():
            v = v[:n_char].lower()
            
            if "stock status report" not in v:
                temp_file.write(f'++++++++++++++++++++ {k} ++++++++++++++++++++\n')
                temp_file.write(f'{v}\n\n')
        temp_file_name = temp_file.name
    
    os.system(f'notepad {temp_file_name}')
    os.unlink(temp_file_name)


display_errors_in_notepad(errors)

In [10]:
parsed_docs_df['pub_number'].isna().value_counts()

pub_number
False    10915
True      1837
Name: count, dtype: int64

# how many parsedpublications matches are missing for documents with translations?

In [11]:
missing_pub_numbers = [x for x in combined_df.formatted_pub_number.to_list() if x not in parsed_docs_df.pub_number.to_list()]

len(missing_pub_numbers)

1454

In [12]:
combined_df[combined_df.formatted_pub_number.isin(missing_pub_numbers) & combined_df['url_en'].str.endswith(('html', 'htm'))].drop(['pub_number', 'name', 'nom'], axis=1).sample(20)

Unnamed: 0,type,year,url_fr,url_en,formatted_pub_number
3226,RES,1988,http://www.dfo-mpo.gc.ca/csas-sccs/publications/resdocs-docrech/1988/1988_047-fra.html,http://www.dfo-mpo.gc.ca/csas-sccs/publications/resdocs-docrech/1988/1988_047-eng.html,RES 1988/047
4346,RES,1993,http://www.dfo-mpo.gc.ca/csas-sccs/Publications/ResDocs-DocRech/1993/1993_074-fra.html,http://www.dfo-mpo.gc.ca/csas-sccs/Publications/ResDocs-DocRech/1993/1993_074-eng.html,RES 1993/074
1642,RES,1979,http://www.dfo-mpo.gc.ca/csas-sccs/publications/resdocs-docrech/1979/1979_028-fra.html,http://www.dfo-mpo.gc.ca/csas-sccs/publications/resdocs-docrech/1979/1979_028-eng.html,RES 1979/028
2505,RES,1985,http://www.dfo-mpo.gc.ca/csas-sccs/publications/resdocs-docrech/1985/1985_034-fra.html,http://www.dfo-mpo.gc.ca/csas-sccs/publications/resdocs-docrech/1985/1985_034-eng.html,RES 1985/034
1480,PRO,2024,https://www.dfo-mpo.gc.ca/csas-sccs/Publications/Pro-Cr/2024/2024_043-fra.html,https://www.dfo-mpo.gc.ca/csas-sccs/Publications/Pro-Cr/2024/2024_043-eng.html,PRO 2024/043
4787,RES,1995,http://www.dfo-mpo.gc.ca/csas-sccs/publications/resdocs-docrech/1995/1995_054-fra.html,http://www.dfo-mpo.gc.ca/csas-sccs/publications/resdocs-docrech/1995/1995_054-eng.html,RES 1995/054
1952,RES,1981,http://www.dfo-mpo.gc.ca/csas-sccs/publications/resdocs-docrech/1981/1981_074-fra.html,http://www.dfo-mpo.gc.ca/csas-sccs/publications/resdocs-docrech/1981/1981_074-eng.html,RES 1981/074
1623,RES,1979,http://www.dfo-mpo.gc.ca/csas-sccs/publications/resdocs-docrech/1979/1979_014-fra.html,http://www.dfo-mpo.gc.ca/csas-sccs/publications/resdocs-docrech/1979/1979_014-eng.html,RES 1979/014
5728,RES,1997,http://www.dfo-mpo.gc.ca/csas-sccs/publications/resdocs-docrech/1997/1997_112-fra.htm,http://www.dfo-mpo.gc.ca/csas-sccs/publications/resdocs-docrech/1997/1997_112-eng.htm,RES 1997/112
6165,RES,1998,http://www.dfo-mpo.gc.ca/csas-sccs/publications/resdocs-docrech/1998/1998_125-fra.htm,http://www.dfo-mpo.gc.ca/csas-sccs/publications/resdocs-docrech/1998/1998_125-eng.htm,RES 1998/125


In [18]:
# manually check:

# errors['314807.pdf']  # this one is literally the only ESR with out PDF data, I didn't include it in the original script, but I could add manually: (id, fr, en) = (ESR 2004/006, 314807.pdf, 314806.pdf)
# a bunch have been withdrawn 
# errors['102275.pdf'] # not an error: translation is right inside the same document, single pdf (also 186156.pdf, 79442.pdf, and more)

errors['277.pdf']  # TODO: this doesn't match because the generated pub_number formatting was incorrect "Res. Doc. 81/ 51" was converted to "RES 1981/ 51", should be "RES 1981/051" 

KeyError: '277.pdf'

In [19]:
# TODO: somehow a pub_number of "RES 1981/ 51" was written instead of "RES 1981/051" 

parsed_docs_df[parsed_docs_df.filename == "277.pdf"]

Unnamed: 0,filename,url,year,pub_number
253,277.pdf,https://waves-vagues.dfo-mpo.gc.ca/library-bibliotheque/277.pdf,1981,RES 1981/ 51


In [20]:
# TODO: fix these mistakes
parsed_docs_df[parsed_docs_df['pub_number'].str.contains(r'\s.*\s', na=False)]

Unnamed: 0,filename,url,year,pub_number
6,75593.pdf,https://waves-vagues.dfo-mpo.gc.ca/library-bibliotheque/75593.pdf,1977,RES no. 1977/009
7,75594.pdf,https://waves-vagues.dfo-mpo.gc.ca/library-bibliotheque/75594.pdf,1977,RES no. 1977/010
8,75595.pdf,https://waves-vagues.dfo-mpo.gc.ca/library-bibliotheque/75595.pdf,1977,RES no. 1977/011
14,75601.pdf,https://waves-vagues.dfo-mpo.gc.ca/library-bibliotheque/75601.pdf,1977,RES no. 1977/017
16,75604.pdf,https://waves-vagues.dfo-mpo.gc.ca/library-bibliotheque/75604.pdf,1977,RES no. 1977/020
...,...,...,...,...
2331,227077.pdf,http://waves-vagues.dfo-mpo.gc.ca/Library/227077.pdf,1997,PRO 97 /009
2756,233840.pdf,https://waves-vagues.dfo-mpo.gc.ca/library-bibliotheque/233840.pdf,1999,RES 1999/ 1
3630,262788.pdf,http://waves-vagues.dfo-mpo.gc.ca/Library/262788.pdf,2001,PRO 2001/ 039
3728,264840.pdf,http://waves-vagues.dfo-mpo.gc.ca/Library/264840.pdf,2001,PRO 2001/ 041


# check single non-matching pdf for translations inside of the single pdf

In [24]:
# how many files are just a single link?

print(combined_df[combined_df.url_fr == combined_df.url_en].shape[0])

combined_df[combined_df.url_fr == combined_df.url_en].sample(10)

561


Unnamed: 0,type,year,pub_number,nom,name,url_fr,url_en,formatted_pub_number
19662,SSR,1999,1999/A6-07,,Yellowtail Rockfish,http://waves-vagues.dfo-mpo.gc.ca/Library/331709.pdf,http://waves-vagues.dfo-mpo.gc.ca/Library/331709.pdf,SSR 1999/A6-07
486,PRO,2006,2006/026,"Ateliers d'évaluation des dommages acceptables pour les espèces en péril d'eau douce dans la Région du Centre et de l'Arctique; les 18 et 19 octobre 2005, les 8 et 9 février 2006 et les 13 et 14 f...","Allowable Harm Analysis Workshops for Freshwater Species at Risk in Central and Arctic Region; October 18-19, 2005; February 8-9, 2006; and February 13-14, 2006.",http://waves-vagues.dfo-mpo.gc.ca/Library/325726.pdf,http://waves-vagues.dfo-mpo.gc.ca/Library/325726.pdf,PRO 2006/026
19513,SSR,1997,1997/B2-02,,Capelin in Subarea 2 + Div. 3KL,https://waves-vagues.dfo-mpo.gc.ca/library-bibliotheque/211863.pdf,https://waves-vagues.dfo-mpo.gc.ca/library-bibliotheque/211863.pdf,SSR 1997/B2-02
19108,SCR,2021,2021/014,Mise à jour de 2020 sur l'état des stocks de crevettes nordiques de l'est du plateau néo-écossais (ZPC 13-15),2020 Stock Status Update of the Eastern Scotian Shelf Northern Shrimp (SFAs 13-15),http://www.dfo-mpo.gc.ca/csas-sccs/Publications/ScR-RS/2021/2021_014-eng.html,http://www.dfo-mpo.gc.ca/csas-sccs/Publications/ScR-RS/2021/2021_014-eng.html,SCR 2021/014
515,PRO,2006,2006/038,Compte rendu de la réunion du sous-comité sur le poisson de fond du CEESP; 21 et 22 novembre 2006,"Proceedings of the PSARC Groundfish Subcommittee Meeting; November 21-22, 2006",http://waves-vagues.dfo-mpo.gc.ca/Library/327840.pdf,http://waves-vagues.dfo-mpo.gc.ca/Library/327840.pdf,PRO 2006/038
542,PRO,2007,2007/026,Compte rendu de la réunion du sous-comité des poissons pélagiques du CEESP; 17-18 janvier 2007,"Proceedings of the PSARC Groundfish Subcommittee Meeting; January 17-18, 2007",http://waves-vagues.dfo-mpo.gc.ca/Library/330757.pdf,http://waves-vagues.dfo-mpo.gc.ca/Library/330757.pdf,PRO 2007/026
417,PRO,2004,2004/029,,"Proceedings of the PSARC Pelagic Subcommittee Meeting; September 8, 2004",http://waves-vagues.dfo-mpo.gc.ca/Library/315903.pdf,http://waves-vagues.dfo-mpo.gc.ca/Library/315903.pdf,PRO 2004/029
32,PRO,1996,1996/001,,Proceedings of Peer Review and Client Consultations for Diadromous Fish Stocks in the Maritime Provinces in 1995,http://waves-vagues.dfo-mpo.gc.ca/Library/197005.pdf,http://waves-vagues.dfo-mpo.gc.ca/Library/197005.pdf,PRO 1996/001
120,PRO,1999,1999/007,"Procès-verbaux des séances d'examen par les pairs pour le crabe des neiges du sud du Golfe St-Laurent, du 22 au 23 janvier 1998 et du 23 avril 1998",Proceedings of the Peer Review of Snow Crab in the Southern Gulf of St. Lawrence;22-23 January 1998 and 23 April 1998,http://waves-vagues.dfo-mpo.gc.ca/Library/234918.pdf,http://waves-vagues.dfo-mpo.gc.ca/Library/234918.pdf,PRO 1999/007
501,PRO,2006,2006/035,Compte rendu des réunions du Processus consultatif des provinces Maritimes au sujet des stocks de poisson de fond de Scotia-Fundy; le 23 octobre 2006 et les 16 et 17 novembre 2006.,Proceedings of the Maritimes Regional Advisory Process on the Assessments of Scotia-Fundy Groundfish Stocks; 23 October 2006 and 16 - 17 November 2006.,http://waves-vagues.dfo-mpo.gc.ca/Library/326920.pdf,http://waves-vagues.dfo-mpo.gc.ca/Library/326920.pdf,PRO 2006/035


# break into paragraph chunks

In [17]:
# TODO: break into paragraph chunks for better correlation 
#  TODO (OPTIONAL): clean excess characters
#  TODO (OPTIONAL): make sure french-friendly encoding is used (at least check if that makes a difference)
