In [1]:
import json
import os
import pandas as pd

from IPython.display import display, HTML
# Notebook-wide CSS overrides; each rule is injected as its own <style> element.
for css_rule in (
    "<style>.container { width:100% !important; }</style>",  # jupyter notebook full-width display
    "<style>.dataframe td { white-space: nowrap; }</style>",  # no text wrapping
):
    display(HTML(css_rule))

# pandas formatting: one decimal for floats, show every column, cap the number
# of displayed rows and the width of each cell.
pd.options.display.float_format = '{:.1f}'.format
pd.options.display.max_columns = None
pd.options.display.max_rows = 100
pd.options.display.max_colwidth = 200

# import website url data

In [2]:
# Folder that holds the website report spreadsheets (.xlsx).
links_folder = "website_reports"

dataframes = []

# os.listdir() returns entries in arbitrary order; sorting keeps the
# concatenation order (and therefore the combined index) reproducible.
for file in sorted(os.listdir(links_folder)):
    if file.endswith(".xlsx"):
        file_path = os.path.join(links_folder, file)
        df = pd.read_excel(file_path)
        dataframes.append(df)

# Positional names for the 12 spreadsheet columns; '_' marks columns that are
# not selected further down.
column_names = ['type', 'year', 'pub_number', '_', 'nom', 'name', 'url_fr', 'url_en', '_', '_', '_', '_']
combined_df = pd.concat(dataframes, ignore_index=True)
combined_df.columns = column_names

# Keep only rows whose type code is in this list.
types = ['RES', 'SAR', 'PRO', 'SSR', 'SCR', 'ESR', 'HSR']
# .copy() makes the filtered frame independent of the unfiltered one, so the
# column assignment below does not trigger SettingWithCopyWarning.
combined_df = combined_df[combined_df['type'].isin(types)].copy()

# create formatted pub number with type and number
combined_df['pub_number'] = combined_df['type'] + " " + combined_df['pub_number']

columns = ['pub_number', 'year', 'nom', 'name', 'url_fr', 'url_en']
combined_df = combined_df[columns]

# how many are html links vs pdf links?
display(combined_df['url_en'].str.endswith(('html', 'htm')).value_counts())
display(combined_df['url_en'].str.endswith('pdf').value_counts())

url_en
True     7692
False    1369
Name: count, dtype: int64

url_en
False    7692
True     1369
Name: count, dtype: int64

In [3]:
# populate pdf filenames where they exist

# The last path segment of each URL is the candidate filename.
combined_df['filename_fr'] = combined_df['url_fr'].str.split('/').str[-1]
combined_df['filename_en'] = combined_df['url_en'].str.split('/').str[-1]

# Blank out every candidate that is not a .pdf. na=False makes rows with a
# missing URL count as "not a pdf"; without it the mask contains NaN and the
# `~` inversion raises a TypeError.
combined_df.loc[~combined_df['filename_fr'].str.endswith('.pdf', na=False), 'filename_fr'] = None
combined_df.loc[~combined_df['filename_en'].str.endswith('.pdf', na=False), 'filename_en'] = None

In [4]:
# Show one random row, transposed so each column is printed on its own line.
combined_df.sample(n=1).transpose()

Unnamed: 0,5903
pub_number,RES 1998/030
year,1998
nom,
name,"Status of Atlantic salmon stocks of southwest New Brunswick, 1996."
url_fr,http://www.dfo-mpo.gc.ca/csas-sccs/publications/resdocs-docrech/1998/1998_030-fra.htm
url_en,http://www.dfo-mpo.gc.ca/csas-sccs/publications/resdocs-docrech/1998/1998_030-eng.htm
filename_fr,
filename_en,


# import parsed publication url data

In [8]:
# Root folder of the parsed publications; only numerically named sub-folders
# inside the [min_year, max_year] window are read.
parsed_docs_folder = os.path.join("..", "ParsedPublications")

min_year, max_year = 1977, 2024
data = []

for year_folder in os.listdir(parsed_docs_folder):
    # Guard clauses: skip entries that are not a year within the window.
    if not year_folder.isnumeric():
        continue
    if not (min_year <= int(year_folder) <= max_year):
        continue

    year_path = os.path.join(parsed_docs_folder, year_folder)
    print(f"Analysing: {year_path}")  # printed before the directory check below

    if not os.path.isdir(year_path):
        continue

    for json_file in os.listdir(year_path):
        if not json_file.endswith(".json"):
            continue

        json_path = os.path.join(year_path, json_file)
        with open(json_path, 'r', encoding='utf-8') as handle:
            json_data = json.load(handle)

        # One record per JSON file; keys absent from the JSON become None.
        data.append({
            'filename': json_data.get('name'),
            'year': json_data.get('publicationYear'),
            'url': json_data.get('url'),
        })

parsed_docs_df = pd.DataFrame(data) # this took 3 minutes (7s after run once)
unmatched_url = set(parsed_docs_df['url'].to_list())

# Distribution of "how many times does each url occur"; a single row with
# index 1 means every url is distinct.
parsed_docs_df['url'].value_counts().value_counts()  # all distinct

Analysing: ..\ParsedPublications\1977
Analysing: ..\ParsedPublications\1978
Analysing: ..\ParsedPublications\1979
Analysing: ..\ParsedPublications\1980
Analysing: ..\ParsedPublications\1981
Analysing: ..\ParsedPublications\1982
Analysing: ..\ParsedPublications\1983
Analysing: ..\ParsedPublications\1984
Analysing: ..\ParsedPublications\1985
Analysing: ..\ParsedPublications\1986
Analysing: ..\ParsedPublications\1987
Analysing: ..\ParsedPublications\1988
Analysing: ..\ParsedPublications\1989
Analysing: ..\ParsedPublications\1990
Analysing: ..\ParsedPublications\1991
Analysing: ..\ParsedPublications\1992
Analysing: ..\ParsedPublications\1993
Analysing: ..\ParsedPublications\1994
Analysing: ..\ParsedPublications\1995
Analysing: ..\ParsedPublications\1996
Analysing: ..\ParsedPublications\1997
Analysing: ..\ParsedPublications\1998
Analysing: ..\ParsedPublications\1999
Analysing: ..\ParsedPublications\2000
Analysing: ..\ParsedPublications\2001
Analysing: ..\ParsedPublications\2002
Analysing: .

count
1    12752
Name: count, dtype: int64

In [9]:
# Possibly faster variant of the loader above: each year folder is read in a worker thread

import concurrent.futures

# Inputs for the threaded loader: root folder, inclusive year window, and the
# list that collects one record per JSON file.
parsed_docs_folder = os.path.join("..", "ParsedPublications")
min_year = 1977
max_year = 2024
data = []

def process_file(json_path):
    """Read one JSON file and return its filename, year and url fields.

    Args:
        json_path: Path of a UTF-8 encoded JSON file whose top-level value
            supports ``.get`` (i.e. a JSON object).

    Returns:
        dict with keys 'filename', 'year' and 'url', taken from the JSON keys
        'name', 'publicationYear' and 'url'. A key that is absent from the
        JSON yields None.
    """
    with open(json_path, 'r', encoding='utf-8') as handle:
        record = json.load(handle)

    # Output column -> key to look up in the JSON document.
    source_keys = {'filename': 'name', 'year': 'publicationYear', 'url': 'url'}
    return {column: record.get(key) for column, key in source_keys.items()}

def process_folder(year_path):
    """Run process_file on every ``.json`` entry directly inside a folder.

    Args:
        year_path: Path of the folder to scan (not recursive).

    Returns:
        list of the process_file results, in os.listdir() order. Entries
        whose name does not end with ".json" are ignored.
    """
    return [
        process_file(os.path.join(year_path, entry))
        for entry in os.listdir(year_path)
        if entry.endswith(".json")
    ]

# Submit one process_folder task per in-window year folder to a thread pool.
with concurrent.futures.ThreadPoolExecutor() as executor:
    futures = []
    for year_folder in os.listdir(parsed_docs_folder):
        if year_folder.isnumeric() and min_year <= int(year_folder) <= max_year:
            year_path = os.path.join(parsed_docs_folder, year_folder)
            if os.path.isdir(year_path):
                futures.append(executor.submit(process_folder, year_path))

    # Collect in submission order rather than completion order, so the row
    # order of the resulting frame does not depend on thread scheduling and
    # follows the same folder order as a plain sequential loop.
    # future.result() blocks until that task is done and re-raises any
    # exception raised inside the worker.
    for future in futures:
        data.extend(future.result())

parsed_docs_df_v2 = pd.DataFrame(data) # this took 3 seconds
unmatched_url_v2 = set(parsed_docs_df_v2['url'].to_list())

In [10]:
# Cross-check the two loaders. A single outer merge on all shared columns is
# enough: the `_merge` indicator column labels each row as 'both',
# 'left_only' or 'right_only'.
comparison = parsed_docs_df.merge(parsed_docs_df_v2, how='outer', indicator=True)

# Find rows in parsed_docs_df not in parsed_docs_df_v2
diff_1 = comparison.query("_merge == 'left_only'")

# Find rows in parsed_docs_df_v2 not in parsed_docs_df
diff_2 = comparison.query("_merge == 'right_only'")

if diff_1.empty and diff_2.empty:
    print("DataFrames are identical.")
else:
    print("Differences found:")
    print("In parsed_docs_df but not in parsed_docs_df_v2:")
    print(diff_1)
    print("In parsed_docs_df_v2 but not in parsed_docs_df:")
    print(diff_2)

# Both url collections are already sets, so they compare without regard to order.
if unmatched_url == unmatched_url_v2:
    print("Lists are identical (ignoring order).")
else:
    print("Lists are different.")


DataFrames are identical.
Lists are identical (ignoring order).


In [8]:
# confirm parsed_docs_df['url'] are all unique (they are)
#  TODO: make sure when deleting from set, both fr and en links are accounted for


count
1    12752
Name: count, dtype: int64

# crawl bs4 to get remaining filename_fr and filename_en

# break into paragraph chunks

In [9]:
# TODO: break into paragraph chunks for better correlation 
#  TODO (OPTIONAL): clean excess characters
#  TODO (OPTIONAL): make sure french-friendly encoding is used (at least check if that makes a difference)
