In [1]:
import json
import os
import pandas as pd
import concurrent.futures

from IPython.display import display, HTML
# Notebook-wide CSS overrides, injected one <style> tag at a time.
notebook_styles = (
    "<style>.container { width:100% !important; }</style>",  # jupyter notebook full-width display
    "<style>.dataframe td { white-space: nowrap; }</style>",  # no text wrapping
)
for style_tag in notebook_styles:
    display(HTML(style_tag))

# pandas formatting
pandas_display_options = {
    'display.float_format': '{:.1f}'.format,  # render floats with one decimal place
    'display.max_columns': None,              # no limit on displayed columns
    'display.max_rows': 100,
    'display.max_colwidth': 200,
}
for option_name, option_value in pandas_display_options.items():
    pd.set_option(option_name, option_value)

# import website URL data

In [2]:
links_folder = "website_reports"

dataframes = []

# os.listdir() returns entries in arbitrary order; sorting makes the concatenated
# row order (and therefore the integer index) reproducible between runs.
for file in sorted(os.listdir(links_folder)):
    if file.endswith(".xlsx"):
        file_path = os.path.join(links_folder, file)
        df = pd.read_excel(file_path)
        dataframes.append(df)

# Positional names for the 12 spreadsheet columns; '_' marks columns that are dropped below.
column_names = ['type', 'year', 'pub_number', '_', 'nom', 'name', 'url_fr', 'url_en', '_', '_', '_', '_']
combined_df = pd.concat(dataframes, ignore_index=True)
combined_df.columns = column_names

# Keep only the listed publication types. The explicit .copy() detaches the filtered
# rows so the column assignment below writes to an independent frame instead of a
# slice of the concatenated data (avoids pandas' chained-assignment warning).
types = ['RES', 'SAR', 'PRO', 'SSR', 'SCR', 'ESR', 'HSR']
combined_df = combined_df[combined_df['type'].isin(types)].copy()

# create formatted pub number with type and number
combined_df['pub_number'] = combined_df['type'] + " " + combined_df['pub_number']

# Column subset to keep; copied so new columns can be added to combined_df later
# without chained-assignment warnings.
columns = ['pub_number', 'year', 'nom', 'name', 'url_fr', 'url_en']
combined_df = combined_df[columns].copy()

# how many are html links vs pdf links?
display(combined_df['url_en'].str.endswith(('html', 'htm')).value_counts())
display(combined_df['url_en'].str.endswith('pdf').value_counts())

url_en
True     7692
False    1369
Name: count, dtype: int64

url_en
False    7692
True     1369
Name: count, dtype: int64

In [3]:
# populate pdf filenames where they exist

# For each language, take the last path segment of the URL as the candidate
# filename, then blank it out unless it ends with '.pdf'.
for lang in ('fr', 'en'):
    url_col, filename_col = f'url_{lang}', f'filename_{lang}'
    combined_df[filename_col] = combined_df[url_col].str.split('/').str[-1]
    # na=False: a missing URL makes .str.endswith return NaN, and applying `~` to an
    # object-dtype mask containing NaN can raise TypeError. Treat missing as "not a PDF".
    is_pdf = combined_df[filename_col].str.endswith('.pdf', na=False)
    combined_df.loc[~is_pdf, filename_col] = None

# Spot-check one random row without a French PDF filename and one with it.
display(combined_df[combined_df['filename_fr'].isna()].sample(1).T)
display(combined_df[combined_df['filename_fr'].notna()].sample(1).T)

Unnamed: 0,3086
pub_number,RES 1987/088
year,1987
nom,
name,Results of the acoustic survey of herring stocks in NAFO Divisions 4T and 4Vn - November 1986
url_fr,http://www.dfo-mpo.gc.ca/csas-sccs/publications/resdocs-docrech/1987/1987_088-fra.html
url_en,http://www.dfo-mpo.gc.ca/csas-sccs/publications/resdocs-docrech/1987/1987_088-eng.html
filename_fr,
filename_en,


Unnamed: 0,19970
pub_number,SSR 2003/020
year,2003
nom,Morue de l'est du plateau néo-écossais
name,Eastern Scotian Shelf Cod
url_fr,http://waves-vagues.dfo-mpo.gc.ca/Library/271083.pdf
url_en,http://waves-vagues.dfo-mpo.gc.ca/Library/273373.pdf
filename_fr,271083.pdf
filename_en,273373.pdf


# import parsed publication URL data

In [4]:
# Root folder that is scanned for numerically named year sub-folders.
parsed_docs_folder = os.path.join("..", "ParsedPublications")

# Inclusive bounds on the year folders that get loaded.
min_year = 1977
max_year = 2024

# Accumulator for the per-file records gathered from every year folder.
data = list()

def process_file(json_path):
    """Read one JSON file and extract its name, publication year and URL.

    Args:
        json_path: Path to a UTF-8 encoded JSON file. The parsed content is
            expected to be a JSON object, since ``.get`` is called on it.

    Returns:
        dict: ``{'filename': ..., 'year': ..., 'url': ...}`` taken from the
        ``name``, ``publicationYear`` and ``url`` keys respectively; a key
        that is absent from the file yields ``None``.
    """
    with open(json_path, 'r', encoding='utf-8') as handle:
        record = json.load(handle)

    # (output column, key in the JSON document)
    field_map = (
        ('filename', 'name'),
        ('year', 'publicationYear'),
        ('url', 'url'),
    )
    return {column: record.get(source_key) for column, source_key in field_map}

def process_folder(year_path):
    """Gather the extracted record of every ``.json`` entry directly inside a folder.

    Args:
        year_path: Path of the directory to scan (sub-directories are not descended into).

    Returns:
        list: One ``process_file`` result per entry whose name ends with
        ``.json``, in the order ``os.listdir`` reports them.
    """
    json_names = (entry for entry in os.listdir(year_path) if entry.endswith(".json"))
    return [process_file(os.path.join(year_path, entry)) for entry in json_names]

# Load every year folder within [min_year, max_year] concurrently, one task per folder.
with concurrent.futures.ThreadPoolExecutor() as executor:
    # isdecimal() rather than isnumeric(): strings such as '½' or '²' count as
    # "numeric" but make int() raise ValueError. Sorting by year makes the
    # submission order independent of os.listdir()'s arbitrary ordering.
    year_folders = sorted(
        (
            year_folder
            for year_folder in os.listdir(parsed_docs_folder)
            if year_folder.isdecimal() and min_year <= int(year_folder) <= max_year
        ),
        key=int,
    )

    futures = []
    for year_folder in year_folders:
        year_path = os.path.join(parsed_docs_folder, year_folder)
        if os.path.isdir(year_path):
            futures.append(executor.submit(process_folder, year_path))

    # Collect in submission (year) order instead of completion order, so the order
    # of the year groups in the resulting frame does not depend on thread timing.
    # The folders are still processed in parallel.
    for future in futures:
        data.extend(future.result())

parsed_docs_df = pd.DataFrame(data) # this took 3 seconds
# Set of all parsed-publication URLs.
unmatched_url = set(parsed_docs_df['url'].to_list())

# confirm all url are distinct (every URL should appear exactly once)
parsed_docs_df['url'].value_counts().value_counts()

count
1    12752
Name: count, dtype: int64

# crawl with bs4 (BeautifulSoup) to get the remaining filename_fr and filename_en

# break into paragraph chunks

In [5]:
# TODO: break into paragraph chunks for better correlation
#  TODO (OPTIONAL): clean excess characters
#  TODO (OPTIONAL): make sure a French-friendly encoding is used (at least check whether that makes a difference)
