In [1]:
import json
import os
import numpy as np
import pandas as pd
import re
import requests
import concurrent.futures

from bs4 import BeautifulSoup

from IPython.display import display, HTML
# Notebook presentation: stretch the page to the full browser width and keep
# DataFrame cells on a single line (no text wrapping).
notebook_styles = [
    "<style>.container { width:100% !important; }</style>",
    "<style>.dataframe td { white-space: nowrap; }</style>",
]
for style_tag in notebook_styles:
    display(HTML(style_tag))

# pandas formatting: one decimal place for floats, show every column, and cap
# the number of displayed rows and the displayed column width.
pandas_display_options = {
    'display.float_format': '{:.1f}'.format,
    'display.max_columns': None,
    'display.max_rows': 100,
    'display.max_colwidth': 200,
}
for option_name, option_value in pandas_display_options.items():
    pd.set_option(option_name, option_value)

# import website url data

In [70]:
links_folder = "website_reports"

# Read every Excel report in the folder.
# - The listing is sorted because os.listdir returns names in arbitrary order,
#   which would make the row order of the combined frame differ between runs.
# - Names starting with "~$" are the lock files Excel leaves next to an open
#   workbook; they end in .xlsx but are not readable workbooks.
dataframes = []

for file in sorted(os.listdir(links_folder)):
    if file.endswith(".xlsx") and not file.startswith("~$"):
        file_path = os.path.join(links_folder, file)
        df = pd.read_excel(file_path)
        dataframes.append(df)

# The reports are given these 12 positional column names; '_' marks columns
# that are not used below.
column_names = ['type', 'year', 'pub_number', '_', 'nom', 'name', 'url_fr', 'url_en', '_', '_', '_', '_']
combined_df = pd.concat(dataframes, ignore_index=True)
combined_df.columns = column_names

# Keep only the listed publication types. `.copy()` makes the filtered frame
# independent of the original, so the column assignment below does not raise
# SettingWithCopyWarning.
types = ['RES', 'SAR', 'PRO', 'SSR', 'SCR', 'ESR', 'HSR']
combined_df = combined_df[combined_df['type'].isin(types)].copy()

# create formatted pub number with type and number
combined_df['pub_number'] = combined_df['type'] + " " + combined_df['pub_number']

columns = ['pub_number', 'year', 'nom', 'name', 'url_fr', 'url_en']
combined_df = combined_df[columns].reset_index(drop=True)

In [3]:
# populate pdf filenames where they exist

# The last path segment of each URL is the candidate filename.
combined_df['filename_fr'] = combined_df['url_fr'].str.split('/').str[-1]
combined_df['filename_en'] = combined_df['url_en'].str.split('/').str[-1]

# Boolean masks of rows whose URL points directly at a PDF.
# - na=False: a missing URL gives a NaN filename; without it the mask would
#   contain NaN, which cannot be negated with `~`.
# - lower(): the suffix check is case-insensitive, matching find_pdf_link.
is_pdf_fr = combined_df['filename_fr'].str.lower().str.endswith('.pdf', na=False)
is_pdf_en = combined_df['filename_en'].str.lower().str.endswith('.pdf', na=False)

combined_df.loc[~is_pdf_fr, 'filename_fr'] = None
combined_df.loc[~is_pdf_en, 'filename_en'] = None

# add file_url columns: the URL itself when it is a PDF, otherwise None.
# Driven by the explicit masks rather than the truthiness of the filename
# column, because NaN is truthy and would be treated as "has a file".
combined_df['file_url_fr'] = np.where(is_pdf_fr, combined_df['url_fr'], None)
combined_df['file_url_en'] = np.where(is_pdf_en, combined_df['url_en'], None)

# display(combined_df[combined_df['filename_fr'].isna()].sample(1).T)
# display(combined_df[combined_df['filename_fr'].notna()].sample(1).T)

# import parsed publication url data

In [4]:
# Folder holding the parsed publication JSON files (read one sub-folder per year).
parsed_docs_folder = os.path.join("..", "ParsedPublications")
# Inclusive range of year folders to load.
min_year = 1977
max_year = 2024
# Accumulates one record per parsed publication file.
data = list()

def process_file(json_path):
    """Read one parsed-publication JSON file and return its key metadata.

    Args:
        json_path: Path of a UTF-8 encoded JSON file.

    Returns:
        A dict with keys 'filename', 'year' and 'url', taken from the JSON
        keys 'name', 'publicationYear' and 'url'. A key that is absent from
        the JSON maps to None.
    """
    with open(json_path, 'r', encoding='utf-8') as handle:
        record = json.load(handle)

    # output column -> key in the JSON document
    field_map = {'filename': 'name', 'year': 'publicationYear', 'url': 'url'}
    return {column: record.get(source_key) for column, source_key in field_map.items()}

def process_folder(year_path):
    """Collect the metadata of every ``.json`` file directly inside a folder.

    Args:
        year_path: Directory to scan (not recursive).

    Returns:
        A list with one ``process_file`` result per ``.json`` file, in the
        order ``os.listdir`` reports the files.
    """
    return [
        process_file(os.path.join(year_path, entry))
        for entry in os.listdir(year_path)
        if entry.endswith(".json")
    ]

# Process each year folder on a worker thread and gather the records as the
# workers finish (so the record order follows completion order).
with concurrent.futures.ThreadPoolExecutor() as executor:
    # Folders whose name is a number inside [min_year, max_year].
    candidate_paths = (
        os.path.join(parsed_docs_folder, entry)
        for entry in os.listdir(parsed_docs_folder)
        if entry.isnumeric() and min_year <= int(entry) <= max_year
    )
    futures = [
        executor.submit(process_folder, candidate)
        for candidate in candidate_paths
        if os.path.isdir(candidate)
    ]

    for finished in concurrent.futures.as_completed(futures):
        data.extend(finished.result())

# Original author's note: building this took about 3 seconds.
parsed_docs_df = pd.DataFrame(data)
# URLs of parsed publications not yet matched to a website report.
unmatched_url = set(parsed_docs_df['url'].tolist())

# Sanity check that all URLs are distinct: the displayed table should have a
# single row, for an occurrence count of 1.
parsed_docs_df['url'].value_counts().value_counts()

count
1    12752
Name: count, dtype: int64

# crawl the report pages with BeautifulSoup (bs4) to find the remaining file_url_fr and file_url_en

In [56]:
# Which link suffixes occur? -> 'html', 'htm', 'pdf'
# For each language, count the URLs ending in html/htm and those ending in pdf
# to confirm that the two groups cover every row.
for url_column in ('url_en', 'url_fr'):
    urls = combined_df[url_column]
    display(urls.str.endswith(('html', 'htm')).value_counts())
    display(urls.str.endswith('pdf').value_counts())

url_en
True     7692
False    1369
Name: count, dtype: int64

url_en
False    7692
True     1369
Name: count, dtype: int64

url_fr
True     7692
False    1369
Name: count, dtype: int64

url_fr
False    7692
True     1369
Name: count, dtype: int64

In [52]:
# Module-level log: maps a URL (or a 'debug_*' key) to a message or debug value.
errors = dict()


def find_pdf_link(url, debug=False):
    """Return the first PDF link on an HTML page that is a known parsed-publication URL.

    The page is downloaded, and every absolute http(s) URL ending in ``.pdf``
    found in its markup is checked against the module-level ``unmatched_url``
    collection; the first one present there is returned.

    Args:
        url: Page address. Only addresses whose last dot-separated part is
            ``html`` or ``htm`` (case-insensitive) are fetched. Anything else,
            including a missing value such as None or NaN, yields None.
        debug: When True, intermediate values are stored in the module-level
            ``errors`` dict under ``debug_*`` keys. Note that for an HTML
            address this first replaces ``errors`` with a fresh dict.

    Returns:
        The matching PDF URL, or None when the address is not an HTML page,
        the request fails, or no link on the page is in ``unmatched_url``.

    Request failures (``requests.RequestException``) are not raised: they are
    printed and recorded in ``errors`` under the page URL.
    """
    global errors

    # Missing cells arrive from pandas as NaN/None; calling .split on them
    # would raise AttributeError, so treat any non-string like a non-HTML URL.
    if not isinstance(url, str) or url.split('.')[-1].lower() not in ['html', 'htm']:
        if debug:
            errors['debug_pdf'] = f'{url=}'
        return None

    if debug:
        errors = dict()

    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        # Absolute http(s) URLs ending in .pdf anywhere in the parsed markup.
        pdf_links = re.findall(r'http[s]?://[^\'"<>]+\.pdf', str(soup), re.IGNORECASE)

        if debug:
            errors['debug_soup'] = soup
            errors['debug_pdf_links'] = pdf_links

        for link in pdf_links:
            if debug:
                errors['debug_link'] = link

            if link in unmatched_url:
                return link
            elif debug:
                print(f'{link=} not in unmatched_url')

    except requests.RequestException as e:
        print(f"Error fetching URL {url}: {e}")
        errors[url] = f"find_pdf_link Error: {e}"

    return None

In [59]:
# conditionally load combined_df from csv

output_file = 'updated_parsed_docs.csv'


def load_checkpoint(path):
    """Load a previously saved progress CSV, or return None when it is absent.

    Args:
        path: Location of the CSV file.

    Returns:
        The saved frame with every missing value represented as None, or
        None when ``path`` does not exist.
    """
    if not os.path.exists(path):
        return None
    checkpoint = pd.read_csv(path)
    # A CSV round-trip turns None into NaN. NaN is truthy, so code that tests
    # these cells with plain truthiness would mistake "missing" for "present";
    # convert missing values back to None.
    return checkpoint.astype(object).where(checkpoint.notna(), None)


# Resume from the saved progress when it exists: the result must replace
# combined_df itself, otherwise the loaded data is never used.
checkpoint_df = load_checkpoint(output_file)
if checkpoint_df is not None:
    combined_df = checkpoint_df
    print("File loaded successfully.")
    display(combined_df.head())
else:
    print(f"{output_file} does not exist.")

updated_parsed_docs.csv does not exist.


In [None]:
batch_size = 100   # save progress every `batch_size` rows
save_batch = True  # set to False to disable all CSV writes


def has_value(value):
    """Return True for a real, non-empty value; False for None, NaN or ''.

    Plain truthiness is not enough here because NaN is truthy.
    """
    return bool(pd.notna(value)) and bool(value)


# For every report that does not yet have both PDF links, fetch its French and
# English pages and record the first PDF link that is a known parsed-publication URL.
for index, row in combined_df.iterrows():
    # The checkpoint is written before row `index` is handled, i.e. after
    # `index` rows have been processed.
    if save_batch and index % batch_size == 0:
        combined_df.to_csv(output_file, index=False)
        print(f"Progress saved after {index} rows.")

    # Nothing to do when both languages already have a PDF, either directly
    # from the report URL (filename_*) or from an earlier crawl (file_url_*).
    have_filenames = has_value(row['filename_fr']) and has_value(row['filename_en'])
    have_file_urls = has_value(row['file_url_fr']) and has_value(row['file_url_en'])
    if have_filenames or have_file_urls:
        continue

    pdf_link_fr = None
    pdf_link_en = None

    # Only string URLs are crawled; a missing URL cell (None/NaN) is skipped.
    if pd.isna(row['filename_fr']) and isinstance(row['url_fr'], str):
        pdf_link_fr = find_pdf_link(row['url_fr'])
        if pdf_link_fr:
            combined_df.at[index, 'file_url_fr'] = pdf_link_fr
            if pdf_link_fr in unmatched_url:
                unmatched_url.discard(pdf_link_fr)
            else:
                errors[pdf_link_fr] = 'Link was added to combined_df but not in unmatched_url (fr)'

    if pd.isna(row['filename_en']):
        if row['url_en'] != row['url_fr']:
            if isinstance(row['url_en'], str):
                pdf_link_en = find_pdf_link(row['url_en'])
            if pdf_link_en:
                combined_df.at[index, 'file_url_en'] = pdf_link_en
                if pdf_link_en in unmatched_url:
                    unmatched_url.discard(pdf_link_en)
                else:
                    errors[pdf_link_en] = 'Link was added to combined_df but not in unmatched_url (en)'
        elif pdf_link_fr:
            # Same page for both languages: reuse the link found for French
            # instead of fetching the page a second time.
            combined_df.at[index, 'file_url_en'] = pdf_link_fr


# Final save so rows processed after the last checkpoint are not lost.
if save_batch:
    combined_df.to_csv(output_file, index=False)

# break into paragraph chunks
* first research which chunk style and format work best for translation fine-tuning

In [5]:
# TODO: break into paragraph chunks for better correlation 
#  TODO (OPTIONAL): clean excess characters
#  TODO (OPTIONAL): make sure french-friendly encoding is used (at least check if that makes a difference)
