In [88]:
import re
import bs4
import requests
import datetime
import pathlib

def download_pdfs(url_domain, url_path, url_sufix, save_path, year):
    # Set the URL, which changes depending on the year
    current_year = datetime.datetime.now().year
    url = url_domain + url_path
    if year != current_year:
        url += url_sufix + str(year)
        
    # Get the request from the bank URL
    website_request = requests.get(url)
    assert website_request.status_code == 200
    soup = bs4.BeautifulSoup(website_request.content)
    
    # Filter the HTML tags into only PDF links
    a_tags = soup.findAll('a')
    pdf_tags = list(filter(lambda a_tag : a_tag.text == 'PDF' , a_tags))
    
    # Create the PDF saving path if it doesn't exist
    save_path += f'/{year}'
    pathlib.Path(save_path).mkdir(parents=True, exist_ok=True)
    print(f'Starting to download {len(pdf_tags)} PDFs from {year} ({url_domain})...')
    
    # Loop through the PDFs and save them into the save path
    for i in range(len(pdf_tags)):
        
        # Set the pdf url extracted from the href attribute from a tags
        url_pdf = pdf_tags[i]['href']
        if not url_pdf.startswith('http'):
            url_pdf = url_domain + url_pdf
        
        # Request the PDF file and save it
        pdf_request = requests.get(url_pdf)
        assert pdf_request.status_code == 200
        pdf_file = open(save_path + f'/TD{year}_{i+1}.pdf', 'wb')
        pdf_file.write(pdf_request.content)
        pdf_file.close()
        
    print(f'Download successful: {len(pdf_tags)} PDFs from {year} ({url_domain})')

In [89]:
DOMAIN = "https://www.td.com"
URL = "/ca/en/about-td/for-investors/investor-relations/financial-information/financial-reports/quarterly-results"
URL_SUFIX = "/quarterly-results-"
SAVE_PATH = 'TDBank/'

download_pdfs(DOMAIN, URL, URL_SUFIX, SAVE_PATH, 1999)

Starting to download 7 PDFs from 1999 (https://www.td.com)...
Download successful: 7 PDFs from 1999 (https://www.td.com)


In [92]:
import os
url = "https://www.td.com/ca/en/about-td/for-investors/investor-relations/financial-information/financial-reports/quarterly-results"

folder_location = 'data/pdf_files'
if not os.path.exists(folder_location):
    os.mkdir(folder_location)

response = requests.get(url)
soup = BeautifulSoup(response.text, "html.parser")     
for link in soup.select("a[href$='.pdf']"):
    filename = os.path.join(folder_location, link['href'].split('/')[-1])
    with open(filename, 'wb') as f:
        f.write(requests.get(urljoin(url, link['href'])).content)

FileNotFoundError: [WinError 3] The system cannot find the path specified: 'data/pdf_files'

In [1]:
from bs4 import BeautifulSoup
import requests

#link to all the documents for ScotiaBank 
link = 'https://www.scotiabank.com/ca/en/about/investors-shareholders/annual-reports.html'
data_path = '../data/Scotia/'

In [2]:
def get_pdf_url(url):
    # data_folder = os.path.join(data_folder,year)
    # if not os.path.exists(data_folder):
    #     os.makedirs(data_folder)
    base_url = re.findall(r'^.+?[^\/:](?=[?\/]|$)',url)[0]
    r = requests.get(url)
    if r.status_code == 200:
        soup = BeautifulSoup(r.content)
        links =  soup.findAll('a')
        links = [link.attrs['href'] for link in links if 'href' in link.attrs.keys() and link.attrs['href'].endswith('.pdf')]
        links = [link if link.startswith('http') else base_url+link for link in links]
        return set(links)
                

In [3]:
url = 'https://www.cibc.com/en/about-cibc/investor-relations/quarterly-results.html'
len(get_pdf_url(url))

638

In [4]:
def get_all_pdfs(url):
    # data_folder = os.path.join(data_folder,year)
    # if not os.path.exists(data_folder):
    #     os.makedirs(data_folder)
    base_url = re.findall(r'^.+?[^\/:](?=[?\/]|$)',url)[0]
    r = requests.get(url)
    if r.status_code == 200:
        soup = BeautifulSoup(r.content)
        links =  soup.findAll('a')
        pdfs = set()
        others = set()
        for link in links:
            if 'href' in link.attrs.keys():
                link = link.attrs['href']
                if not link.startswith('http'):
                    link = base_url + link
                if link.endswith('.pdf'):
                    print(link)
                    pdfs.add(link)
                else:
                    others.add(link)
        for other in others:
            try:
                out_pdf = get_pdf_url(other) if other.startswith(base_url) else 0
                
                if out_pdf:
                    pdfs.update(out_pdf)
            except TimeoutError:
                    print(link)
    else:
            pass
    print(len(pdfs))
    return pdfs
        
a = get_all_pdfs(url)
a

https://www.cibc.com/content/dam/cibc-public-assets/about-cibc/investor-relations/pdfs/quarterly-results/2024/q324report-en.pdf
https://www.cibc.com/content/dam/cibc-public-assets/about-cibc/investor-relations/pdfs/quarterly-results/2024/q324newsrelease-en.pdf
https://www.cibc.com/content/dam/cibc-public-assets/about-cibc/investor-relations/pdfs/quarterly-results/2024/q324dividend-en.pdf
https://www.cibc.com/content/dam/cibc-public-assets/about-cibc/investor-relations/pdfs/quarterly-results/2024/q324financials-en.pdf
https://www.cibc.com/content/dam/cibc-public-assets/about-cibc/investor-relations/pdfs/quarterly-results/2024/q324disclosure-en.pdf
https://www.cibc.com/content/dam/cibc-public-assets/about-cibc/investor-relations/pdfs/quarterly-results/2024/q324presentation-en.pdf
https://www.cibc.com/content/dam/cibc-public-assets/about-cibc/investor-relations/pdfs/quarterly-results/2024/q324factsheet-en.pdf
https://www.cibc.com/content/dam/cibc-public-assets/about-cibc/investor-relation

KeyboardInterrupt: 