In [3]:
import os
import random
import re
from pathlib import Path
from urllib.parse import urldefrag, urljoin

import requests
from bs4 import BeautifulSoup

def get_base_url(url):
    """Return the leading part of ``url`` up to (not including) its path or query.

    The pattern takes the shortest prefix whose last character is neither
    ``/`` nor ``:`` and which is immediately followed by ``/``, ``?`` or the
    end of the string. For an absolute URL this yields the protocol,
    subdomain and domain (plus the port, when one is present).

    Raises:
        IndexError: if the pattern does not match ``url`` (e.g. an empty string).
    """
    base_pattern = re.compile(r'^.+?[^\/:](?=[?\/]|$)')
    candidates = base_pattern.findall(url)
    # The pattern is anchored at the start, so there is at most one candidate
    return candidates[0]

def get_pdf_urls(url, timeout=30):
    """Return the absolute URLs of all PDF files linked from the page at ``url``.

    Args:
        url: Address of the HTML page to scan.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A set of absolute URLs whose href ends with ``.pdf`` (case-insensitive).
        The set is empty when the page cannot be fetched or does not answer
        with HTTP 200.
    """
    # Stop processing if the request wasn't successful
    try:
        r = requests.get(url, timeout=timeout)
    except requests.RequestException:
        return set()

    if r.status_code != 200:
        return set()

    # Extract all a-tags that carry an href from the website HTML.
    # The parser is named explicitly so results don't depend on what is installed.
    bs = BeautifulSoup(r.content, 'html.parser')
    a_tags = bs.find_all('a', href=True)

    # Keep the hrefs that point to a PDF and resolve them against the page URL,
    # so relative ("docs/a.pdf"), root-relative ("/docs/a.pdf") and
    # protocol-relative ("//host/a.pdf") links all become proper absolute URLs
    pdf_links = set()
    for tag in a_tags:
        href = tag['href'].strip()
        if not href.lower().endswith('.pdf'):
            continue
        try:
            pdf_links.add(urljoin(url, href))
        except ValueError:
            # Malformed href that urllib cannot parse; skip it and keep going
            continue

    return pdf_links

def download_pdfs(urls, save_path, print_details=False, timeout=30):
    """Download every URL in ``urls`` into the directory ``save_path``.

    Each file is stored under its original name with a random numeric suffix
    inserted before the extension, so files that share a name do not overwrite
    each other. Failed downloads are reported and skipped; they never abort
    the loop.

    Args:
        urls: Sized iterable of PDF URLs.
        save_path: Target directory; created (with parents) if missing.
        print_details: When True, print a progress line per file.
        timeout: Seconds to wait for the server before giving up.
    """
    # Create/Save path where to store all PDFs
    path = Path(save_path)
    path.mkdir(parents=True, exist_ok=True)

    total = len(urls)

    # Loop through the set of PDF URLs, get its content and save the files
    for i, pdf_url in enumerate(urls):
        stem, extension = os.path.splitext(os.path.basename(pdf_url))
        rand_num = str(random.randrange(100000,199999))
        new_filename = stem + '_' + rand_num + extension
        target = path.joinpath(new_filename)
        if print_details:
            print(f"Downloading ({i+1}/{total}) PDF... ", end="")
        try:
            # Stream the body to disk in chunks; the context manager releases
            # the connection even when the download is skipped or fails
            with requests.get(pdf_url, stream=True, timeout=timeout) as r:
                if r.status_code != 200:
                    print(f"Failed (HTTP {r.status_code})... " + pdf_url)
                    continue
                with open(target, 'wb') as f:
                    for chunk in r.iter_content(chunk_size=64 * 1024):
                        f.write(chunk)
        except requests.RequestException:
            # Don't leave a truncated file behind when the transfer broke midway
            if target.exists():
                target.unlink()
            print("Error... " + pdf_url)
            continue
        if print_details:
            print("Successful... " + new_filename)
            
def get_urls(url, timeout=30):
    """Return the same-site, non-PDF links found on the page at ``url``.

    Args:
        url: Address of the HTML page to scan.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A set of absolute URLs (without ``#fragment``) that start with the
        base URL of ``url`` and do not end with ``.pdf`` (case-insensitive).
        The set is empty when the page cannot be fetched or does not answer
        with HTTP 200.
    """
    # Extract the base url: protocol, subdomain, and domain
    base_url = get_base_url(url)

    # Stop processing if the request wasn't successful
    try:
        r = requests.get(url, timeout=timeout)
    except requests.RequestException:
        return set()
    if r.status_code != 200:
        return set()

    # Extract all a-tags that carry an href from the website HTML
    bs = BeautifulSoup(r.content, 'html.parser')
    a_tags = bs.find_all('a', href=True)

    # Resolve every href against the page URL and drop the fragment, so that
    # "page.html#a" and "page.html#b" are not treated as two different pages
    links = set()
    for tag in a_tags:
        try:
            absolute = urljoin(url, tag['href'].strip())
            links.add(urldefrag(absolute).url)
        except ValueError:
            # Malformed href that urllib cannot parse; skip it and keep going
            continue

    # Return only URLs that don't end with .pdf and that start with the base URL
    return {
        link for link in links
        if link.startswith(base_url) and not link.lower().endswith('.pdf')
    }

def extract_pdf_urls_from_url_recursive(url, save_path, remaining_levels, original_levels, print_details=False,
                                        _visited=None, _downloaded=None):
    """Download the PDFs linked from ``url`` and from pages reachable from it.

    Args:
        url: Page to scan.
        save_path: Directory the PDFs are saved into.
        remaining_levels: How many more link-levels below ``url`` to follow.
        original_levels: Depth requested for the whole crawl; used to tell the
            main source apart and to compute the depth shown in progress lines.
        print_details: When True, print progress information.
        _visited: Internal crawl state (page URL -> deepest ``remaining_levels``
            it was crawled with). Leave as None; the outermost call creates it.
        _downloaded: Internal crawl state (PDF URLs already handed to
            ``download_pdfs``). Leave as None; the outermost call creates it.

    Returns:
        The set of PDF URLs found on ``url`` and on the pages crawled below it
        during this call.
    """
    # The outermost call creates the state that the whole recursion shares
    if _visited is None:
        _visited = {}
    if _downloaded is None:
        _downloaded = set()
    _visited[url] = max(remaining_levels, _visited.get(url, remaining_levels))

    # Get all PDF URLs
    pdf_urls = get_pdf_urls(url)

    # Print PDFs found in the main source
    if remaining_levels == original_levels and print_details:
        print( f"Depth Level 0 (Main Source) -> {len(pdf_urls)} PDFs found -> Source: {url}")

    # Download only the PDFs no other page has already triggered. File names get
    # a random suffix, so a repeated download would pile up as a duplicate file.
    new_pdf_urls = set(pdf_urls) - _downloaded
    _downloaded.update(new_pdf_urls)
    download_pdfs(new_pdf_urls, save_path, print_details)

    # If there's no remaining levels to dive, just return the PDF URLs found
    # (<= also stops a negative depth from recursing without a limit)
    if remaining_levels <= 0:
        return pdf_urls

    # Loop through the other URLs extracting PDF URLs from each one of them
    remaining_levels -= 1
    depth_level = original_levels - remaining_levels
    other_urls = get_urls(url)
    all_pdf_urls = set()
    for i, url_inside in enumerate(other_urls):
        # Skip pages already crawled at least this deep: their PDFs have been
        # collected by that earlier (or still running) call. A page seen before
        # with fewer levels left is crawled again so its sub-pages aren't lost.
        if _visited.get(url_inside, -1) < remaining_levels:
            pdfs_inside = extract_pdf_urls_from_url_recursive(
                url_inside, save_path, remaining_levels, original_levels, print_details,
                _visited, _downloaded)
            all_pdf_urls.update(pdfs_inside)
        if not print_details: continue
        print("..."*(depth_level-1) + f"Depth Level {depth_level} -> {i+1}/{len(other_urls)} URLs -> {len(all_pdf_urls)} PDFs found until now -> Source: {url_inside}")

    # Return the set of all PDF URLs found
    all_pdf_urls.update(pdf_urls)
    return all_pdf_urls

def extract_pdf_urls_from_url(url, levels, save_path, print_details=False):
    """Entry point of the crawl: start the recursive extraction at ``url``.

    ``levels`` is passed to the recursive helper both as the remaining and as
    the original depth, which marks ``url`` as the main source. The helper's
    return value is handed back unchanged.
    """
    crawl_options = {
        'remaining_levels': levels,
        'original_levels': levels,
        'print_details': print_details,
    }
    return extract_pdf_urls_from_url_recursive(url, save_path, **crawl_options)

In [None]:
# Crawl the CIBC quarterly-results page with a depth of 2 levels, saving the
# PDFs under data/ap3 and printing progress while it runs.
cibc_link = 'https://www.cibc.com/en/about-cibc/investor-relations/quarterly-results.html'
extract_pdf_urls_from_url(cibc_link, 2, 'data/ap3', print_details=True)

Downloading (1/638) PDF... Successful... q419factsheet-en_115876.pdf
Downloading (2/638) PDF... Successful... q423newsrelease-en_188362.pdf
Downloading (3/638) PDF... Successful... q220disclosure-en_146851.pdf
Downloading (4/638) PDF... Successful... q223financials-en_151615.pdf
Downloading (5/638) PDF... Successful... q319presentation-en_142034.pdf
Downloading (6/638) PDF... Successful... q318financials-en_174409.pdf
Downloading (7/638) PDF... Successful... q207dividend_107924.pdf
Downloading (8/638) PDF... Successful... q113financials_171239.pdf
Downloading (9/638) PDF... Error... https://www.cibc.com/content/dam/about_cibc/investor_relations/pdfs/quarterly_results/2005/q105newsrelease.pdf
Downloading (10/638) PDF... Successful... q115hlts_196223.pdf
Downloading (11/638) PDF... Successful... q110financials_116349.pdf
Downloading (12/638) PDF... Successful... q412faq-en_192150.pdf
Downloading (13/638) PDF... 

In [None]:
def download_pdfs_from_source_txt(source_path, save_path, levels, print_details):
    """Run the PDF crawl for every URL listed in a text file.

    Args:
        source_path: Text file with one source URL per line. Surrounding
            whitespace is stripped and blank lines are ignored.
        save_path: Directory the PDFs are saved into.
        levels: Crawl depth passed on for each source.
        print_details: When True, print progress information.
    """
    proper_source_path = Path(source_path)
    proper_save_path = Path(save_path)

    # Read the whole list first so the file is closed before the (long) crawl
    # starts, and is closed even if the crawl raises
    with open(proper_source_path) as source_txt:
        stripped_lines = [line.strip() for line in source_txt]

    # A blank line would otherwise be crawled as an empty URL and crash
    links = [link for link in stripped_lines if link]

    for i, link in enumerate(links):
        if print_details:
            print(f"Extracting Main Source #{i+1}: {link}")
        extract_pdf_urls_from_url(link, levels, proper_save_path, print_details=print_details)
    
# Crawl every source listed in sources.txt one level deep, saving PDFs under data/ap
download_pdfs_from_source_txt('sources.txt', 'data/ap', levels=1, print_details=True)


Extracting Main Source #1: https://www.cibc.com/en/about-cibc/investor-relations/quarterly-results.html
Depth Level 0 (Main Source) -> 638 PDFs found -> Source: https://www.cibc.com/en/about-cibc/investor-relations/quarterly-results.html
Downloading (1/638) PDF... Successful... q209factsheet_161419.pdf
Downloading (2/638) PDF... Successful... q306presentation_178580.pdf
Downloading (3/638) PDF... Successful... q319financials-en_192069.pdf
Downloading (4/638) PDF... Successful... q118hlts-en_195208.pdf
Downloading (5/638) PDF... Successful... q308factsheet_140870.pdf
Downloading (6/638) PDF... Successful... q220dividend-en_163944.pdf
Downloading (7/638) PDF... Successful... q212report_160382.pdf
Downloading (8/638) PDF... Successful... q413financials_139376.pdf
Downloading (9/638) PDF... Successful... q110faq_115343.pdf
Downloading (10/638) PDF... Successful... q120hlts-en_169473.pdf
Downloading (11/638) PDF... Successful... q416presentation-en_143743.pdf
Downloading (12/638) PDF... Suc