## Extracting all PDF URLs (1 level deep) from a single URL passed as parameter

In [59]:
from bs4 import BeautifulSoup
import requests
import re

def get_base_url(url):
    # Extract the base url: protocol, subdomain, and domain
    base_re = r'^.+?[^\/:](?=[?\/]|$)'
    return re.findall(base_re, url)[0]

def get_pdf_urls(url):
    # Extract the base url: protocol, subdomain, and domain
    base_url = get_base_url(url)
    
    # Stop processing if the request wasn't successful
    r = requests.get(url)
    if r.status_code != 200:
        return set()
    
    # Extract all a-tags from the website HTML
    bs = BeautifulSoup(r.content)
    a_tags =  bs.findAll('a')
    
    # Loop through the a-tags saving all href attributes that end with .pdf
    hrefs = []
    for tag in a_tags:
        if 'href' in tag.attrs.keys() and tag.attrs['href'].endswith('.pdf'):
            hrefs.append(tag.attrs['href'])
    
    # Convert all hrefs to URLs by adding the base URL if required
    pdf_links = [href if href.startswith('http') else base_url + href for href in hrefs]
    return set(pdf_links)

def get_urls(url):
    # Extract the base url: protocol, subdomain, and domain
    base_url = get_base_url(url)
    
    # Stop processing if the request wasn't successful
    r = requests.get(url)
    if r.status_code != 200:
        return
    
    # Extract all a-tags from the website HTML
    bs = BeautifulSoup(r.content)
    a_tags =  bs.findAll('a')
    
    # Extract content of href attribute from the a-tags and convert them into proper URLs using the base url
    hrefs = {tag.attrs['href'] for tag in a_tags if 'href' in tag.attrs.keys()}
    hrefs = {href if href.startswith('http') else base_url + href for href in hrefs}
    
    # Return only URLs that don't end with .pdf and that start with the base URL
    return {href for href in hrefs if href.startswith(base_url) and not href.endswith('.pdf')}

def extract_pdf_urls_from_url(url, levels = 1):
    # Get all pdf URLs and other URLs
    pdf_urls = get_pdf_urls(url)
    other_urls = get_urls(url)
    
    # Loop through the other URLs extracting PDF URLs from each one of them
    for i, url in enumerate(other_urls):
        print(f"{i+1}/{len(other_urls)} (#PDFs until now: {len(pdf_urls)}): {url}")
        pdfs_inside_url = get_pdf_urls(url)
        pdf_urls.update(pdfs_inside_url)
    
    # Return the list of all PDF URLs
    return pdf_urls


url = 'https://www.cibc.com/en/about-cibc/investor-relations/quarterly-results.html'
extract_pdf_urls_from_url(url)


1/184 (#PDFs until now: 638): https://www.cibc.com/content/dam/about_cibc/investor_relations/pdfs/quarterly_results/2014/q314disclosureexcel.xls
2/184 (#PDFs until now: 638): https://www.cibc.com/en/privacy-security/cookie-policy.html
3/184 (#PDFs until now: 638): https://www.cibc.com/content/dam/about_cibc/investor_relations/pdfs/quarterly_results/2015/sfi-january15.xls
4/184 (#PDFs until now: 638): https://www.cibc.com/content/dam/about_cibc/investor_relations/pdfs/quarterly_results/2018/sfi-july18-en.xls
5/184 (#PDFs until now: 638): https://www.cibc.com/content/dam/cibc-public-assets/about-cibc/investor-relations/pdfs/quarterly-results/2023/sfi-january23-en.xlsx
6/184 (#PDFs until now: 638): https://www.cibc.com/content/dam/about_cibc/investor_relations/pdfs/quarterly_results/2018/q118disclosureexcel-en.xlsx
7/184 (#PDFs until now: 638): https://www.cibc.com/en/business/cash-management.html
8/184 (#PDFs until now: 639): https://www.cibc.com/en/about-cibc/careers/banking-centre-care

KeyboardInterrupt: 