In [4]:
import requests
from bs4 import BeautifulSoup, SoupStrainer
import os
from urllib.parse import urljoin

In [5]:
def get_all_links(url, visited, max_depth,base_url,depth = 1):
    if depth > max_depth or url in visited: # if we've seen this link already skip
        return []

    visited.add(url) #add to set

    response = requests.get(url)
    # look through the content
    soup = BeautifulSoup(response.text, 'html.parser')
    #look for hyperlinks
    links = soup.find_all('a')
    # print(f'Found {len(links)} links at depth {depth} from {url}')
    all_links = []
    for link in links:
        href = link.get('href')
        if href and not href.startswith('javascript:'):
            absolute_link = urljoin(url, href)

            # make sure it's on the rbi website only
            if absolute_link.startswith(base_url):
                all_links.append(absolute_link)

                # look on new link for more links
                all_links.extend(get_all_links(absolute_link,visited, max_depth=max_depth, base_url=base_url,depth =depth+1))
    return all_links

def get_pdf_links(url):
    
    response = requests.get(url)
    
    # look only for hrefs that end with .pdf
    parse_only = SoupStrainer('a', href=lambda x: x and x.lower().endswith('.pdf'))
    
    # look through only content in parse_only
    soup = BeautifulSoup(response.content, 'lxml', parse_only=parse_only)
    
    pdf_links = [a['href'] for a in soup.find_all('a', href=True)]

    return pdf_links
 

def download_pdf(url, counter,folder='pdfs'):
    # Create the folder if it doesn't exist
    if not os.path.exists(folder):
        os.makedirs(folder)
    response = requests.get(url)
    # Extract filename from the URL
    filename = url.split('/')[-1]
    filepath = os.path.join(folder, filename)
    counter+=1
    # Write the PDF content to a file
    with open(filepath, 'wb') as f:
        f.write(response.content)
    print(f'Downloaded: {filename}')


In [6]:
documentCounter= 0
visited = set()
start = 'https://rbi.org.in/home.aspx'
base = 'https://rbi.org.in'
deepth= 2
all_links = get_all_links(url = start,visited=visited,max_depth=deepth,base_url=base)
print(len(visited))
print(f'Number of links found: {len(visited)}')
all_pdf_links = set()
#go through the links for pdf links
for link in visited:
    pdf_links = get_pdf_links(link)
    for pdf_link in pdf_links:
        if not pdf_link.startswith('http'):# to make sure its an absolute link
            pdf_link = urljoin(link, pdf_link)
        if pdf_link in all_pdf_links: # if we've already come across it, skip
            continue
        all_pdf_links.add(pdf_link)
        #download_pdf(pdf_link,documentCounter)



182
Number of links found: 182


In [7]:
print(f'Number of pdfs found:{len(all_pdf_links)}')

# to write all pdf links to a text file
output_file_path = f'pdf_links_depth_{deepth}.txt' # file name

with open(output_file_path, 'w') as file:
    for pdf_link in all_pdf_links:
        file.write(pdf_link + '\n')

print(f'PDF links have been written to: {output_file_path}')


Number of pdfs found:581
PDF links have been written to: pdf_links_depth_2.txt
