Modular script for downloading a list of large PDFs and then searching for keywords. Hits are returned with document and page number, as well as a text snippet for context.

In [None]:
# Install necessary packages
%pip install requests
%pip install tqdm
%pip install PyMuPDF
%pip install ipywidgets


from IPython.display import display
import ipywidgets as widgets
import fitz  # PyMuPDF
import os
import requests
from tqdm import tqdm

In [None]:

# URLs of the PDFs to download.
# NOTE(review): the entries below look like placeholders - replace them with
# real locations before running. download_pdf() stores each file in the
# current working directory under the last path segment of its URL (with any
# "?query" part removed).
pdf_urls = [
    "https://www._1.pdf",
    "https://www._2.pdf",
    "https://www._n.pdf"
]

def download_pdf(url, timeout=30, chunk_size=1024 * 1024):
    """
    Download one PDF into the current working directory.

    The body is streamed to disk in chunks so that large PDFs are never held
    in memory as a whole. Data is first written to a temporary ``.part`` file
    and only renamed to the final name once the transfer has completed, so an
    interrupted download does not leave a truncated PDF behind.

    Args:
        url (str): Address of the PDF. The local file name is the last path
            segment of the URL with any query string removed.
        timeout (float): Seconds to wait for the server to connect/send data
            before giving up.
        chunk_size (int): Number of bytes written to disk per chunk.

    Returns:
        str | None: The local file name on success, or None when the URL has
        no usable file name, the server does not answer with HTTP 200, or a
        network error occurs.
    """
    file_path = url.split("/")[-1].split("?")[0]  # Handle URL parameters
    if not file_path:
        # e.g. a URL ending in "/" - there is nothing sensible to name the file.
        print(f"Failed to download {url}: no file name in URL")
        return None

    partial_path = file_path + ".part"
    try:
        # stream=True defers fetching the body until iter_content() is consumed.
        with requests.get(url, stream=True, timeout=timeout) as response:
            if response.status_code != 200:
                print(f"Failed to download {url}")
                return None
            with open(partial_path, 'wb') as f:
                for chunk in response.iter_content(chunk_size=chunk_size):
                    f.write(chunk)
    except requests.RequestException as e:
        # Report and carry on so that one bad URL does not abort a whole batch.
        print(f"Failed to download {url}: {e}")
        if os.path.exists(partial_path):
            os.remove(partial_path)
        return None

    # Atomically move the finished download into place.
    os.replace(partial_path, file_path)
    return file_path

def main():
    """Download every URL listed in ``pdf_urls`` behind a tqdm progress bar."""
    progress = tqdm(pdf_urls)
    for url in progress:
        download_pdf(url)

main()


In [None]:
import os
import fitz  # PyMuPDF
from IPython.display import display
import ipywidgets as widgets

# Local file names that search_and_display_results() scans, relative to the
# current working directory. Files that do not exist are reported and skipped.
# NOTE(review): these names must match what the download step wrote to disk.
# With the placeholder URLs in this notebook, download_pdf() would produce
# names such as "www._1.pdf", not "1.pdf" - confirm once real URLs are used.
PDF_FILENAMES = [
    "1.pdf",
    "2.pdf",
    "n.pdf"
]

def _find_snippets(text, search_word, context_chars=100):
    """Return a context snippet for every case-insensitive hit of search_word in text."""
    if not search_word:
        # An empty needle matches at every position without ever advancing.
        return []

    haystack = text.lower()  # lowered once, not once per hit
    needle = search_word.lower()
    step = len(search_word)
    # NOTE: offsets found in the lowered text are applied to the original
    # text; for the few characters whose lowercase form has a different
    # length the snippet window can be shifted slightly.
    snippets = []
    start = haystack.find(needle)
    while start != -1:
        # Slicing clamps the upper bound, so only the lower bound needs max().
        snippets.append(text[max(start - context_chars, 0):start + step + context_chars])
        # Continue after this hit, so matches never overlap.
        start = haystack.find(needle, start + step)
    return snippets


def search_pdf(file_path, search_word, context_chars=100):
    """
    Search for a specific word in a PDF file and return the results.

    The search is case-insensitive and reports non-overlapping occurrences.
    Any error raised while opening or reading the file is printed and the
    hits collected up to that point are returned (best-effort behaviour).

    Args:
        file_path (str): Path to the PDF file.
        search_word (str): Word to search for in the PDF. An empty string
            yields no results.
        context_chars (int): Number of characters of surrounding text kept on
            each side of a hit in the snippet.

    Returns:
        list: A list of tuples containing file path, page number (1-based),
        and text snippet.
    """
    results = []
    if not search_word:
        # Nothing to look for; avoids opening the file at all.
        return results

    try:
        with fitz.open(file_path) as doc:
            for page_number, page in enumerate(doc, start=1):
                text = page.get_text("text")
                results.extend(
                    (file_path, page_number, snippet)
                    for snippet in _find_snippets(text, search_word, context_chars)
                )
    except Exception as e:
        # Deliberately broad: an unreadable PDF must not stop the search of
        # the remaining documents.
        print(f"Error reading {file_path}: {e}")
    return results

def search_and_display_results(search_word):
    """
    Run the keyword search over every file in PDF_FILENAMES and print the hits.

    For each existing file the matches are printed with file name, page
    number and snippet; missing files and files without a match are
    reported individually. A final message is printed when no file
    produced a hit.

    Args:
        search_word (str): Term to look for. A blank or whitespace-only
            value only prints a prompt and returns.
    """
    if search_word.strip() == "":
        print("Please enter a search term.")
        return

    hit_somewhere = False
    for pdf_file in PDF_FILENAMES:
        # Guard clause: skip files that were never downloaded.
        if not os.path.exists(pdf_file):
            print(f"File {pdf_file} does not exist.")
            continue

        print(f"Searching in {pdf_file}...")
        matches = search_pdf(pdf_file, search_word)
        if not matches:
            print(f"'{search_word}' not found in {pdf_file}.")
            continue

        hit_somewhere = True
        for source, page_number, snippet in matches:
            print(f"Found '{search_word}' in {source} on page {page_number}:\n{snippet}\n")

    if hit_somewhere:
        return
    print(f"'{search_word}' not found in any document.")

def on_search_button_clicked(_):
    """
    Handle a click on the search button.

    Reads the current content of the text box, strips surrounding
    whitespace and hands it to the search routine. The button instance
    passed in by the click event is not used.
    """
    search_and_display_results(search_word_input.value.strip())

# Build the small search UI: a text box for the term and a button to start
# the search. on_search_button_clicked() reads search_word_input.value, so
# the text box must exist under this name when the button is clicked.
search_word_input = widgets.Text(
    value='',
    placeholder='Type something',
    description='Search word:',
    disabled=False
)

search_button = widgets.Button(
    description='Search',
    tooltip='Click to search',
    icon='search'
)

# Register the callback; it is invoked with the button instance on each click.
search_button.on_click(on_search_button_clicked)

# Render both widgets in the notebook output so the user can interact with them.
display(search_word_input, search_button)
