In [None]:
import os

def setup_download_folder(folder_name="mlk_pdfs"):
    """
    Ensure the download folder exists and is a directory.

    Progress and errors are reported with ``print``; nothing is raised.

    Args:
        folder_name (str): Path of the folder to create or validate.

    Returns:
        bool: True when ``folder_name`` is a directory once the call
        finishes, False when it could not be created or the path is
        occupied by something that is not a directory.
    """
    if os.path.isdir(folder_name):
        print(f"INFO: Folder '{folder_name}' already exists.")
    elif os.path.exists(folder_name):
        # The path is taken by a file (or similar). Report only the real
        # problem instead of first claiming that the folder already exists.
        print(f"ERROR: '{folder_name}' exists but is not a directory.")
        return False
    else:
        try:
            # exist_ok=True tolerates the folder being created by someone
            # else between the checks above and this call.
            os.makedirs(folder_name, exist_ok=True)
            print(f"SUCCESS: Created download folder: '{folder_name}'")
        except OSError as e:
            print(f"ERROR: Could not create folder '{folder_name}': {e}")
            return False

    # Verify the final state rather than trusting the steps above.
    if os.path.isdir(folder_name):
        print(f"SUCCESS: '{folder_name}' is a valid directory.")
        return True

    print(f"ERROR: '{folder_name}' exists but is not a directory.")
    return False

if __name__ == "__main__":
    # Folder that will receive the downloaded PDFs.
    pdf_download_folder = "mlk_pdfs"

    # Create/validate the folder, then report the outcome in one message.
    folder_ready = setup_download_folder(pdf_download_folder)
    print(
        "\nFolder setup complete. You can proceed to the next part."
        if folder_ready
        else "\nFolder setup failed. Please resolve the errors before proceeding."
    )

In [None]:
import requests
import os # Keep os imported for potential future use or context

def fetch_page_content(url):
    """
    Download a web page and hand back its HTML.

    Args:
        url (str): Address of the page to request.

    Returns:
        str | None: The response body as text, or None when the request
        fails (HTTP error status, connection problem, timeout, or any other
        ``requests`` error). Failures are printed, never raised.
    """
    print(f"INFO: Attempting to fetch content from: {url}")
    try:
        # The timeout keeps a stalled server from hanging the call.
        reply = requests.get(url, timeout=10)
        # Turn 4xx/5xx statuses into an HTTPError handled below.
        reply.raise_for_status()
        print(f"SUCCESS: Successfully fetched content from {url}")
        # Print only a short preview to confirm the page is not empty.
        print(f"DEBUG: First 200 characters of page content: {reply.text[:200]}...")
        return reply.text
    except requests.exceptions.RequestException as failure:
        # Classify the failure, most specific categories first.
        if isinstance(failure, requests.exceptions.HTTPError):
            print(f"ERROR: HTTP Error occurred: {failure}")
        elif isinstance(failure, requests.exceptions.ConnectionError):
            print(f"ERROR: Connection Error occurred: {failure}")
        elif isinstance(failure, requests.exceptions.Timeout):
            print(f"ERROR: Timeout Error occurred: {failure}")
        else:
            print(f"ERROR: An unexpected Request Error occurred: {failure}")
    return None

if __name__ == "__main__":
    # Landing page of the MLK assassination records.
    mlk_records_url = "https://www.archives.gov/research/mlk"

    # Download the page; a falsy result means the fetch failed (or was empty).
    page_html_content = fetch_page_content(mlk_records_url)

    if not page_html_content:
        print("\nFailed to retrieve web page content. Please review the error messages.")
    else:
        print("\nWeb page content successfully retrieved. You can proceed to the next part.")

In [None]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse # urlparse is useful for domain checking
import os # Just in case for path context, though not directly used in this chunk

def fetch_page_content(url):
    """
    Retrieve the HTML behind ``url``.

    This is a compact copy of the fetch helper, kept in this cell so the
    cell can run on its own.

    Args:
        url (str): Address of the page to request.

    Returns:
        str | None: The page text, or None when ``requests`` reports any
        error (the error is printed, not raised).
    """
    print(f"INFO: Attempting to fetch content from: {url}")
    try:
        reply = requests.get(url, timeout=10)
        # 4xx/5xx statuses become exceptions and fall into the handler.
        reply.raise_for_status()
    except requests.exceptions.RequestException as failure:
        print(f"ERROR: Failed to fetch content from {url}: {failure}")
        return None
    else:
        print(f"SUCCESS: Successfully fetched content from {url}")
        return reply.text

def extract_pdf_links(html_content, base_url, limit=100):
    """
    Collect unique, same-site PDF links from a page's HTML.

    Every ``<a href=...>`` is resolved against ``base_url``. A link is kept
    when the *path* of its URL ends in ``.pdf`` (case-insensitive) and its
    network location equals that of ``base_url`` or is a sub-domain of it.
    Query strings are preserved; ``#fragment`` suffixes are dropped so the
    same file is not collected twice.

    Args:
        html_content (str): Raw HTML of the page. A falsy value yields an
            empty result.
        base_url (str): URL the page came from; used to resolve relative
            links and as the reference domain.
        limit (int | None): Maximum number of links to collect. Zero or a
            negative number collects nothing; None means no limit.

    Returns:
        set[str]: Absolute URLs of the PDF links that were found.
    """
    if not html_content:
        print("ERROR: No HTML content provided to extract links from.")
        return set()

    if limit is not None and limit <= 0:
        # Nothing may be collected, so skip parsing altogether.
        print(f"INFO: Link limit is {limit}; no PDF links collected.")
        return set()

    print("INFO: Starting to parse HTML for PDF links...")
    soup = BeautifulSoup(html_content, 'html.parser')
    pdf_links = set()  # A set keeps the URLs unique

    base_domain = urlparse(base_url).netloc

    for a_tag in soup.find_all('a', href=True):
        # A fragment never changes which file is served, so drop it.
        full_url = urljoin(base_url, a_tag['href']).partition('#')[0]
        parsed_full_url = urlparse(full_url)

        # Test the path only: "file.pdf?download=1" is a PDF, while a query
        # string that merely ends in ".pdf" is not.
        if not parsed_full_url.path.lower().endswith('.pdf'):
            continue

        # Keep links from the same domain or one of its sub-domains so PDFs
        # hosted on external sites are not downloaded.
        link_domain = parsed_full_url.netloc
        if link_domain != base_domain and not link_domain.endswith(f".{base_domain}"):
            continue

        pdf_links.add(full_url)
        if limit is not None and len(pdf_links) >= limit:
            print(f"INFO: Reached limit of {limit} unique PDF links during parsing. Stopping link collection.")
            break

    print(f"SUCCESS: Found {len(pdf_links)} unique PDF links.")
    return pdf_links

if __name__ == "__main__":
    mlk_records_url = "https://www.archives.gov/research/mlk"

    # Step 1: download the page.
    page_html = fetch_page_content(mlk_records_url)

    if not page_html:
        print("\nCannot extract links without page content. Please review previous errors.")
    else:
        # Step 2: pull the PDF links out of the HTML.
        found_pdf_urls = extract_pdf_links(page_html, mlk_records_url, limit=100)

        print("\n--- Summary of Found PDF Links (First 5) ---")
        if not found_pdf_urls:
            print("No PDF links found on the page.")
        else:
            # Show a numbered preview of at most five links.
            for position, link in enumerate(list(found_pdf_urls)[:5], start=1):
                print(f"{position}. {link}")
            remaining = len(found_pdf_urls) - 5
            if remaining > 0:
                print(f"...and {remaining} more.")

        print("\nPDF link extraction complete. You can proceed to the next part (downloading).")

In [None]:
import requests
from bs4 import BeautifulSoup
import os
from urllib.parse import urljoin, urlparse

# --- Function from Part 1: Folder Setup ---
def setup_download_folder(folder_name="mlk_pdfs"):
    """
    Ensure the download folder exists and is a directory.

    Progress and errors are reported with ``print``; nothing is raised.

    Args:
        folder_name (str): Path of the folder to create or validate.

    Returns:
        bool: True when ``folder_name`` is a directory once the call
        finishes, False when it could not be created or the path is
        occupied by something that is not a directory.
    """
    if os.path.isdir(folder_name):
        print(f"INFO: Folder '{folder_name}' already exists.")
    elif os.path.exists(folder_name):
        # The path is taken by a file (or similar). Report only the real
        # problem instead of first claiming that the folder already exists.
        print(f"ERROR: '{folder_name}' exists but is not a directory.")
        return False
    else:
        try:
            # exist_ok=True tolerates the folder being created by someone
            # else between the checks above and this call.
            os.makedirs(folder_name, exist_ok=True)
            print(f"SUCCESS: Created download folder: '{folder_name}'")
        except OSError as e:
            print(f"ERROR: Could not create folder '{folder_name}': {e}")
            return False

    # Verify the final state rather than trusting the steps above.
    if os.path.isdir(folder_name):
        print(f"SUCCESS: '{folder_name}' is a valid directory.")
        return True

    print(f"ERROR: '{folder_name}' exists but is not a directory.")
    return False

# --- Function from Part 2: Fetch Web Page Content ---
def fetch_page_content(url):
    """
    Request ``url`` and return the body of the response as text.

    Args:
        url (str): Address of the page to request.

    Returns:
        str | None: The page text on success; None when ``requests``
        raises any of its exceptions (the failure is printed instead).
    """
    print(f"INFO: Attempting to fetch content from: {url}")
    page_text = None  # stays None unless the whole request succeeds
    try:
        reply = requests.get(url, timeout=10)
        # Error statuses (4xx/5xx) are raised here and handled below.
        reply.raise_for_status()
        print(f"SUCCESS: Successfully fetched content from {url}")
        page_text = reply.text
    except requests.exceptions.RequestException as problem:
        print(f"ERROR: Failed to fetch content from {url}: {problem}")
    return page_text

# --- Function from Part 3: Extract PDF Links ---
def extract_pdf_links(html_content, base_url, limit=100):
    """
    Collect unique, same-site PDF links from a page's HTML.

    Every ``<a href=...>`` is resolved against ``base_url``. A link is kept
    when the *path* of its URL ends in ``.pdf`` (case-insensitive) and its
    network location equals that of ``base_url`` or is a sub-domain of it.
    Query strings are preserved; ``#fragment`` suffixes are dropped so the
    same file is not collected twice.

    Args:
        html_content (str): Raw HTML of the page. A falsy value yields an
            empty result.
        base_url (str): URL the page came from; used to resolve relative
            links and as the reference domain.
        limit (int | None): Maximum number of links to collect. Zero or a
            negative number collects nothing; None means no limit.

    Returns:
        set[str]: Absolute URLs of the PDF links that were found.
    """
    if not html_content:
        print("ERROR: No HTML content provided to extract links from.")
        return set()

    if limit is not None and limit <= 0:
        # Nothing may be collected, so skip parsing altogether.
        print(f"INFO: Link limit is {limit}; no PDF links collected.")
        return set()

    print("INFO: Starting to parse HTML for PDF links...")
    soup = BeautifulSoup(html_content, 'html.parser')
    pdf_links = set()

    base_domain = urlparse(base_url).netloc

    for a_tag in soup.find_all('a', href=True):
        # A fragment never changes which file is served, so drop it.
        full_url = urljoin(base_url, a_tag['href']).partition('#')[0]
        parsed_full_url = urlparse(full_url)

        # Test the path only: "file.pdf?download=1" is a PDF, while a query
        # string that merely ends in ".pdf" is not.
        if not parsed_full_url.path.lower().endswith('.pdf'):
            continue

        # Same domain or a sub-domain of it; external hosts are ignored.
        link_domain = parsed_full_url.netloc
        if link_domain != base_domain and not link_domain.endswith(f".{base_domain}"):
            continue

        pdf_links.add(full_url)
        if limit is not None and len(pdf_links) >= limit:
            print(f"INFO: Reached limit of {limit} unique PDF links during parsing. Stopping link collection.")
            break

    print(f"SUCCESS: Found {len(pdf_links)} unique PDF links.")
    return pdf_links

# --- NEW Function for Part 4: Download PDFs ---
def download_pdfs(pdf_urls, download_folder, limit=100):
    """
    Download PDF files from a collection of URLs into ``download_folder``.

    Each file is named after the last path segment of its URL. A file that
    already exists in the folder is skipped and counted as handled. Data is
    streamed into a temporary ``.part`` file and renamed only after the
    whole body has arrived, so an interrupted download never leaves a
    truncated PDF that a later run would mistake for a finished one.
    Failures are printed and the loop moves on to the next URL.

    Args:
        pdf_urls: Collection (set, list, ...) of absolute PDF URLs.
        download_folder (str): Existing folder that receives the files.
        limit (int): Maximum number of URLs to process.

    Returns:
        None
    """
    if not pdf_urls:
        print("INFO: No PDF URLs provided for download.")
        return

    # Only the first `limit` URLs are processed.
    targets = list(pdf_urls)[:limit]
    total = len(targets)

    print(f"\nINFO: Starting to download {total} PDFs...")
    downloaded_count = 0

    for position, pdf_url in enumerate(targets, start=1):
        # urlparse().path already excludes any query string.
        file_name = os.path.basename(urlparse(pdf_url).path)
        if not file_name:  # e.g. the URL path ends with "/"
            file_name = f"downloaded_pdf_{position}.pdf"

        save_path = os.path.join(download_folder, file_name)

        if os.path.exists(save_path):
            print(f"INFO: Skipping existing file: {file_name}")
            downloaded_count += 1  # Count it as 'handled' even if skipped
            continue

        temp_path = save_path + ".part"
        try:
            print(f"Downloading {position}/{total}: {file_name} from {pdf_url}")
            # The context manager releases the streamed connection even when
            # the transfer fails half-way. Longer timeout for large files.
            with requests.get(pdf_url, stream=True, timeout=30) as pdf_response:
                pdf_response.raise_for_status()
                with open(temp_path, 'wb') as f:
                    for chunk in pdf_response.iter_content(chunk_size=8192):
                        f.write(chunk)

            # Publish the file under its final name only once it is complete.
            os.replace(temp_path, save_path)
            print(f"SUCCESS: Downloaded: {file_name}")
            downloaded_count += 1

        except requests.exceptions.RequestException as e:
            print(f"ERROR: Failed to download {pdf_url}: {e}")
        except Exception as e:
            # Deliberately broad: one bad file must not stop the whole batch.
            print(f"ERROR: An unexpected error occurred while processing {pdf_url}: {e}")
        finally:
            # Remove a leftover partial file (absent after a success).
            if os.path.exists(temp_path):
                try:
                    os.remove(temp_path)
                except OSError:
                    pass  # best-effort cleanup; the failure was reported above

    print(f"\nFinished download process. Total PDFs downloaded/skipped: {downloaded_count}/{total}")


if __name__ == "__main__":
    mlk_records_url = "https://www.archives.gov/research/mlk"
    pdf_download_folder = "mlk_pdfs"  # This is the folder created in Part 1
    download_limit = 100  # Adjust this if you want more or fewer than 100 PDFs

    # NOTE: the pipeline is stopped with `raise SystemExit(...)`. The bare
    # `exit()` helper is injected by the `site` module for interactive use
    # and is not meant for programs; SystemExit is always available and
    # lets failures carry a non-zero exit status.

    # --- Step 1: Setup Folder ---
    folder_is_ready = setup_download_folder(pdf_download_folder)
    if not folder_is_ready:
        print("Aborting download process due to folder issues.")
        raise SystemExit(1)  # Stop if folder setup failed

    # --- Step 2: Fetch Page Content ---
    page_html = fetch_page_content(mlk_records_url)
    if not page_html:
        print("Aborting download process as page content could not be retrieved.")
        raise SystemExit(1)  # Stop if page content fetch failed

    # --- Step 3: Extract PDF Links ---
    found_pdf_urls = extract_pdf_links(page_html, mlk_records_url, limit=download_limit)
    if not found_pdf_urls:
        print("No PDF links found. Aborting download process.")
        raise SystemExit(0)  # Nothing to download is not an error

    # --- Step 4: Download PDFs ---
    download_pdfs(found_pdf_urls, pdf_download_folder, limit=download_limit)

    print("\n---------------------------------------------------------")
    # Name the folder actually used, in case pdf_download_folder was changed.
    print(f"PDF download script finished. Check your '{pdf_download_folder}' folder.")
    print("---------------------------------------------------------")

In [None]:
import os
from pypdf import PdfWriter # Corrected: Using PdfWriter as PdfMerger is deprecated

def combine_pdfs_in_folder(input_folder, output_filename="merged_mlk_records.pdf"):
    """
    Combine the PDF files found under a folder into a single PDF.

    The folder is walked recursively. Files are merged in sorted path order
    and any file whose name equals ``output_filename`` (case-insensitive) is
    left out, so an earlier merge result is not merged into itself. A file
    that cannot be appended is reported and skipped. The merged PDF is
    written only when at least one file was appended.

    Args:
        input_folder (str): The path to the folder containing the PDF files.
        output_filename (str): The name of the output merged PDF file,
            created inside ``input_folder``.

    Returns:
        None
    """
    pdf_files = []
    skip_name = output_filename.lower()

    # Collect all PDF files in the input folder
    print(f"INFO: Scanning '{input_folder}' for PDF files...")
    for root, _, files in os.walk(input_folder):
        for file in files:
            lowered = file.lower()
            # Only PDFs, and never the output file itself if it already exists
            if lowered.endswith('.pdf') and lowered != skip_name:
                pdf_files.append(os.path.join(root, file))

    # Sort files to ensure consistent merging order (e.g., by name)
    pdf_files.sort()

    if not pdf_files:
        print(f"INFO: No PDF files found in '{input_folder}' to merge. (Make sure your downloaded PDFs are in there!)")
        return

    print(f"INFO: Found {len(pdf_files)} PDFs to merge. Starting merge process...")

    # Create the writer only now that there is something to merge, and make
    # sure it is closed on every path out of this function.
    merger = PdfWriter()
    appended_count = 0
    try:
        for pdf in pdf_files:
            try:
                merger.append(pdf)
                print(f"INFO: Appended: {os.path.basename(pdf)}")
                appended_count += 1
            except Exception as e:
                # One unreadable PDF must not abort the whole merge.
                print(f"ERROR: Could not append {os.path.basename(pdf)}: {e}")

        if appended_count == 0:
            # Do not write an empty PDF and call it a success.
            print("ERROR: None of the PDFs could be appended; no merged file was written.")
            return

        output_path = os.path.join(input_folder, output_filename)
        try:
            with open(output_path, "wb") as f:
                merger.write(f)
        except Exception as e:
            print(f"ERROR: Error writing merged PDF: {e}")
            return

        if appended_count == len(pdf_files):
            print(f"\nSUCCESS: All PDFs merged successfully into: {output_path}")
        else:
            print(f"\nWARNING: Merged {appended_count} of {len(pdf_files)} PDFs into: {output_path} (see errors above).")
    finally:
        merger.close()  # Ensure all file handles are closed

if __name__ == "__main__":
    # Folder expected to hold the previously downloaded PDFs.
    pdf_input_folder = "mlk_pdfs"
    combine_pdfs_in_folder(pdf_input_folder)

    # Closing banner: one argument per output line.
    print(
        "\n---------------------------------------------------------",
        "PDF merging script finished. Check your 'mlk_pdfs' folder for the merged file.",
        "---------------------------------------------------------",
        sep="\n",
    )

In [None]:
import ocrmypdf
import os

def ocr_pdf_file(input_pdf_path, output_pdf_path, language='eng'):
    """
    Run OCR over a PDF and write the result to a new PDF.

    The work is delegated to ``ocrmypdf.ocr`` with ``force_ocr=True`` and
    ``optimize=1``. According to the original author's note the call is
    written for the ocrmypdf v16.x.x API. Problems are printed, not raised.

    Args:
        input_pdf_path (str): Path of the PDF to process; must exist.
        output_pdf_path (str): Path where the OCR'd PDF is written.
        language (str): Language value handed to ocrmypdf.

    Returns:
        None
    """
    # Guard clause: nothing to do without an input file.
    if not os.path.exists(input_pdf_path):
        print(f"ERROR: Input PDF not found at {input_pdf_path}")
        return

    source_name = os.path.basename(input_pdf_path)
    print(f"INFO: Starting OCR for {source_name}...")
    print(f"This may take a while, especially for a large file (100 PDFs merged).")
    try:
        ocrmypdf.ocr(
            input_file=input_pdf_path,
            output_file=output_pdf_path,
            language=language,
            force_ocr=True,
            optimize=1,
        )
        print(f"\nSUCCESS: OCR completed successfully! Searchable PDF saved to: {output_pdf_path}")
    except ocrmypdf.exceptions.InputFileError as input_problem:
        print(f"ERROR: OCR error: Input file issue - {input_problem}")
    except Exception as problem:
        # Look for hints that an external tool is missing so the message can
        # point the user at their installation.
        lowered = str(problem).lower()
        tool_hints = ("tesseract", "ghostscript", "command not found")
        if any(hint in lowered for hint in tool_hints):
            print(f"ERROR: OCR error: Tesseract or Ghostscript command failed. This is often due to them not being installed or not in your system's PATH. Please verify their installation. Details: {problem}")
        else:
            print(f"ERROR: An unexpected OCR error occurred: {problem}")

if __name__ == "__main__":
    pdf_folder = "mlk_pdfs"
    # Input is the merged PDF from the merging step; output sits next to it.
    input_merged_pdf = os.path.join(pdf_folder, "merged_mlk_records.pdf")
    output_searchable_pdf = os.path.join(pdf_folder, "searchable_mlk_records.pdf")

    ocr_pdf_file(input_merged_pdf, output_searchable_pdf, language='eng')

    # Closing banner: one argument per output line.
    print(
        "\n---------------------------------------------------------",
        "OCR script finished. Check your 'mlk_pdfs' folder for the searchable PDF.",
        "---------------------------------------------------------",
        sep="\n",
    )

In [1]:
!pip freeze > requirements.txt