In [2]:
import re
import pdfplumber
from pathlib import Path
from tqdm.notebook import tqdm  # Progress bars
import pandas as pd

# Configuration
# Root folder that is searched recursively for "*.pdf" inputs.
RAW_DATA_DIR = Path("data/raw") 
# Root folder that receives the cleaned ".txt" files; the sub-folder layout of
# RAW_DATA_DIR is mirrored underneath it.
CLEANED_DIR = Path("data/cleaned")
# CSV file (relative to the current working directory) listing every PDF that
# could not be processed, with the columns "file" and "error".
ERROR_LOG = "cleaning_errors.csv"

# --- Cleaning Functions ---
def clean_text(text):
    """Apply all cleaning rules to text extracted from a PDF page.

    The rules run in this order:

    1. Re-join words that were hyphenated across a line break. Spaces or
       tabs around the break are tolerated, because layout-preserving
       extraction pads lines with whitespace.
    2. Remove "Page X of Y" markers (case-insensitive).
    3. Collapse every run of whitespace into a single space and strip
       both ends.

    Args:
        text: Raw extracted text. ``None`` and ``""`` are accepted.

    Returns:
        The cleaned, single-line string, or ``""`` when there is nothing
        to clean.
    """
    if not text:
        return ""

    # NOTE: a genuine compound that happens to break at the hyphen
    # ("well-" / "known") is also merged; that cannot be told apart here.
    text = re.sub(r"(\w)-[ \t]*\r?\n[ \t]*(\w)", r"\1\2", text)
    # Remove common PDF artifacts
    text = re.sub(r"Page\s*\d+\s*of\s*\d+", "", text, flags=re.IGNORECASE)
    # Normalize whitespace
    return re.sub(r"\s+", " ", text).strip()

def process_pdf(pdf_path):
    """Extract and clean the text of a single PDF.

    Every page is extracted once in pdfplumber's layout mode, passed through
    ``clean_text`` and kept only when something is left after cleaning, so
    blank pages do not add stray separators to the result.

    Args:
        pdf_path: Location of the PDF, handed to ``pdfplumber.open``.

    Returns:
        A ``(text, error)`` tuple. On success ``text`` is the cleaned pages
        joined by single spaces (``""`` when no page produced any text) and
        ``error`` is ``None``. On failure ``text`` is ``None`` and ``error``
        is a non-empty ``"ExceptionType: message"`` string.
    """
    try:
        cleaned_pages = []
        with pdfplumber.open(pdf_path) as pdf:
            for page in pdf.pages:
                # Extract once: text extraction is the expensive step.
                raw = page.extract_text(layout=True)
                if not raw:
                    continue
                cleaned = clean_text(raw)
                if cleaned:
                    cleaned_pages.append(cleaned)
        return " ".join(cleaned_pages), None
    except Exception as e:
        # Deliberately broad: one unreadable PDF must not stop a batch run.
        # The type name keeps the message non-empty even when str(e) is "".
        return None, f"{type(e).__name__}: {e}"

# --- Directory Structure Setup ---
def mirror_directory_structure(base_dir, target_dir):
    """Create, under ``target_dir``, every sub-directory found in ``base_dir``.

    Only directories are replicated; files are not copied. ``target_dir``
    itself is always created, even when ``base_dir`` has no sub-directories,
    so callers can write directly into it afterwards.

    Args:
        base_dir: Directory whose layout is mirrored (``Path`` or ``str``).
        target_dir: Directory that receives the mirrored layout
            (``Path`` or ``str``).
    """
    base_dir = Path(base_dir)
    target_dir = Path(target_dir)

    # rglob only yields entries *below* base_dir, so the root of the target
    # tree has to be created explicitly.
    target_dir.mkdir(parents=True, exist_ok=True)

    for path in base_dir.rglob("*"):
        if path.is_dir():
            relative = path.relative_to(base_dir)
            (target_dir / relative).mkdir(parents=True, exist_ok=True)

# --- Main Processing ---
def process_all_pdfs():
    """Clean every PDF under RAW_DATA_DIR and write the text to CLEANED_DIR.

    For each "*.pdf" found recursively under RAW_DATA_DIR the cleaned text is
    written as UTF-8 to the same relative location under CLEANED_DIR, with the
    suffix replaced by ".txt". Every PDF that does not produce an output file
    is recorded with a reason: an extraction failure, a PDF without any
    extractable text, or a failure to write the output. If at least one
    problem was recorded, the records are saved to ERROR_LOG as CSV with the
    columns "file" and "error".
    """
    errors = []
    mirror_directory_structure(RAW_DATA_DIR, CLEANED_DIR)

    pdf_paths = list(RAW_DATA_DIR.rglob("*.pdf"))
    print(f"Found {len(pdf_paths)} PDFs to process")

    for pdf_path in tqdm(pdf_paths, desc="Processing PDFs"):
        relative_path = pdf_path.relative_to(RAW_DATA_DIR)
        text, error = process_pdf(pdf_path)

        if text:
            # Save cleaned text
            output_path = CLEANED_DIR / relative_path.with_suffix(".txt")
            try:
                # Do not rely on the mirroring step having created the folder.
                output_path.parent.mkdir(parents=True, exist_ok=True)
                output_path.write_text(text, encoding="utf-8")
            except OSError as exc:
                errors.append({
                    "file": str(relative_path),
                    "error": f"Could not write output: {exc}"
                })
            continue

        # No usable text: always record why, so no PDF is dropped silently.
        # text is None  -> extraction raised (error may be an empty string)
        # text == ""    -> the PDF opened but contained no extractable text
        reason = error or (
            "Unknown error (exception had no message)" if text is None
            else "No extractable text found"
        )
        errors.append({
            "file": str(relative_path),
            "error": reason
        })

    # Save error log
    if errors:
        pd.DataFrame(errors).to_csv(ERROR_LOG, index=False)
        print(f"Encountered {len(errors)} errors - see {ERROR_LOG}")

# Run the whole cleaning pipeline, then tell the user where the output is.
process_all_pdfs()
completion_message = "Cleaning complete! Check data/cleaned/ for results."
print(completion_message)

Found 272 PDFs to process


Processing PDFs:   0%|          | 0/272 [00:00<?, ?it/s]

Cleaning complete! Check data/cleaned/ for results.


In [None]:
from pathlib import Path
import random

# Spot-check the cleaning output: preview up to 3 random cleaned files and
# report basic size statistics.
cleaned_files = list(Path("data/cleaned").rglob("*.txt"))

# random.sample raises ValueError when asked for more items than exist, so
# the sample size is capped at the number of files available.
samples = random.sample(cleaned_files, min(3, len(cleaned_files)))

for sample_path in samples:
    print(f"=== {sample_path.name} ===")
    # The files were written as UTF-8; read them back the same way instead of
    # relying on the platform's default encoding.
    print(sample_path.read_text(encoding="utf-8")[:200] + "...")  # First 200 chars
    print("\n")

print(f"Total cleaned files: {len(cleaned_files)}")
if cleaned_files:
    total_chars = sum(len(f.read_text(encoding="utf-8")) for f in cleaned_files)
    avg_length = total_chars / len(cleaned_files)
    print(f"Avg. chars per file: {avg_length:.0f}")
else:
    # Nothing to average; avoid dividing by zero.
    avg_length = 0.0
    print("No cleaned files found - nothing to average.")

=== 2023-073-R-AssistantDDofFBC&P.txt ===
POSTING #: ISSUE DATE: CLOSING DATE: 2023 - 073 - R July 24, 2023 September 25, 2023 TITLE: OPEN TO: FUNCTIONAL TITLE: Assistant Division Director RANGE: General Public WORKWEEK: Assistant Director of...


=== R107MCM.txt ===
CMS Manual System Department of Health & Human Services (DHHS) Pub. 100-16 Medicare Managed Care Centers for Medicare & Medicaid Services (CMS) Transmittal 107 Date: June 22, 2012 SUBJECT: Chapter 4, ...


=== ED603154.txt ===
H.R. 4674, the College Affordability Act: Proposed Reauthorization of the Higher Education Act, Summary of Major Provisions January 7, 2020 Congressional Research Service https://crsreports.congress.g...


