In [2]:
import re
import pdfplumber
from pathlib import Path
from tqdm.notebook import tqdm  # Progress bars
import pandas as pd

# Configuration
RAW_DATA_DIR = Path("data/raw") 
CLEANED_DIR = Path("data/cleaned")
ERROR_LOG = "cleaning_errors.csv"

# --- Cleaning Functions ---
def clean_text(text):
    """Applies all cleaning rules to extracted text"""
    # Fix hyphenated line breaks
    text = re.sub(r"(\w)-\n(\w)", r"\1\2", text, flags=re.UNICODE)
    # Remove common PDF artifacts
    text = re.sub(r"Page\s*\d+\s*of\s*\d+", "", text, flags=re.IGNORECASE)
    # Normalize whitespace
    text = re.sub(r"\s+", " ", text).strip()
    return text

def process_pdf(pdf_path):
    """Extracts and cleans text from a single PDF"""
    try:
        with pdfplumber.open(pdf_path) as pdf:
            pages = [clean_text(page.extract_text(layout=True)) 
                   for page in pdf.pages if page.extract_text()]
        return " ".join(pages), None
    except Exception as e:
        return None, str(e)

# --- Directory Structure Setup ---
def mirror_directory_structure(base_dir, target_dir):
    """Creates matching subdirectories in target location"""
    for path in base_dir.rglob("*"):
        if path.is_dir():
            relative = path.relative_to(base_dir)
            (target_dir / relative).mkdir(parents=True, exist_ok=True)

# --- Main Processing ---
def process_all_pdfs():
    errors = []
    mirror_directory_structure(RAW_DATA_DIR, CLEANED_DIR)
    
    pdf_paths = list(RAW_DATA_DIR.rglob("*.pdf"))
    print(f"Found {len(pdf_paths)} PDFs to process")
    
    for pdf_path in tqdm(pdf_paths, desc="Processing PDFs"):
        text, error = process_pdf(pdf_path)
        relative_path = pdf_path.relative_to(RAW_DATA_DIR)
        
        if text:
            # Save cleaned text
            output_path = CLEANED_DIR / relative_path.with_suffix(".txt")
            output_path.write_text(text, encoding="utf-8")
        elif error:
            errors.append({
                "file": str(relative_path),
                "error": error
            })
    
    # Save error log
    if errors:
        pd.DataFrame(errors).to_csv(ERROR_LOG, index=False)
        print(f"Encountered {len(errors)} errors - see {ERROR_LOG}")

# Execute
process_all_pdfs()
print("Cleaning complete! Check data/cleaned/ for results.")

Found 272 PDFs to process


Processing PDFs:   0%|          | 0/272 [00:00<?, ?it/s]

Cleaning complete! Check data/cleaned/ for results.


In [12]:
from pathlib import Path
import random

# Get 3 random cleaned files
cleaned_files = list(Path("data/cleaned").rglob("*.txt"))
samples = random.sample(cleaned_files, 25)

for file in samples:
    print(f"=== {file.name} ===")
    print(file.read_text()[:200] + "...")  # First 200 chars
    print("\n")

print(f"Total cleaned files: {len(cleaned_files)}")
avg_length = sum(len(f.read_text()) for f in cleaned_files)/len(cleaned_files)
print(f"Avg. chars per file: {avg_length:.0f}")

=== 2324schoolaid.txt ===
DESCRIPTIONOF 2023-24 NEWYORKSTATEEXECUTIVEBUDGETRECOMMENDATIONSFORELEMENTARYANDSECONDARYEDUCATIONEDUCATIONUNITNEWYORKSTATEDIVISIONOFTHEBUDGETFebruary 1, 2023 INTRODUCTIONThis report provides a summar...


=== PEO Enterprise Procurement Forecast Report_16 JAN 2025 FINAL.txt ===
PEOEnterprise Procurement Forecast, Jan. 16, 2025 Click here to request a meeting with PEOEnterprise personnel. Any Period of Performance longer than 12 months includes options. DISCLAIMER: The inform...


=== gao-25-107003-highlights.txt ===
December 2024 DODACQUISITIONREFORMMilitary Departments Should Take Steps to Facilitate Speed and Innovation Highlights of GAO-25-107003, a report to congressional committees Why GAODid This Study What...


=== ESSER-Grantee-Consolidated-Monitoring-Protocols-1.txt ===
Internal Deliberative ESSERProgram Fiscal and Program Requirement Domains – Consolidated Monitoring A. State Context CARESAct Section 18003 Description: Congress set aside approximatel

In [6]:
import re
import os
from pathlib import Path

def clean_text(text):
    """Enhanced cleaning with document structure awareness"""
    # Phase 1: Remove document metadata patterns
    patterns_to_remove = [
        # Header/footer patterns
        r'^=== .*? ===\s*',  
        r'^OFFICE OF .*?$\n',
        r'^Department of .*?$\n',
        
        # Author patterns (expanded list)
        r'^(?:Authored?|Written|Prepared|Reported|Compiled) by .*?$\n',
        r'^Contributors?:.*?$\n',
        r'^(?:The )?Authors?:?.*?$\n',
        r'^By [A-Z][a-z]+ [A-Z][a-z]+(?:,? (?:and|&) [A-Z][a-z]+ [A-Z][a-z]+)*$\n',
        
        # Boilerplate text
        r'Additional Copies.*',
        r'For more information.*',
        r'Distribution Statement.*'
    ]
    
    for pattern in patterns_to_remove:
        text = re.sub(pattern, '', text, flags=re.IGNORECASE|re.MULTILINE)
    
    # Phase 2: Clean residual formatting
    text = re.sub(r'\n{3,}', '\n\n', text)  # Reduce excessive newlines
    text = re.sub(r'[^\S\n]{2,}', ' ', text)  # Fix multiple spaces
    text = text.strip()
    
    return text

def process_category_directory(base_dir):
    """Process all files in data/cleaned/[category]/[subcategory] structure"""
    base_path = Path(base_dir)
    total_files = 0
    
    for category in base_path.iterdir():
        if not category.is_dir():
            continue
            
        for subcategory in category.iterdir():
            if not subcategory.is_dir():
                continue
                
            for file in subcategory.glob('*.txt'):
                total_files += 1
                with open(file, 'r', encoding='utf-8', errors='replace') as f:
                    original = f.read()
                
                cleaned = clean_text(original)
                
                # Overwrite with cleaned version (or use output_dir to preserve originals)
                with open(file, 'w', encoding='utf-8') as f:
                    f.write(cleaned)
    
    return total_files

if __name__ == '__main__':
    input_base = 'data/cleaned'  # Your exact structure
    processed_count = process_category_directory(input_base)
    
    print(f"Successfully cleaned {processed_count} files")
    print("Structure preserved in:")
    print(f"  {input_base}/[category]/[subcategory]/*.txt")

Successfully cleaned 269 files
Structure preserved in:
  data/cleaned/[category]/[subcategory]/*.txt


In [11]:
import re
from pathlib import Path

def clean_gov_document(text):
    """Focused text cleaner (removes authors/boilerplate only)"""
    # 1. Remove file header markers (=== filename ===)
    text = re.sub(r'^=== .*? ===\s*', '', text, flags=re.MULTILINE)
    
    # 2. Remove author/editor metadata
    patterns = [
        (r'Chief Editor.*?(\n\n|\Z)', '', re.DOTALL|re.IGNORECASE),
        (r'Editors?.*?(\n\n|\Z)', '', re.DOTALL|re.IGNORECASE),
        (r'Prepared by.*?(\n\n|\Z)', '', re.DOTALL|re.IGNORECASE),
        (r'Authors?:.*?(\n\n|\Z)', '', re.DOTALL|re.IGNORECASE)
    ]
    for pattern, repl, flags in patterns:
        text = re.sub(pattern, repl, text, flags=flags)
    
    # 3. Remove specific boilerplate
    text = re.sub(r'Briefings on How To Use the Federal Register.*', '', text, flags=re.DOTALL)
    text = re.sub(r'Form Approved\s+.*?OMB No\. \d+-\d+.*?\n', '', text)
    
    # 4. Fix formatting
    text = re.sub(r'([A-Z])\s+([A-Z])', r'\1\2', text)  # Spaced-out caps
    text = re.sub(r'\n{3,}', '\n\n', text)  # Excessive newlines
    
    return text.strip()

def clean_files(root_dir):
    """Cleans all .txt files in place without moving them"""
    for filepath in Path(root_dir).rglob('*.txt'):
        try:
            original = filepath.read_text(encoding='utf-8', errors='replace')
            cleaned = clean_gov_document(original)
            filepath.write_text(cleaned, encoding='utf-8')
            print(f"✔ Cleaned {filepath.relative_to(root_dir)}")
        except Exception as e:
            print(f"❌ Failed {filepath.relative_to(root_dir)}: {str(e)}")

if __name__ == '__main__':
    input_dir = "data/cleaned"  # Your directory structure
    print(f"🧹 Cleaning files in {input_dir}...")
    clean_files(input_dir)
    print("✅ Done! Files were cleaned IN PLACE.")

🧹 Cleaning files in data/cleaned...
✔ Cleaned healthcare/FDA/Updated.DSC-Hydroxychloroquine.chloroquine.txt
✔ Cleaned healthcare/FDA/24740676_FAR.txt
✔ Cleaned healthcare/FDA/Acceptability-of-Draft-Labeling-to-Support-Abbreviated-New-Drug-Application-Approval--Guidance-for-Industry.txt
✔ Cleaned healthcare/FDA/1332.txt
✔ Cleaned healthcare/FDA/CDRH_International_Harmonization_Draft_Strategic_Plan_2023.txt
✔ Cleaned healthcare/FDA/40244414fnl.txt
✔ Cleaned healthcare/FDA/December-8-2023-Approval-Letter-LYFGENIA.txt
✔ Cleaned healthcare/FDA/December 16, 2021 Approval Letter - Comirnaty.txt
✔ Cleaned healthcare/FDA/2023 ODP Health Alert -FDA Drug Safety Communication.txt
✔ Cleaned healthcare/FDA/dsc_prolia_risk_of_severe_hypocalcemia_for_web.txt
✔ Cleaned healthcare/FDA/FDA-Drug-Safety-Communication.txt
✔ Cleaned healthcare/FDA/FDA-Drug-Safety-Communication--Fluoroquinolone.txt
✔ Cleaned healthcare/FDA/Guidance-Medical-Xray-IEC.txt
✔ Cleaned healthcare/FDA/US-FDA-Artificial-Intelligence-a