In [2]:
import re
import pdfplumber
from pathlib import Path
from tqdm.notebook import tqdm  # Progress bars
import pandas as pd

# Configuration
RAW_DATA_DIR = Path("data/raw") 
CLEANED_DIR = Path("data/cleaned")
ERROR_LOG = "cleaning_errors.csv"

# --- Cleaning Functions ---
def clean_text(text):
    """Applies all cleaning rules to extracted text"""
    # Fix hyphenated line breaks
    text = re.sub(r"(\w)-\n(\w)", r"\1\2", text, flags=re.UNICODE)
    # Remove common PDF artifacts
    text = re.sub(r"Page\s*\d+\s*of\s*\d+", "", text, flags=re.IGNORECASE)
    # Normalize whitespace
    text = re.sub(r"\s+", " ", text).strip()
    return text

def process_pdf(pdf_path):
    """Extracts and cleans text from a single PDF"""
    try:
        with pdfplumber.open(pdf_path) as pdf:
            pages = [clean_text(page.extract_text(layout=True)) 
                   for page in pdf.pages if page.extract_text()]
        return " ".join(pages), None
    except Exception as e:
        return None, str(e)

# --- Directory Structure Setup ---
def mirror_directory_structure(base_dir, target_dir):
    """Creates matching subdirectories in target location"""
    for path in base_dir.rglob("*"):
        if path.is_dir():
            relative = path.relative_to(base_dir)
            (target_dir / relative).mkdir(parents=True, exist_ok=True)

# --- Main Processing ---
def process_all_pdfs():
    errors = []
    mirror_directory_structure(RAW_DATA_DIR, CLEANED_DIR)
    
    pdf_paths = list(RAW_DATA_DIR.rglob("*.pdf"))
    print(f"Found {len(pdf_paths)} PDFs to process")
    
    for pdf_path in tqdm(pdf_paths, desc="Processing PDFs"):
        text, error = process_pdf(pdf_path)
        relative_path = pdf_path.relative_to(RAW_DATA_DIR)
        
        if text:
            # Save cleaned text
            output_path = CLEANED_DIR / relative_path.with_suffix(".txt")
            output_path.write_text(text, encoding="utf-8")
        elif error:
            errors.append({
                "file": str(relative_path),
                "error": error
            })
    
    # Save error log
    if errors:
        pd.DataFrame(errors).to_csv(ERROR_LOG, index=False)
        print(f"Encountered {len(errors)} errors - see {ERROR_LOG}")

# Execute
process_all_pdfs()
print("Cleaning complete! Check data/cleaned/ for results.")

Found 272 PDFs to process


Processing PDFs:   0%|          | 0/272 [00:00<?, ?it/s]

Cleaning complete! Check data/cleaned/ for results.


In [19]:
from pathlib import Path
import random

# Get 3 random cleaned files
cleaned_files = list(Path("data/cleaned").rglob("*.txt"))
samples = random.sample(cleaned_files, 25)

for file in samples:
    print(f"=== {file.name} ===")
    print(file.read_text()[:200] + "...")  # First 200 chars
    print("\n")

print(f"Total cleaned files: {len(cleaned_files)}")
avg_length = sum(len(f.read_text()) for f in cleaned_files)/len(cleaned_files)
print(f"Avg. chars per file: {avg_length:.0f}")

=== Medicaid Annual Report - 2024.txt ===
Department of Health and Hu December 1, 2024 The Honorable Jim Pillen Governor of Nebraska P.O. Box 94848 Lincoln, NE 68509 Mr. Brandon Metzler Clerk of the Legislature P.O. Box 94604 Lincoln, NE 6850...


=== R87MCM.txt ===
CMSManual System Department of Health & Human Services (DHHS) Pub. 100-16 Medicare Managed Care Centers for Medicare & Medicaid Services (CMS) Transmittal 87 Date: JUNE 8, 2007 SUBJECT: Update of Chap...


=== draft-exhibits-part21.txt ===
Annual Financial Report (AFR) -2017 Other Assets Latest Fiscal Year End Previous Fiscal Year End Deposits $0.00 Other Prepaid Expenses $0.00 Goodwill $0.00 Revolving Book Account $0.00 SAMatching Fund...


=== Budget-in-Brief 2024.txt ===
VILLAGEOFWOODRIDGEBUDGET-IN-BRIEFFISCALYEAR 2024 MAYORALetter from the Mayor Gina Cunningham Village of Woodridge Residents and Businesses: BOARDOFTRUSTEES Jennifer Anteliz It is my pleasure to pres...


=== p334.txt ===
...


=== mc86c04.txt ===
Medic

In [6]:
import re
import os
from pathlib import Path

def clean_text(text):
    """Enhanced cleaning with document structure awareness"""
    # Phase 1: Remove document metadata patterns
    patterns_to_remove = [
        # Header/footer patterns
        r'^=== .*? ===\s*',  
        r'^OFFICE OF .*?$\n',
        r'^Department of .*?$\n',
        
        # Author patterns (expanded list)
        r'^(?:Authored?|Written|Prepared|Reported|Compiled) by .*?$\n',
        r'^Contributors?:.*?$\n',
        r'^(?:The )?Authors?:?.*?$\n',
        r'^By [A-Z][a-z]+ [A-Z][a-z]+(?:,? (?:and|&) [A-Z][a-z]+ [A-Z][a-z]+)*$\n',
        
        # Boilerplate text
        r'Additional Copies.*',
        r'For more information.*',
        r'Distribution Statement.*'
    ]
    
    for pattern in patterns_to_remove:
        text = re.sub(pattern, '', text, flags=re.IGNORECASE|re.MULTILINE)
    
    # Phase 2: Clean residual formatting
    text = re.sub(r'\n{3,}', '\n\n', text)  # Reduce excessive newlines
    text = re.sub(r'[^\S\n]{2,}', ' ', text)  # Fix multiple spaces
    text = text.strip()
    
    return text

def process_category_directory(base_dir):
    """Process all files in data/cleaned/[category]/[subcategory] structure"""
    base_path = Path(base_dir)
    total_files = 0
    
    for category in base_path.iterdir():
        if not category.is_dir():
            continue
            
        for subcategory in category.iterdir():
            if not subcategory.is_dir():
                continue
                
            for file in subcategory.glob('*.txt'):
                total_files += 1
                with open(file, 'r', encoding='utf-8', errors='replace') as f:
                    original = f.read()
                
                cleaned = clean_text(original)
                
                # Overwrite with cleaned version (or use output_dir to preserve originals)
                with open(file, 'w', encoding='utf-8') as f:
                    f.write(cleaned)
    
    return total_files

if __name__ == '__main__':
    input_base = 'data/cleaned'  # Your exact structure
    processed_count = process_category_directory(input_base)
    
    print(f"Successfully cleaned {processed_count} files")
    print("Structure preserved in:")
    print(f"  {input_base}/[category]/[subcategory]/*.txt")

Successfully cleaned 269 files
Structure preserved in:
  data/cleaned/[category]/[subcategory]/*.txt


In [15]:
import re
from pathlib import Path
import pandas as pd

# Configuration - Only edit these patterns if absolutely needed
PATTERNS = {
    'tax_forms': r'Userid: .*?cycle\d+/source.*?(\n|$)',  # IRS/CMS system metadata
    'corrupted_caps': r'([A-Z])\1{2,}',  # Fix "EEXXEECCUUTTIIVVEE" → "EXECUTIVE"
    'non_gov': [  # Phrases that indicate non-government content
        r'opinion piece',
        r'charitable giving',
        r'by [A-Z][a-z]+ [A-Z][a-z]+, [A-Za-z ]+ Foundation'
    ]
}

def final_clean(text, filepath):
    """One-time aggressive cleaning with validation"""
    # Step 1: Remove known bad patterns
    text = re.sub(PATTERNS['tax_forms'], '', text, flags=re.DOTALL|re.IGNORECASE)
    text = re.sub(PATTERNS['corrupted_caps'], r'\1', text)
    
    # Step 2: Flag (not delete) non-government content
    is_gov = not any(re.search(p, text, re.I) for p in PATTERNS['non_gov'])
    
    # Step 3: Final normalization
    text = re.sub(r'\n{3,}', '\n\n', text).strip()
    return text, is_gov

def main(input_dir="data/cleaned"):
    """Run final clean with validation report"""
    stats = []
    for filepath in Path(input_dir).rglob("*.txt"):
        original = filepath.read_text(encoding='utf-8', errors='replace')
        cleaned, is_gov = final_clean(original, filepath)
        
        stats.append({
            'file': str(filepath),
            'original_lines': len(original.splitlines()),
            'cleaned_lines': len(cleaned.splitlines()),
            'is_government': is_gov,
            'issues_fixed': "tax_forms" if "Userid: CPMSchema" in original else None
        })
        
        filepath.write_text(cleaned)
    
    # Generate validation report
    df = pd.DataFrame(stats)
    report_path = Path("cleaning_report.csv")
    df.to_csv(report_path, index=False)
    
    print(f"""
    ✅ FINAL CLEANING COMPLETE
    =========================
    Files processed: {len(df)}
    Government documents: {df['is_government'].sum()}
    Files with tax metadata fixed: {sum(df['issues_fixed'] == "tax_forms")}
    
    Full report saved to: {report_path}
    
    NEXT STEPS:
    1. Review 'is_government=False' files in the report
    2. Delete any remaining non-gov files manually
    3. Proceed to model training
    """)

if __name__ == '__main__':
    main()


    ✅ FINAL CLEANING COMPLETE
    Files processed: 269
    Government documents: 268
    Files with tax metadata fixed: 9
    
    Full report saved to: cleaning_report.csv
    
    NEXT STEPS:
    1. Review 'is_government=False' files in the report
    2. Delete any remaining non-gov files manually
    3. Proceed to model training
    


In [20]:
import pandas as pd
from pathlib import Path

data = []
for file in Path("data/cleaned").rglob("*.txt"):
    category = file.parent.parent.name  # healthcare
    subcategory = file.parent.name      # medicaid
    text = file.read_text(encoding='utf-8').strip()
    
    data.append({
        "text": text,
        "category": category,
        "subcategory": subcategory,
        "source_file": file.name
    })

pd.DataFrame(data).to_csv("gov_docs.csv", index=False)
print("✅ CSV created with", len(data), "documents")

✅ CSV created with 269 documents


In [21]:
import pandas as pd
df = pd.read_csv("gov_docs.csv")
print(df.sample(3))  # Spot-check entries

                                                  text   category  \
224  FCritical Manufacturing Cybersecurity Framewor...    defense   
133  Revenues and Expenditures for Public FINANCETA...  education   
106  Glossary CFRDCLCost of Attendance 2 CHAPTER (B...  education   

               subcategory                                        source_file  
224          cybersecurity  critical-manufacturing-framework-implementatio...  
133             k12funding                                        2022301.txt  
106  highereducationpolicy  2019-2020 Chapter 2 - Cost of Attendance (Budg...  


In [None]:
import pandas as pd
df = pd.read_csv("gov_docs.csv")
hierarchy_issues = df.groupby("subcategory")["category"].nunique() > 1
print(hierarchy_issues[hierarchy_issues].index.tolist())  # Bad subcategories

[]


: 