In [None]:
# import os
# import requests
# from bs4 import BeautifulSoup
# from urllib.parse import urljoin

# # Configuration
# BASE_URL = "https://www.cms.gov/priorities/innovation/evaluation-research-reports"  # Replace with your target URL
# SAVE_DIR = "data/raw/healthcare"
# os.makedirs(SAVE_DIR, exist_ok=True)  # Create directory if missing

# # Custom headers to mimic browser behavior
# HEADERS = {
#     "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
# }

# def sanitize_filename(filename):
#     """Remove invalid characters from filenames"""
#     return "".join(c for c in filename if c.isalnum() or c in (' ', '-', '_')).rstrip()

# def download_pdf(url, save_path):
#     """Download PDF with error handling"""
#     try:
#         response = requests.get(url, headers=HEADERS, timeout=10)
#         response.raise_for_status()  # Raise HTTP errors
#         with open(save_path, 'wb') as f:
#             f.write(response.content)
#         print(f"✅ Saved: {save_path}")
#         return True
#     except Exception as e:
#         print(f"❌ Failed to download {url}: {str(e)}")
#         return False

# # Main scraping function
# def scrape_and_download():
#     print(f"🔍 Scraping {BASE_URL}...")
#     try:
#         response = requests.get(BASE_URL, headers=HEADERS)
#         response.raise_for_status()
#         soup = BeautifulSoup(response.text, 'html.parser')
        
#         pdf_count = 0
#         for link in soup.find_all('a', href=True):
#             href = link['href']
            
#             # Skip non-PDF links
#             if not href.lower().endswith('.pdf'):
#                 continue
                
#             # Construct absolute URL
#             pdf_url = urljoin(BASE_URL, href)
            
#             # Generate filename from link text or URL
#             filename = sanitize_filename(link.text.strip() or pdf_url.split('/')[-1]) + ".pdf"
#             save_path = os.path.join(SAVE_DIR, filename)
            
#             # Download PDF
#             if download_pdf(pdf_url, save_path):
#                 pdf_count += 1
        
#         print(f"🎉 Downloaded {pdf_count} PDFs to {SAVE_DIR}")
        
#     except Exception as e:
#         print(f"⚠️ Scraping failed: {str(e)}")

# if __name__ == "__main__":
#     scrape_and_download()

🔍 Scraping https://www.cms.gov/priorities/innovation/evaluation-research-reports...
✅ Saved: data/raw/healthcare/Marketplace help desk  call centers.pdf
🎉 Downloaded 1 PDFs to data/raw/healthcare


In [16]:
import os
import re
import time
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# Configuration - works in notebooks/IDEs/standalone scripts
# Locate the project root. When run as a script this is the directory that
# holds the file; in a notebook `__file__` is undefined, so the current
# working directory is used instead.
try:
    _root_dir = os.path.dirname(os.path.abspath(__file__))
except NameError:
    _root_dir = os.getcwd()

# All downloads land under <root>/data/raw/<category>.
BASE_DIR = os.path.join(_root_dir, "data", "raw")

# Category name -> list of landing pages to scan for document links.
CATEGORIES = {
    "healthcare": [
        "https://www.cms.gov/about-cms/agency-information/history",
        "https://www.medicaid.gov/about-us/reports-and-evaluations/index.html",
    ],
    "defense": [
        "https://www.defense.gov/News/Publications/",
        "https://media.defense.gov/Publications/",
    ],
}

# Request headers sent with every HTTP call.
HEADERS = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"}

# Seconds to pause between requests.
DELAY = 2

def setup_project_folders():
    """Make sure BASE_DIR/<category> exists for every key in CATEGORIES.

    Existing folders are left alone. One status line is printed per
    category, showing the folder path relative to the working directory.
    """
    category_dirs = [os.path.join(BASE_DIR, name) for name in CATEGORIES]
    for folder in category_dirs:
        os.makedirs(folder, exist_ok=True)
        print(f"📁 Created: {os.path.relpath(folder)}")

def is_gov_pdf(url):
    """Return True when *url* looks like a link to a PDF document.

    The check is a case-insensitive heuristic on the URL text only; nothing
    is fetched. A URL is accepted when either:

    * its path (with any ``?query`` and ``#fragment`` removed) ends in
      ``.pdf``, or
    * it contains one of the substrings listed in ``markers`` below.

    Note that the substring markers are deliberately loose: any URL that
    contains ``/publications/`` or ``/pdf`` is accepted, whether or not it
    actually serves a PDF.

    Args:
        url: Absolute or relative URL string; ``None`` or ``""`` is allowed.

    Returns:
        bool: True if the URL matches the heuristic, otherwise False.
    """
    if not url:
        return False

    url_lower = url.lower()

    # Strip the fragment and query string before testing the extension so
    # links such as ".../report.pdf?ver=2" or ".../report.pdf#page=3" are
    # recognised as PDFs.
    path_only = url_lower.split('#', 1)[0].split('?', 1)[0]
    if path_only.endswith('.pdf'):
        return True

    markers = (
        '/pdf',
        'download=pdf',
        'type=pdf',
        '/document_library/',
        '/publications/',
        'file=pdf',
    )
    return any(marker in url_lower for marker in markers)

def download_gov_pdf(url, save_path):
    """Stream the document at *url* to disk and report the outcome.

    If the response carries a ``Content-Disposition`` header with a usable
    ``filename=`` value, that name replaces the file-name part of
    *save_path* (the directory part is always kept). Otherwise the file is
    written to *save_path* unchanged.

    Args:
        url: Address of the document to fetch.
        save_path: Destination path; its directory must already exist.

    Returns:
        bool: True when the file was written, False when any error occurred
        (the error is printed, not raised).
    """
    try:
        # NOTE(review): verify=False turns off TLS certificate validation.
        # It is kept to preserve existing behaviour, but it exposes the
        # download to man-in-the-middle tampering; confirm it is required.
        with requests.get(
            url,
            headers=HEADERS,
            stream=True,
            verify=False,
            timeout=30,
        ) as response:  # context manager releases the connection
            response.raise_for_status()

            # Prefer the server-suggested filename when one is present.
            # A header without filename= (e.g. "inline") is simply ignored
            # instead of aborting the download.
            disposition = response.headers.get('Content-Disposition', '')
            match = re.search(r'filename="?([^";]+)"?', disposition)
            if match:
                # The header is untrusted input: keep only the final path
                # component so it cannot escape the target directory.
                suggested = match.group(1).strip().replace('\\', '/')
                filename = os.path.basename(suggested)
                if filename and filename not in ('.', '..'):
                    save_path = os.path.join(os.path.dirname(save_path), filename)

            with open(save_path, 'wb') as f:
                for chunk in response.iter_content(8192):
                    f.write(chunk)

        print(f"✅ Saved: {os.path.relpath(save_path)}")
        return True

    except Exception as e:
        # Best-effort boundary: one failed document must not stop the run.
        print(f"❌ Failed to download {os.path.basename(url)}: {str(e)}")
        return False

def scrape_gov_site(base_url, max_results=10):
    """Collect candidate PDF links from the page at *base_url*.

    Sleeps for DELAY seconds, fetches the page, resolves every ``<a href>``
    against *base_url*, and keeps the links accepted by ``is_gov_pdf``.

    Args:
        base_url: Page to scan.
        max_results: Maximum number of links to return (a non-negative
            int), or None for no limit. Defaults to 10.

    Returns:
        list[str]: Unique absolute URLs in sorted order, truncated to
        *max_results*. An empty list is returned when the request or
        parsing fails (the error is printed, not raised).
    """
    try:
        time.sleep(DELAY)  # pause before each page request
        # NOTE(review): verify=False turns off TLS certificate validation;
        # kept to preserve existing behaviour.
        response = requests.get(base_url, headers=HEADERS, verify=False, timeout=30)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, 'html.parser')

        # Resolve relative hrefs first so the set de-duplicates on the
        # absolute URL.
        absolute_links = (
            urljoin(base_url, anchor['href'])
            for anchor in soup.find_all('a', href=True)
        )
        pdf_links = {href for href in absolute_links if is_gov_pdf(href)}

        # Sorting makes the truncated selection deterministic.
        return sorted(pdf_links)[:max_results]

    except Exception as e:
        # Best-effort boundary: a failing page yields no links.
        print(f"⚠️ Scraping error at {base_url}: {str(e)}")
        return []

def main():
    """Run the full collection: create folders, scan each page, download.

    For every category in CATEGORIES, each configured page is scanned with
    ``scrape_gov_site`` and every returned link is downloaded into
    BASE_DIR/<category>, pausing DELAY seconds after each download.
    """
    print("\n=== Government Document Collector ===")
    print(f"📂 Root directory: {os.path.abspath(BASE_DIR)}\n")
    setup_project_folders()

    for category, urls in CATEGORIES.items():
        print(f"\n🔍 Processing {category.upper()} documents:")
        target_dir = os.path.join(BASE_DIR, category)

        for url in urls:
            print(f"   - Scanning: {url}")
            found = scrape_gov_site(url)

            if not found:
                print("   ⚠️ No PDFs found using standard methods")
                continue

            for pdf_url in found:
                # Name the file after the last URL path segment, ignoring
                # any query string; fall back to a timestamped name when
                # the URL ends in "/".
                url_without_query = pdf_url.split('?')[0]
                raw_name = os.path.basename(url_without_query)
                if not raw_name:
                    raw_name = f"{category}_{int(time.time())}.pdf"

                # Anything other than word characters, "-" and "." becomes "_".
                safe_name = re.sub(r'[^\w\-.]', '_', raw_name)

                download_gov_pdf(pdf_url, os.path.join(target_dir, safe_name))
                time.sleep(DELAY)

# Entry point: run the collector only when this module is the program being
# executed (its __name__ is "__main__"), not when it is imported.
if __name__ == "__main__":
    main()
    # Printed once main() has returned.
    print("\n🏁 Collection complete! Files saved to project's data/raw folder")


=== Government Document Collector ===
📂 Root directory: /Users/gwin/Documents/Post Undergrad Work/Tax Search/data/raw

📁 Created: data/raw/healthcare
📁 Created: data/raw/defense

🔍 Processing HEALTHCARE documents:
   - Scanning: https://www.cms.gov/about-cms/agency-information/history
✅ Saved: data/raw/healthcare/Admin Tenure Bio 508 July 2015.pdf
✅ Saved: data/raw/healthcare/BushSignMMA2003.pdf
✅ Saved: data/raw/healthcare/CMS35thAnniversary.pdf
✅ Saved: data/raw/healthcare/CMSInBaltimore.pdf
✅ Saved: data/raw/healthcare/CMSPresidentsSpeeches.pdf
✅ Saved: data/raw/healthcare/MedicareMedicaidMilestones2015_508.pdf
✅ Saved: data/raw/healthcare/PresidentCMSMilestones.pdf
✅ Saved: data/raw/healthcare/QUIZ08.pdf
✅ Saved: data/raw/healthcare/Agent-Broker-Help-Desks Updates_051024_0.pdf
   - Scanning: https://www.medicaid.gov/about-us/reports-and-evaluations/index.html
⚠️ Scraping error at https://www.medicaid.gov/about-us/reports-and-evaluations/index.html: 404 Client Error: Not Found for 