# Policy Document Processing

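This notebook downloads agricultural policy documents (PDFs) for Flanders and France and converts them into Markdown format for downstream LLM processing. Automated download for Wallonia is not implemented yet: that step only creates the Wallonia output folder, although any PDFs placed there manually are still converted.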

In [None]:
import os
import requests
import time
import re
from bs4 import BeautifulSoup
from urllib.parse import urljoin, unquote, urlparse
from pypdf import PdfReader

# Configuration: every artefact lives under DATA_DIR, split by file type.
DATA_DIR = "../data"
PDF_DIR, MD_DIR = (os.path.join(DATA_DIR, sub) for sub in ("pdfs", "markdown"))

# Browser-like User-Agent header value sent with the HTTP requests below.
USER_AGENT = (
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/120.0.0.0 Safari/537.36"
)

# Create both output directories up front; existing ones are left untouched.
os.makedirs(PDF_DIR, exist_ok=True)
os.makedirs(MD_DIR, exist_ok=True)

## 1. Helper Functions

In [None]:
def sanitize_filename(text):
    """Return a filesystem-friendly ``.pdf`` filename derived from ``text``.

    A trailing ``.pdf`` (any letter case) is removed first. Every character
    that is neither alphanumeric nor one of space, ``.``, ``_`` or ``-`` is
    then discarded, surrounding whitespace is trimmed, remaining spaces become
    underscores, and ``.pdf`` is appended.
    """
    allowed_extra = " ._-"
    stem = text[:-4] if text.lower().endswith('.pdf') else text
    kept = [ch for ch in stem if ch.isalnum() or ch in allowed_extra]
    return "{}.pdf".format("".join(kept).strip().replace(" ", "_"))

def download_file(url, folder, filename=None):
    """Download ``url`` into ``folder`` as a PDF and return the local path.

    Args:
        url: Address of the document to fetch.
        folder: Existing directory the file is written to.
        filename: Optional target name. When omitted, the name is the
            URL-decoded last segment of the URL path, reduced to its base
            name so an encoded separator (``%2F``) or ``..`` cannot place the
            file outside ``folder``. ``.pdf`` is appended when missing.

    Returns:
        The path of the downloaded (or already present) file, or ``None``
        when the download failed. Failures are printed, never raised.
    """
    part_path = None
    try:
        if not filename:
            decoded = unquote(urlparse(url).path.split('/')[-1])
            # Keep only the final component of the decoded name; fall back to
            # a visible default instead of producing the hidden file ".pdf".
            filename = os.path.basename(decoded.replace('\\', '/')) or "download"

        if not filename.lower().endswith('.pdf'):
            filename += ".pdf"

        local_path = os.path.join(folder, filename)

        if os.path.exists(local_path):
            print(f"Skipping existing: {filename}")
            return local_path

        print(f"Downloading: {url}...")
        headers = {"User-Agent": USER_AGENT}
        # Stream into a temporary sibling file first: an interrupted transfer
        # must not leave a truncated PDF that later runs skip as "existing".
        part_path = local_path + ".part"
        with requests.get(url, stream=True, headers=headers, timeout=30) as r:
            r.raise_for_status()
            with open(part_path, 'wb') as f:
                for chunk in r.iter_content(chunk_size=8192):
                    f.write(chunk)
        os.replace(part_path, local_path)
        print(f"Saved to {local_path}")
        time.sleep(1)  # Be polite
        return local_path
    except Exception as e:
        # Deliberate best-effort boundary: callers rely on None, not a raise.
        print(f"Error downloading {url}: {e}")
        return None
    finally:
        # Remove a leftover partial file after any failure or interruption.
        if part_path and os.path.exists(part_path):
            try:
                os.remove(part_path)
            except OSError:
                pass

## 2. Downloaders by Region

In [None]:
def scrape_flanders(max_docs=6):
    """Download PDFs linked from the Flemish policy page.

    Fetches the main page (switching to an alternative address when the
    first one answers 404), collects ``<a href>`` links whose URL path ends
    in ``.pdf`` and passes each distinct link to ``download_file``, storing
    the files under ``PDF_DIR/flanders``.

    Args:
        max_docs: Maximum number of distinct PDF links to process. The
            default of 6 matches the previous hard-coded limit.

    Errors are printed rather than raised so the remaining steps still run.
    """
    print("\n--- Starting Flanders Download ---")
    folder = os.path.join(PDF_DIR, "flanders")
    os.makedirs(folder, exist_ok=True)

    target_url = "https://www.vlaanderen.be/landbouw/glb2023"
    headers = {"User-Agent": USER_AGENT}

    try:
        print(f"Scraping {target_url}...")
        # A timeout keeps an unresponsive server from blocking the notebook.
        r = requests.get(target_url, headers=headers, timeout=30)
        if r.status_code == 404:
            print("Hit 404 on main link. Trying variants...")
            target_url = "https://lv.vlaanderen.be/nl/glb-2023-2027"
            r = requests.get(target_url, headers=headers, timeout=30)
        # Do not parse an error page (403, 500, ...) as if it were the listing.
        r.raise_for_status()

        soup = BeautifulSoup(r.text, 'html.parser')
        # Resolve relative links against the final URL after any redirects.
        base_url = r.url

        count = 0
        seen = set()
        for link in soup.find_all('a', href=True):
            if count >= max_docs:
                break
            full_url = urljoin(base_url, link['href'])
            # Test the path only, so "file.pdf?v=2" or "file.pdf#p3" match too.
            if not urlparse(full_url).path.lower().endswith('.pdf'):
                continue
            if full_url in seen:
                continue
            seen.add(full_url)
            download_file(full_url, folder)
            count += 1

        print(f"Flanders: Found {count} docs")

    except Exception as e:
        print(f"Flanders Error: {e}")

def scrape_wallonia():
    """Placeholder step for Wallonia: no download is attempted.

    Only creates ``PDF_DIR/wallonia`` (if missing) and prints a message
    saying the automated download is skipped.
    """
    print("\n--- Starting Wallonia Download ---")
    os.makedirs(os.path.join(PDF_DIR, "wallonia"), exist_ok=True)
    print("Wallonia: Automated download requires authenticated/complex portal access. Skipping.")

def scrape_france(max_docs=6):
    """Download French policy PDFs, falling back to a known direct link.

    Each source page is scanned for ``<a href>`` links whose URL path ends
    in ``.pdf`` and whose URL contains "psn" or "guide" (case-insensitive).
    Matching links are passed to ``download_file`` and stored under
    ``PDF_DIR/france``. When no file could be obtained from any source page,
    the fallback URLs are tried in order until one succeeds.

    Args:
        max_docs: Maximum number of distinct PDF links processed per source
            page. The default of 6 matches the previous hard-coded limit.

    Errors on a source page are printed and the next source is tried.
    """
    print("\n--- Starting France Download ---")
    folder = os.path.join(PDF_DIR, "france")
    os.makedirs(folder, exist_ok=True)

    sources = [
        "https://agriculture.gouv.fr/pac-2023-2027-les-aides-du-plan-strategique-national-psn",
        "https://www.telepac.agriculture.gouv.fr/telepac/html/public/aides/formulaires-2023.html",
    ]

    # Fallback to direct known PDF
    fallback_urls = [
        "https://agriculture.gouv.fr/sites/default/files/psn-pac-2023-2027-valide_ce-31082022.pdf",
    ]

    headers = {"User-Agent": USER_AGENT}
    keywords = ("psn", "guide")
    downloaded_any = False

    for url in sources:
        try:
            print(f"Scraping {url}...")
            # TLS verification stays enabled: download_file verifies as well,
            # so links taken from an unverified page would gain nothing and
            # could be tampered with. The timeout prevents an endless hang.
            r = requests.get(url, headers=headers, timeout=30)
            # Do not parse an error page as if it were the document listing.
            r.raise_for_status()
            soup = BeautifulSoup(r.text, 'html.parser')

            count = 0
            seen = set()
            for link in soup.find_all('a', href=True):
                if count >= max_docs:
                    break
                # Resolve against the final URL after any redirects.
                full_url = urljoin(r.url, link['href'])
                lowered = full_url.lower()
                # Test the path only, so "file.pdf?v=2" still matches.
                if not urlparse(lowered).path.endswith('.pdf'):
                    continue
                if not any(keyword in lowered for keyword in keywords):
                    continue
                if full_url in seen:
                    continue
                seen.add(full_url)
                # Only a file that is really on disk suppresses the fallback.
                if download_file(full_url, folder):
                    downloaded_any = True
                count += 1
            print(f"France: Found {count} docs at {url}")
        except Exception as e:
            print(f"France Error: {e}")

    if not downloaded_any:
        print("Trying fallback URLs...")
        for url in fallback_urls:
            if download_file(url, folder):
                print("Fallback successful.")
                break

## 3. PDF to Markdown Conversion

In [None]:
def convert_pdf_to_markdown(filepath, output_dir):
    """Render one PDF as a Markdown file inside ``output_dir``.

    The output file is named after the PDF (``<stem>.md``). It starts with a
    title built from the stem (underscores shown as spaces) and a line naming
    the source file, followed by the text returned by
    ``pymupdf4llm.to_markdown``. Any exception, including a failed import of
    ``pymupdf4llm``, is printed and swallowed; nothing is returned.
    """
    try:
        filename = os.path.basename(filepath)
        name_no_ext, _extension = os.path.splitext(filename)
        output_path = os.path.join(output_dir, name_no_ext + ".md")

        # Imported inside the try block, so a missing package is reported
        # through the same error message as any other conversion failure.
        import pymupdf4llm

        print(f"Converting {filename} with pymupdf4llm...")
        body = pymupdf4llm.to_markdown(filepath)

        title = name_no_ext.replace('_', ' ')
        header = f"# {title}\n\n" + f"**Source File:** `{filename}`\n\n---\n\n"
        # Assemble the full document before opening the output file.
        final_content = header + body

        with open(output_path, 'w', encoding='utf-8') as out:
            out.write(final_content)

        print(f"Converted: {filename} -> {output_path}")
    except Exception as err:
        print(f"Error converting {filepath}: {err}")

def batch_convert():
    """Convert every downloaded PDF of every region to Markdown.

    For each region folder that exists under ``PDF_DIR``, the matching folder
    under ``MD_DIR`` is created and each file whose name ends in ``.pdf``
    (any letter case) is handed to ``convert_pdf_to_markdown``. Regions
    without a PDF folder are skipped.
    """
    print("\n--- Batch Conversion to Markdown (Improved) ---")
    regions = ('flanders', 'wallonia', 'france')
    for region in regions:
        source_dir = os.path.join(PDF_DIR, region)
        if os.path.exists(source_dir):
            target_dir = os.path.join(MD_DIR, region)
            os.makedirs(target_dir, exist_ok=True)

            pdf_names = [
                entry for entry in os.listdir(source_dir)
                if entry.lower().endswith('.pdf')
            ]
            for pdf_name in pdf_names:
                convert_pdf_to_markdown(os.path.join(source_dir, pdf_name), target_dir)

In [None]:
if __name__ == "__main__":
    # Run the three download steps first, then convert whatever is on disk.
    for step in (scrape_flanders, scrape_france, scrape_wallonia, batch_convert):
        step()