This is the **first script to run** in the workflow.  

# Scrape EUR-Lex Transposition Data for a Directive

This script downloads and processes **National Implementing Measures (NIMs)** for a specific EU directive from EUR-Lex.  
It extracts both **directive-level metadata** (from the `ALL` page) and **country-level measures** (from the `NIM` page), and produces a standardized CSV.

In [None]:
import re
import os
import requests
from bs4 import BeautifulSoup
import pandas as pd

# === CONFIGURATION ===
# URL of the directive's NIM (National Implementing Measures) page on EUR-Lex.
# Example provided: Directive 32003L0096. Replace with another NIM link if needed.
# The CELEX number is parsed from this URL, and the ALL (metadata) page URL is
# derived from it by replacing '/NIM/' with '/ALL/', so keep that path segment.
eurlex_url = 'https://eur-lex.europa.eu/legal-content/EN/NIM/?uri=CELEX:32003L0096'  

# Whether the directive is an amendment. The value is copied as is into the
# "Amending" column of every output row.
# NOTE: the variable is spelled `ammend` (sic); the rest of the script refers to
# it under this exact name.
ammend = False   

# Local directory where the final CSV will be saved.
# The file is written as "transposition_<CELEX>.csv" inside this directory.
output_dir = 'insert/your/path'  # ! Replace with your destination folder.

# === UTILITY FUNCTIONS ===
def standardize_date(date_str):
    """
    Standardize a date string into YYYY-MM-DD format.
    - DD/MM/YYYY input is reordered into YYYY-MM-DD.
    - Input already in YYYY-MM-DD form is returned as is (whitespace stripped).
    - Returns an empty string for empty/None input or when neither format matches.

    Only the textual layout is checked; calendar validity (e.g. day 31 in
    February) is not verified.
    """
    if not date_str:
        return ""
    date_str = date_str.strip()
    match = re.fullmatch(r"(\d{2})/(\d{2})/(\d{4})", date_str)  # format: DD/MM/YYYY
    if match:
        day, month, year = match.groups()
        return f"{year}-{month}-{day}"
    if re.fullmatch(r"\d{4}-\d{2}-\d{2}", date_str):  # format: YYYY-MM-DD
        return date_str
    # Unrecognised layout: return "" as documented, so that free text scraped
    # from the page never ends up in a date column.
    return ""

def get_celex_from_url(url):
    """
    Extract the CELEX number from a EUR-Lex URL.
    Example: "...CELEX:32003L0096" → "32003L0096".
    - Also accepts URLs whose colon is percent-encoded ("...CELEX%3A32003L0096"),
      which is how such links often look when copied from a browser.
    - The "CELEX" prefix is matched case-insensitively; the number itself is
      returned exactly as written in the URL.
    - Returns an empty string if url is empty/None or contains no CELEX reference.
    """
    from urllib.parse import unquote  # local import keeps the helper self-contained

    if not url:
        return ""
    # Decode percent-escapes first so that "%3A" is seen as ":" by the pattern.
    m = re.search(r'CELEX:([0-9A-Z]+)', unquote(url), re.I)
    return m.group(1) if m else ""

def extract_dl_metadata(soup):
    """
    Collect directive-level metadata from a parsed EUR-Lex ALL page.

    Every <dl class="NMetadata"> block is scanned; its <dt> elements are paired
    positionally with its <dd> elements (first <dt> with first <dd>, and so on).
    - The label is the <dt> text with every ":" removed.
    - The value is the comma-joined text of the <span> elements inside the <dd>
      when there are any, otherwise the whole <dd> text; only the part before
      the first ";" is kept.
    - "Date of transposition" may occur several times: distinct values are
      accumulated into one ", "-separated string.
    - For every other label the first occurrence wins.

    Returns a dict mapping label → value (both strings).
    """
    repeatable = {"Date of transposition"}  # labels whose values accumulate
    collected = {}

    for block in soup.find_all("dl", class_="NMetadata"):
        pairs = zip(block.find_all("dt"), block.find_all("dd"))
        for term, definition in pairs:
            key = term.get_text(strip=True).replace(":", "")

            span_texts = [s.get_text(strip=True) for s in definition.find_all("span")]
            if span_texts:
                raw = ", ".join(span_texts)
            else:
                raw = definition.get_text(" ", strip=True)
            # Anything after the first ";" is dropped.
            text = raw.split(";")[0].strip()

            if key not in collected:
                # First sighting of this label, repeatable or not.
                collected[key] = text
                continue
            if key not in repeatable:
                # Unique field already recorded: later occurrences are ignored.
                continue
            if text not in collected[key].split(", "):
                collected[key] += ", " + text

    return collected

# === 1. DOWNLOAD NIM + ALL PAGES ===
# Extract CELEX code from URL for naming outputs
celex = get_celex_from_url(eurlex_url)

# Define NIM (transposition measures) and ALL (directive metadata) URLs.
# The ALL URL is the NIM URL with the '/NIM/' path segment swapped for '/ALL/'.
nim_url = eurlex_url
all_url = eurlex_url.replace('/NIM/', '/ALL/')

# Set headers to mimic a browser (avoid blocking by EUR-Lex)
headers = {'User-Agent': 'Mozilla/5.0'}

# requests has no default timeout: without one a stalled connection would hang
# the script forever.
request_timeout = 60  # seconds

# Download the NIM page (country-level transposition info).
# raise_for_status() stops the run on an HTTP error (4xx/5xx) instead of
# silently parsing an error page and producing a CSV without any measures.
nim_response = requests.get(nim_url, headers=headers, timeout=request_timeout)
nim_response.raise_for_status()
nim_html = nim_response.text
nim_soup = BeautifulSoup(nim_html, "html.parser")

# Download the ALL page (general directive metadata)
all_response = requests.get(all_url, headers=headers, timeout=request_timeout)
all_response.raise_for_status()
all_html = all_response.text
all_soup = BeautifulSoup(all_html, "html.parser")

# === 2. EXTRACT METADATA FROM ALL PAGE ===
# Parse the directive-level fields once; the same values apply to all countries.
all_metadata = extract_dl_metadata(all_soup)

# Echo what was found so the user can check the scrape at a glance.
print("Extracted document metadata:")
for label in all_metadata:
    print(f"  {label}: {all_metadata[label]}")

# === 3. EXTRACT NATIONAL MEASURES FROM NIM PAGE ===
# The 27 EU member states, keyed by three-letter country code (code → name).
# The United Kingdom is left out because it is no longer an EU member.
eu_countries = {
    "AUT": "Austria",
    "BEL": "Belgium",
    "BGR": "Bulgaria",
    "HRV": "Croatia",
    "CYP": "Cyprus",
    "CZE": "Czechia",
    "DNK": "Denmark",
    "EST": "Estonia",
    "FIN": "Finland",
    "FRA": "France",
    "DEU": "Germany",
    "GRC": "Greece",
    "HUN": "Hungary",
    "IRL": "Ireland",
    "ITA": "Italy",
    "LVA": "Latvia",
    "LTU": "Lithuania",
    "LUX": "Luxembourg",
    "MLT": "Malta",
    "NLD": "Netherlands",
    "POL": "Poland",
    "PRT": "Portugal",
    "ROU": "Romania",
    "SVK": "Slovakia",
    "SVN": "Slovenia",
    "ESP": "Spain",
    "SWE": "Sweden",
}

data = []  # list of rows that will form the dataset
# Each <li> with class ending in "_ntm" corresponds to a national transposition measure
li_list = nim_soup.find_all("li", class_=lambda x: x and x.endswith("_ntm"))
# Countries for which at least one measure row was actually written. A country
# is only registered once a row is appended, so a country whose <li> entries all
# lack a title still receives its placeholder row in the "no measures" step.
found_country_codes = set()

# The patterns do not change between iterations, so compile them once.
deadline_re = re.compile(r"Transposition deadline[s]*:\s*([0-9/, ]+)")
# Accept both dash- and slash-separated dates: with dashes only, a value such as
# "01/01/2004" would be truncated to "01".
pub_date_re = re.compile(r'Publication date:\s*([\d/-]+)')

for li in li_list:
    # Extract country code (e.g., "FRA" from "FRA_ntm") and map to country name.
    # Codes that are not in eu_countries keep the raw code as their name.
    li_classes = li.get("class")
    country_code = [c.split("_")[0] for c in li_classes if c.endswith("_ntm")][0]
    country_name = eu_countries.get(country_code, country_code)

    # --- Extract transposition deadline ---
    # The deadline may appear in different places, so try several strategies.
    # Strategy 1: starting at the <li>, scan the preceding siblings for a
    # <div class="transposition">; if none matches, repeat one level up.
    deadline = None
    search_node = li
    while search_node:
        prev = search_node.previous_sibling
        found = False
        while prev:
            # Text nodes have no tag name, so they are skipped by the name test.
            if getattr(prev, "name", None) == "div" and "transposition" in prev.get("class", []):
                m = deadline_re.search(prev.get_text())
                if m:
                    deadline = m.group(1).strip()
                    found = True
                    break
            prev = prev.previous_sibling
        if found:
            break
        # Move upward in the DOM if not found
        search_node = search_node.parent
        # Stop climbing at the element whose id is the country code. The getattr
        # default covers a missing parent (None), which has no .get method.
        if getattr(search_node, "get", lambda x: None)("id") == country_code:
            break

    # Strategy 2: inside the country panel (only the first listed date is kept).
    if not deadline:
        parent_panel = li.find_parent("div", class_="panel-collapse countryPanel")
        if parent_panel:
            deadline_div = parent_panel.find("div", id=f"{country_code}_transp")
            if deadline_div:
                m = deadline_re.search(deadline_div.get_text())
                if m:
                    deadline = m.group(1).split(",")[0].strip()
    # Strategy 3 (last fallback): inside the generic row section.
    if not deadline:
        row_div = nim_soup.find("div", class_="col-sm-12 ntmRow")
        if row_div:
            deadline_div = row_div.find("div", id=f"{country_code}_transposition")
            if deadline_div:
                deadline = deadline_div.get_text(strip=True).split(",")[0].strip()

    # --- Extract measure title (name of national implementing act) ---
    title_tag = li.find("a", id="titleLink")
    measure_title = title_tag.text.strip() if title_tag else None

    # --- Extract publication date ---
    publication_date = None
    pub_txt = li.get_text(" ", strip=True)
    m = pub_date_re.search(pub_txt)
    if m:
        publication_date = standardize_date(m.group(1))
    else:
        # Sometimes stored in <em> tags; if several qualify, the last one wins.
        for em in li.find_all("em"):
            if "Publication date" in em.parent.text:
                publication_date = standardize_date(em.text.strip())

    # --- Standardize deadline (multiple deadlines → join) ---
    deadline_std = ""
    if deadline:
        deadlines = [standardize_date(x) for x in deadline.split(",")]
        # Drop empty fragments (e.g. from a trailing comma) so the joined
        # string never contains dangling separators.
        deadline_std = ", ".join(d for d in deadlines if d)

    # --- Append row to dataset if measure exists ---
    if measure_title:
        row = {
            "CELEX": celex,
            "Country Code": country_code,
            "Country": country_name,
            "Transposition Deadline": deadline_std,
            "Measure Title": measure_title,
            "Publication Date": publication_date,
            "Amending": ammend,
        }
        # Add directive-level metadata to row
        row.update(all_metadata)
        data.append(row)
        found_country_codes.add(country_code)

# === 4. ADD COUNTRIES WITH NO MEASURES ===
# Every EU country that is absent from found_country_codes gets one placeholder
# row: blank deadline, title and publication date, plus the directive metadata.
data.extend(
    {
        "CELEX": celex,
        "Country Code": missing_code,
        "Country": missing_name,
        "Transposition Deadline": "",
        "Measure Title": "",
        "Publication Date": "",
        "Amending": ammend,
        **all_metadata,
    }
    for missing_code, missing_name in eu_countries.items()
    if missing_code not in found_country_codes
)

# === 5. SAVE DATASET TO CSV ===
# Convert to DataFrame, sort for consistency, and save to disk.
df = pd.DataFrame(data)
df = df.sort_values(by=["Country Code", "Transposition Deadline", "Measure Title"])

# Create the destination folder if needed, so the run does not fail at the very
# last step after all pages were downloaded and parsed. An empty output_dir
# means "current directory" and needs no creation.
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

output_filename = f"transposition_{celex}.csv"
output_path = os.path.join(output_dir, output_filename)
df.to_csv(output_path, index=False)

print(f"Saved to {output_path}")
print(df.head(15))