In [21]:
import re
import os
import requests
from bs4 import BeautifulSoup
import pandas as pd

# === CONFIGURATION ===
# EUR-Lex NIM page of the directive to scrape. CHANGE THIS for every run!
# The CELEX identifier and the matching "/ALL/" page URL are derived from it.
eurlex_url = "https://eur-lex.europa.eu/legal-content/EN/NIM/?uri=CELEX:32003L0096"

# Set to True if the directive is an amendment; the value is copied as-is
# into the "Amending" column of every output row.
ammend = False

# Folder that receives the generated "transposition_<CELEX>.csv" file.
output_dir = "/Users/nicolomarchini/Documents/Università/Magistrale/Tesi Magistrale/Directives"

# === UTILITY FUNCTIONS ===
def standardize_date(date_str):
    """Normalise a date string to ISO ``YYYY-MM-DD`` where possible.

    ``DD/MM/YYYY`` input is rewritten to ``YYYY-MM-DD``; day and month may be
    written without a leading zero and are zero-padded in the result.
    Anything else, including text that is already ISO formatted, is returned
    stripped of surrounding whitespace but otherwise unchanged.

    Args:
        date_str: Raw date text; may be ``None`` or empty.

    Returns:
        The normalised date, the stripped input when the format is not
        recognised, or ``""`` for falsy input.
    """
    if not date_str:
        return ""
    date_str = date_str.strip()
    match = re.fullmatch(r"(\d{1,2})/(\d{1,2})/(\d{4})", date_str)
    if match is None:
        # Already ISO, or free text such as "See Art 28.1": pass through.
        return date_str
    day, month, year = match.groups()
    return f"{year}-{int(month):02d}-{int(day):02d}"

def get_celex_from_url(url):
    """Return the CELEX identifier embedded in a EUR-Lex URL.

    The identifier is the run of letters and digits that follows ``CELEX:``
    (matched case-insensitively). The URL is percent-decoded first, so links
    that carry the colon as ``%3A`` are recognised as well.

    Args:
        url: URL text expected to contain ``CELEX:<identifier>``.

    Returns:
        The identifier exactly as written in the URL, or ``""`` when the URL
        contains no CELEX reference.
    """
    # Imported locally so the function relies on nothing beyond the
    # module's existing top-level imports.
    from urllib.parse import unquote

    m = re.search(r'CELEX:([0-9A-Z]+)', unquote(url), re.I)
    return m.group(1) if m else ""

def extract_dl_metadata(soup):
    """Collect label/value pairs from every ``<dl class="NMetadata">`` list.

    Within each list the ``<dt>`` and ``<dd>`` elements are paired by
    position. The label is the ``<dt>`` text with all colons removed. The
    value is the comma-joined text of the ``<span>`` elements inside the
    ``<dd>`` when there are any, otherwise the whole ``<dd>`` text; in both
    cases only the part before the first ``;`` is kept.

    A label seen more than once keeps its first value, except for
    "Date of transposition", whose distinct values are accumulated into one
    ", "-separated string.

    Args:
        soup: Parsed page offering BeautifulSoup's ``find_all`` API.

    Returns:
        dict mapping each label to its value string.
    """
    accumulating_labels = {"Date of transposition"}
    collected = {}
    for definition_list in soup.find_all("dl", class_="NMetadata"):
        pairs = zip(definition_list.find_all("dt"), definition_list.find_all("dd"))
        for term, definition in pairs:
            key = term.get_text(strip=True).replace(":", "")

            # Prefer the human-readable <span> text when the <dd> has any.
            inner_spans = definition.find_all("span")
            if inner_spans:
                text = ", ".join(s.get_text(strip=True) for s in inner_spans)
            else:
                text = definition.get_text(" ", strip=True)
            text = text.split(";")[0].strip()

            if key not in collected:
                # First occurrence always wins the slot.
                collected[key] = text
                continue
            # Repeated label: only accumulating labels gain further values,
            # and only values that are not already listed.
            if key in accumulating_labels and text not in collected[key].split(", "):
                collected[key] += ", " + text
    return collected

# === 1. DOWNLOAD BOTH PAGES ===
celex = get_celex_from_url(eurlex_url)
nim_url = eurlex_url
all_url = eurlex_url.replace('/NIM/', '/ALL/')

headers = {'User-Agent': 'Mozilla/5.0'}
# requests waits indefinitely unless a timeout is given; bound every call
# (seconds) so a stalled connection cannot hang the run.
request_timeout = 30

# Download NIM page (transposition)
nim_response = requests.get(nim_url, headers=headers, timeout=request_timeout)
# Stop on HTTP 4xx/5xx instead of silently scraping an error page, which
# would produce a CSV that wrongly reports no measures for any country.
nim_response.raise_for_status()
nim_html = nim_response.text
nim_soup = BeautifulSoup(nim_html, "html.parser")

# Download ALL page (document metadata)
all_response = requests.get(all_url, headers=headers, timeout=request_timeout)
all_response.raise_for_status()
all_html = all_response.text
all_soup = BeautifulSoup(all_html, "html.parser")

# === 2. EXTRACT METADATA FROM ALL PAGE ===
# These label/value pairs are later merged into every output row.
all_metadata = extract_dl_metadata(all_soup)
print("Extracted document metadata:")
for k, v in all_metadata.items():
    print(f"  {k}: {v}")

# === 3. EXTRACT TRANSPOSITION MEASURES FROM NIM PAGE ===
# Three-letter country code -> display name. The codes are compared with the
# prefix of the "<code>_ntm" CSS classes found on the NIM page, and the
# mapping also drives the list of countries reported as having no measures.
eu_countries = dict(
    AUT="Austria",
    BEL="Belgium",
    BGR="Bulgaria",
    HRV="Croatia",
    CYP="Cyprus",
    CZE="Czechia",
    DNK="Denmark",
    EST="Estonia",
    FIN="Finland",
    FRA="France",
    DEU="Germany",
    GRC="Greece",
    HUN="Hungary",
    IRL="Ireland",
    ITA="Italy",
    LVA="Latvia",
    LTU="Lithuania",
    LUX="Luxembourg",
    MLT="Malta",
    NLD="Netherlands",
    POL="Poland",
    PRT="Portugal",
    ROU="Romania",
    SVK="Slovakia",
    SVN="Slovenia",
    ESP="Spain",
    SWE="Sweden",
)

# Build one output row per measure listed on the NIM page. Every <li> whose
# CSS class ends in "_ntm" is treated as one measure entry.
data = []
li_list = nim_soup.find_all("li", class_=lambda x: x and x.endswith("_ntm"))
# Codes of all countries that had at least one "_ntm" entry on the page.
found_country_codes = set()
for li in li_list:
    li_classes = li.get("class")
    # The country code is the text before the first underscore of the
    # "_ntm" class; unknown codes fall back to the code itself as the name.
    country_code = [c.split("_")[0] for c in li_classes if c.endswith("_ntm")][0]
    country_name = eu_countries.get(country_code, country_code)
    # NOTE(review): the code is recorded here even when no title is found
    # below, so a country whose entries all lack a "titleLink" anchor gets
    # neither a measure row nor a "no measures" row later on - confirm that
    # this is intended.
    found_country_codes.add(country_code)

    # DEADLINE (try several locations)
    # Attempt 1: scan the previous siblings of the <li>, then those of each
    # ancestor in turn, for a <div> with class "transposition" whose text
    # contains "Transposition deadline(s): <digits, slashes, commas, spaces>".
    deadline = None
    search_node = li
    while search_node:
        prev = search_node.previous_sibling
        found = False
        while prev:
            # Text nodes report name None, so the .get call only runs on divs.
            if getattr(prev, "name", None) == "div" and "transposition" in prev.get("class", []):
                m = re.search(r"Transposition deadline[s]*:\s*([0-9/, ]+)", prev.get_text())
                if m:
                    deadline = m.group(1).strip()
                    found = True
                    break
            prev = prev.previous_sibling
        if found:
            break
        search_node = search_node.parent
        # Stop climbing at the ancestor whose id equals the country code.
        # The lambda fallback yields None when the node has no .get (for
        # example when the parent is None), which then ends the outer loop.
        if getattr(search_node, "get", lambda x: None)("id") == country_code:
            break

    if not deadline:
        # Attempt 2: "<code>_transp" div inside the enclosing country panel;
        # only the first comma-separated deadline is kept here.
        parent_panel = li.find_parent("div", class_="panel-collapse countryPanel")
        if parent_panel:
            deadline_div = parent_panel.find("div", id=f"{country_code}_transp")
            if deadline_div:
                m = re.search(r"Transposition deadline[s]*:\s*([0-9/, ]+)", deadline_div.get_text())
                if m:
                    deadline = m.group(1).split(",")[0].strip()
        if not deadline:
            # Attempt 3: "<code>_transposition" div, first comma-separated
            # part of its raw text.
            # NOTE(review): find() returns only the first "col-sm-12 ntmRow"
            # div of the whole document, not one related to this <li> -
            # verify that the target div can actually be reached from it.
            row_div = nim_soup.find("div", class_="col-sm-12 ntmRow")
            if row_div:
                deadline_div = row_div.find("div", id=f"{country_code}_transposition")
                if deadline_div:
                    deadline = deadline_div.get_text(strip=True).split(",")[0].strip()

    # Measure title: text of the anchor with id "titleLink", if present.
    title_tag = li.find("a", id="titleLink")
    measure_title = title_tag.text.strip() if title_tag else None

    # Publication date: first try "Publication date: <digits/hyphens>" in
    # the flattened <li> text.
    publication_date = None
    pub_txt = li.get_text(" ", strip=True)
    m = re.search(r'Publication date:\s*([\d-]+)', pub_txt)
    if m:
        publication_date = standardize_date(m.group(1))
    else:
        # Fallback: an <em> whose parent text mentions "Publication date".
        # There is no break, so the last matching <em> wins.
        for em in li.find_all("em"):
            if "Publication date" in em.parent.text:
                publication_date = standardize_date(em.text.strip())

    # Normalise each comma-separated deadline and re-join them with ", ".
    deadline_std = ""
    if deadline:
        deadlines = [standardize_date(x) for x in deadline.split(",")]
        deadline_std = ", ".join(deadlines)

    # Append measure row
    # Entries without a title are skipped entirely.
    if measure_title:
        row = {
            "CELEX": celex,
            "Country Code": country_code,
            "Country": country_name,
            "Transposition Deadline": deadline_std,
            "Measure Title": measure_title,
            "Publication Date": publication_date,
            "Amending": ammend,

        }
        # Document-level metadata is merged last, so a metadata label equal
        # to one of the keys above would overwrite that value.
        row.update(all_metadata)
        data.append(row)

# === 4. COUNTRIES WITH NO MEASURES ===
# Every country that never appeared among the "_ntm" entries still gets one
# placeholder row, with empty measure fields plus the document metadata.
for code, name in eu_countries.items():
    if code in found_country_codes:
        continue
    data.append({
        "CELEX": celex,
        "Country Code": code,
        "Country": name,
        "Transposition Deadline": "",
        "Measure Title": "",
        "Publication Date": "",
        "Amending": ammend,
        # Merged last, exactly like dict.update(): metadata labels that
        # collide with a key above replace its value.
        **all_metadata,
    })

# === 5. TO DATAFRAME/CSV ===
# One row per measure (or per country without measures), ordered by country,
# then deadline, then title.
df = pd.DataFrame(data)
df.sort_values(by=["Country Code", "Transposition Deadline", "Measure Title"], inplace=True)

output_filename = f"transposition_{celex}.csv"
output_path = os.path.join(output_dir, output_filename)
# to_csv does not create missing folders; make sure the target directory
# exists so the scrape is not lost to an OSError at the very last step.
os.makedirs(output_dir, exist_ok=True)
df.to_csv(output_path, index=False)
print(f"Saved to {output_path}")
print(df.head(15))

Extracted document metadata:
  Date of document: 27/10/2003
  Date of effect: Entry into force, Date pub. See Art 31
  Date of transposition: At the latest See Art 28.1
  Date of end of validity: No end date
  Author: Council of the European Union
  Responsible body: Directorate-General for Taxation and Customs Union
  Form: Directive
  Addressee: The fifteen Member States: Belgium, Denmark, Germany, Ireland, Greece, Spain, France, Italy, Luxembourg, Netherlands, Austria, Portugal, Finland, Sweden, United Kingdom
  Additional information: CNS 1997/0111, EEA relevance
  Procedure number: 1997/0111/CNS
  Link: European Parliament - Legislative observatory ​
  Treaty: Treaty establishing the European Community
  Legal basis: 12002E093
  Proposal: 51997PC0030
  Modifies: Relation Act Comment Subdivision concerned From To Repeal 31992L0081 Repeal 31992L0082
  Modified by: Relation Act Comment Subdivision concerned From To Corrected by 32003L0096R(01) (ET) Corrected by 32003L0096R(02) (ET) C