In [None]:
import os

# Root folder of the project on the notebook host's filesystem.
base_path = "/content/ECB_project"

# Sub-directories (relative to base_path) that the notebook writes into.
folders = ["data/raw_html", "data/raw_text", "data/processed", "src", "logs"]

# exist_ok=True keeps this cell safe to re-run when the folders already exist.
for folder in folders:
    target_dir = os.path.join(base_path, folder)
    os.makedirs(target_dir, exist_ok=True)


In [None]:
!pip install requests beautifulsoup4 pandas



In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime

In [None]:
# One record per press conference to scrape: meeting date (ISO string) + page URL.
data = [
    {
        "meeting_date": "2025-12-18",
        "url": "https://www.ecb.europa.eu/press/press_conference/monetary-policy-statement/2025/html/ecb.is251218~3a10402adb.en.html",
    },
]

# Build the index and parse the date strings into datetime64 in a single step.
url_index = pd.DataFrame(data).assign(
    meeting_date=lambda frame: pd.to_datetime(frame["meeting_date"])
)
url_index



Unnamed: 0,meeting_date,url
0,2025-12-18,https://www.ecb.europa.eu/press/press_conferen...


In [None]:
# Persist the URL index as CSV; index=False leaves out the row-number column.
url_index_path = "/content/ECB_project/data/processed/ecb_url_index.csv"
url_index.to_csv(path_or_buf=url_index_path, index=False)

print("URL index saved!")


URL index saved!


In [None]:
# Send a browser-like User-Agent with the request.
headers = {"User-Agent": "Mozilla/5.0"}

# Only the first row of the index is downloaded in this cell.
url = url_index.loc[0, "url"]

response = requests.get(url, headers=headers, timeout=30)
response.raise_for_status()  # stop here on any 4xx/5xx answer

# Decode the raw bytes explicitly as UTF-8. `response.encoding` only influences
# `response.text`, which this cell never reads, so the former
# `response.encoding = response.apparent_encoding` assignment was dead code that
# merely ran charset detection over the whole body. errors="replace" turns any
# undecodable byte into U+FFFD instead of raising.
html = response.content.decode("utf-8", errors="replace")
print("Downloaded HTML, length:", len(html))


Downloaded HTML, length: 137016


In [None]:
# The raw HTML file is named after the meeting date (YYYY-MM-DD).
date_str = url_index.loc[0, "meeting_date"].strftime("%Y-%m-%d")
html_path = f"/content/ECB_project/data/raw_html/{date_str}.html"

# Write the page exactly as decoded, using UTF-8 on disk.
with open(html_path, mode="w", encoding="utf-8") as html_file:
    html_file.write(html)

print("Raw HTML saved!")


Raw HTML saved!


In [None]:
soup = BeautifulSoup(html, "html.parser")

# Prefer the <article> nested in <main>; fall back to <main> itself.
main = soup.find("main")
article = None if main is None else main.find("article")
content_node = article or main

if content_node is None:
    raise ValueError("Could not find main/article content on the page")

# Strip non-content elements from the chosen node (this mutates `soup`).
for tag in content_node.find_all(["script", "style", "nav", "header", "footer", "aside"]):
    tag.decompose()

# One text fragment per line, each fragment stripped of surrounding whitespace.
text = content_node.get_text(separator="\n", strip=True)

print(text[:1200])


MONETARY POLICY STATEMENT
PRESS CONFERENCE
Christine Lagarde, President of the ECB,
Luis de Guindos, Vice-President of the ECB
Frankfurt am Main, 18 December 2025
Jump to the transcript of the questions and answers
Good afternoon, the Vice-President and I welcome you to our press conference.
The Governing Council today decided to keep the three key ECB interest rates unchanged. Our updated assessment reconfirms that inflation should stabilise at our two per cent target in the medium term.
The new Eurosystem staff projections show headline inflation averaging 2.1 per cent in 2025, 1.9 per cent in 2026, 1.8 per cent in 2027 and 2.0 per cent in 2028. For inflation excluding energy and food, staff project an average of 2.4 per cent in 2025, 2.2 per cent in 2026, 1.9 per cent in 2027 and 2.0 per cent in 2028. Inflation has been revised up for 2026, mainly because staff now expect services inflation to decline more slowly. Economic growth is expected to be stronger than in the September proj

In [None]:
# Lower-cased copy used only for case-insensitive searching.
text_lower = text.lower()

# Phrases (lower-case) that mark the hand-over from the statement to the Q&A.
markers = [
    "we are now ready to take your questions",
    "we are now ready for your questions",
    "we are now at your disposal for questions",
    "i will now take your questions",
]

# Use the first marker (in list order) that occurs anywhere in the text.
split_index = None
used_marker = None
for candidate in markers:
    position = text_lower.find(candidate)
    if position != -1:
        split_index = position
        used_marker = candidate
        break

if split_index is None:
    # No marker found: everything counts as statement, the Q&A stays empty.
    statement = text.strip()
    qa = ""
else:
    statement = text[:split_index].strip()

    # Start the Q&A right after the marker that was actually matched, plus one
    # closing punctuation mark if present. This removes the marker sentence for
    # all four markers regardless of capitalisation, and - unlike a global
    # str.replace - never deletes the same sentence further down the transcript.
    qa_start = split_index + len(used_marker)
    if text[qa_start:qa_start + 1] in (".", "!", "?"):
        qa_start += 1
    qa = text[qa_start:].strip()


# Case-sensitive strings that signal the start of page-footer material.
stop_markers = [
    "Related topics",
    "Disclaimer",
    "CONTACT",
    "Reproduction is permitted",
]

def trim_footer(text):
    """Cut `text` at the earliest footer marker and strip surrounding whitespace.

    Every entry of `stop_markers` is searched for (case-sensitively). The text is
    truncated at the smallest position found, so a marker that appears earlier in
    the text always wins, whatever its position in the `stop_markers` list.

    Args:
        text: The text to trim.

    Returns:
        The part of `text` before the earliest marker, stripped; or the whole
        stripped text when no marker occurs.
    """
    positions = [pos for pos in (text.find(marker) for marker in stop_markers) if pos != -1]
    if not positions:
        return text.strip()
    return text[:min(positions)].strip()

# Remove footer material from both parts.
statement, qa = trim_footer(statement), trim_footer(qa)

# Summary of the split.
print("Marker used:", used_marker)
print("Statement length:", len(statement))
print("Q&A length:", len(qa))

# First 600 characters of each part for a visual sanity check.
print("\nSTATEMENT PREVIEW:\n", statement[:600])
print("\nQ&A PREVIEW:\n", qa[:600])


Marker used: we are now ready to take your questions
Statement length: 9897
Q&A length: 21602

STATEMENT PREVIEW:
 MONETARY POLICY STATEMENT
PRESS CONFERENCE
Christine Lagarde, President of the ECB,
Luis de Guindos, Vice-President of the ECB
Frankfurt am Main, 18 December 2025
Jump to the transcript of the questions and answers
Good afternoon, the Vice-President and I welcome you to our press conference.
The Governing Council today decided to keep the three key ECB interest rates unchanged. Our updated assessment reconfirms that inflation should stabilise at our two per cent target in the medium term.
The new Eurosystem staff projections show headline inflation averaging 2.1 per cent in 2025, 1.9 per cent 

Q&A PREVIEW:
 * * *
Two questions, President Lagarde. The first one is about interest rates, and the question is: Are we still in a good place? Whenever the interest rate moves next time, is it more likely to go up or down? Do you have clarity on that?
The second question is about e

In [None]:

# Drop the in-page navigation link text from the statement.
statement = statement.replace("Jump to the transcript of the questions and answers", "").strip()

statement_path = f"/content/ECB_project/data/raw_text/{date_str}_statement.txt"
qa_path = f"/content/ECB_project/data/raw_text/{date_str}_qa.txt"

# Write each part to its own UTF-8 text file.
for out_path, out_text in ((statement_path, statement), (qa_path, qa)):
    with open(out_path, "w", encoding="utf-8") as f:
        f.write(out_text)

# One record describing this meeting's documents.
row = {
    "meeting_date": url_index.loc[0, "meeting_date"],
    "url": url,
    "statement_text_raw": statement,
    "qa_text_raw": qa,
    "statement_len": len(statement),
    "qa_len": len(qa),
    "split_marker_used": used_marker,
}

df = pd.DataFrame([row])

processed_path = "/content/ECB_project/data/processed/ecb_documents_raw.csv"
# Store the date as a plain YYYY-MM-DD string in the CSV.
df["meeting_date"] = pd.to_datetime(df["meeting_date"]).dt.strftime("%Y-%m-%d")

df.to_csv(processed_path, index=False)
import os, time

# Confirm on disk that the file was written just now.
print("processed_path =", processed_path)
print("exists?", os.path.exists(processed_path))
file_info = os.stat(processed_path)
print("file size:", file_info.st_size, "bytes")
print("modified time:", time.ctime(file_info.st_mtime))

# Read back as strings and show the stored date.
check = pd.read_csv(processed_path, dtype=str)
print("read-back meeting_date:", check.loc[0, "meeting_date"])

print("Saved:", statement_path)
print("Saved:", qa_path)

# Second read-back with pandas' default dtype inference.
check = pd.read_csv(processed_path)
print(check.loc[0, "meeting_date"])                # should be 2025-12-18
print(check.loc[0, "statement_text_raw"][:2000])    # should NOT contain â€™

# Show the first three physical lines of the CSV file.
with open(processed_path, "r", encoding="utf-8") as f:
    head_lines = [f.readline().strip() for _ in range(3)]
print("\n".join(head_lines))




processed_path = /content/ECB_project/data/processed/ecb_documents_raw.csv
exists? True
file size: 31753 bytes
modified time: Fri Dec 26 20:07:18 2025
read-back meeting_date: 2025-12-18
Saved: /content/ECB_project/data/raw_text/2025-12-18_statement.txt
Saved: /content/ECB_project/data/raw_text/2025-12-18_qa.txt
2025-12-18
MONETARY POLICY STATEMENT
PRESS CONFERENCE
Christine Lagarde, President of the ECB,
Luis de Guindos, Vice-President of the ECB
Frankfurt am Main, 18 December 2025

Good afternoon, the Vice-President and I welcome you to our press conference.
The Governing Council today decided to keep the three key ECB interest rates unchanged. Our updated assessment reconfirms that inflation should stabilise at our two per cent target in the medium term.
The new Eurosystem staff projections show headline inflation averaging 2.1 per cent in 2025, 1.9 per cent in 2026, 1.8 per cent in 2027 and 2.0 per cent in 2028. For inflation excluding energy and food, staff project an average of 2.

In [None]:
import re
import csv
import pandas as pd
import requests
from bs4 import BeautifulSoup

# HTTP headers sent with every download request (browser-like User-Agent).
HEADERS = {"User-Agent": "Mozilla/5.0"}

# A whole line that consists only of one of the two "…transcript of the
# questions and answers" navigation phrases (case-insensitive); such lines are
# removed before splitting.
JUNK_LINE_RE = re.compile(
    r"(?im)^\s*(jump to the transcript of the questions and answers|with the transcript of the questions and answers)\s*$"
)

# Line-level patterns (case-insensitive) used to locate where the Q&A starts:
# a line beginning with "transcript of the questions", a line that is only a
# "questions and answers" / "Q&A" heading, or a line beginning with "question:".
TRANSCRIPT_START_RE = re.compile(r"(?im)^\s*transcript of the questions.*$")
QNA_HEADING_RE = re.compile(r"(?im)^\s*(questions and answers|q\s*&\s*a|q&a)\s*$")
QUESTION_PREFIX_RE = re.compile(r"(?im)^\s*question\s*:")

# Lower-case spoken hand-over phrases, searched for anywhere in the text.
SPOKEN_MARKERS = [
    "we are now ready to take your questions",
    "we are now ready for your questions",
    "we are now at your disposal for questions",
    "i will now take your questions",
]

def download_html(url: str) -> str:
    """Download `url` and return its body decoded to text.

    The body is decoded as strict UTF-8 first (a leading BOM is dropped). Only
    when the bytes are not valid UTF-8 does the function fall back to the
    encoding guessed by `Response.apparent_encoding`. Guessing first can
    misidentify UTF-8 and produce mojibake such as "â€™".

    Args:
        url: Address of the page to fetch.

    Returns:
        The decoded HTML.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
    """
    r = requests.get(url, headers=HEADERS, timeout=40)
    r.raise_for_status()
    try:
        return r.content.decode("utf-8-sig")
    except UnicodeDecodeError:
        # Not UTF-8: let requests decode with the detected encoding.
        r.encoding = r.apparent_encoding
        return r.text

def extract_title(soup: BeautifulSoup) -> str:
    """Return the page title.

    The first <h1> is used when it has non-blank text, otherwise the <title>
    element; an empty string is returned when neither provides text.
    """
    for candidate in (soup.find("h1"), soup.title):
        if candidate and candidate.get_text(strip=True):
            return candidate.get_text(" ", strip=True)
    return ""

def extract_main_text(soup: BeautifulSoup) -> str:
    """Return the text of the page's main content area.

    The container is chosen in this order: <article> inside <main>, <main>,
    <body>, the whole document. script/style/nav/header/footer/aside elements
    inside the container are removed in place (so `soup` is modified), then the
    text is extracted with one fragment per line, carriage returns are turned
    into newlines and runs of three or more newlines are collapsed to two.
    """
    container = soup.find("main")
    if container:
        container = container.find("article") or container
    else:
        container = soup.body or soup

    for unwanted in container.find_all(["script", "style", "nav", "header", "footer", "aside"]):
        unwanted.decompose()

    raw = container.get_text(separator="\n")
    unified = raw.replace("\r", "\n")
    collapsed = re.sub(r"\n{3,}", "\n\n", unified)
    return collapsed.strip()

def remove_junk_lines(text: str) -> str:
    """Return `text` without the lines matched by JUNK_LINE_RE, stripped."""
    kept = []
    for line in text.split("\n"):
        if JUNK_LINE_RE.match(line.strip()) is None:
            kept.append(line)
    return "\n".join(kept).strip()

def split_statement_qna_whole(text: str):
    """Split page text into (statement, qa).

    Junk navigation lines are removed first. The split point is then the first
    line matching, in priority order, TRANSCRIPT_START_RE, QNA_HEADING_RE or
    QUESTION_PREFIX_RE; the matching line becomes the first line of the Q&A.
    If no line matches, the first SPOKEN_MARKERS phrase (in list order) found
    in the text is used, and the Q&A starts at that phrase.

    Args:
        text: Full text of the page's content area.

    Returns:
        A `(statement, qa)` tuple of stripped strings; `qa` is "" when no split
        point is found.
    """
    text = remove_junk_lines(text)
    lines = text.split("\n")

    # Structural headings take precedence over spoken phrases.
    for heading_re in (TRANSCRIPT_START_RE, QNA_HEADING_RE, QUESTION_PREFIX_RE):
        for i, ln in enumerate(lines):
            if heading_re.match(ln.strip()):
                return "\n".join(lines[:i]).strip(), "\n".join(lines[i:]).strip()

    # Search the original text case-insensitively. An index found in
    # `text.lower()` is not guaranteed to be valid for `text`, because lowering
    # can change the string's length (e.g. "İ" becomes two code points).
    for marker in SPOKEN_MARKERS:
        hit = re.search(re.escape(marker), text, flags=re.IGNORECASE)
        if hit:
            return text[:hit.start()].strip(), text[hit.start():].strip()

    return text.strip(), ""

def scrape_one(url: str):
    """Download one page and return `(title, statement, qa)` for it."""
    soup = BeautifulSoup(download_html(url), "html.parser")

    # Read the title before extract_main_text() runs: that call removes
    # elements from the parsed tree in place.
    title = extract_title(soup)
    statement, qa = split_statement_qna_whole(extract_main_text(soup))

    return title, statement, qa

def excel_safe_multiline(s: str) -> str:
    r"""
    Flatten a multi-line string onto one line for CSV export.

    Each line break (\r\n, \r or \n) is replaced by the two literal characters
    backslash + "n", so a spreadsheet program opening the CSV does not show the
    cell as several rows. `None` yields an empty string.
    """
    if s is None:
        return ""
    unified = s.replace("\r\n", "\n").replace("\r", "\n")
    return r"\n".join(unified.split("\n"))

def main(
    url_index_path: str = "ecb_monetary_policy_statements_all_years (1).csv",
    output_path: str = "ecb_statement_qna_raw.csv",
):
    """Scrape every URL listed in a driver CSV and write the results to a CSV.

    Args:
        url_index_path: CSV with at least the columns `meeting_date` and `url`.
            Defaults to the previously hard-coded file name; pass the real
            location when the file lives elsewhere.
        output_path: Destination CSV. Defaults to the previously hard-coded name.

    Raises:
        FileNotFoundError: If `url_index_path` does not exist (from pandas).
        ValueError: If a required column is missing from the driver CSV.

    A page that fails to download or parse does not abort the run: the failure
    is printed and the row is written with empty title/statement/Q&A.
    """
    df = pd.read_csv(url_index_path)

    if "meeting_date" not in df.columns or "url" not in df.columns:
        raise ValueError("Your driver CSV must have columns: meeting_date, url")

    out = []
    failures = 0
    for raw_date, url in zip(df["meeting_date"], df["url"]):
        # Keep only the YYYY-MM-DD part if the value carries a time component.
        meeting_date = str(raw_date)[:10]

        try:
            title, statement, qa = scrape_one(url)
        except Exception as e:
            # Best effort: report and continue with an empty record.
            print(f"[FAIL] {meeting_date} {url} -> {e}")
            title = statement = qa = ""
            failures += 1

        out.append({
            "meeting_date": meeting_date,
            "url": url,
            "title": title,
            "statement_text": excel_safe_multiline(statement),
            "qa_text": excel_safe_multiline(qa),
        })

    out_df = pd.DataFrame(out)

    # IMPORTANT: quote all fields so Excel behaves better
    out_df.to_csv(
        output_path,
        index=False,
        quoting=csv.QUOTE_ALL
    )

    print("Saved:", output_path)
    print("Rows:", len(out_df))
    if failures:
        print("Failed pages:", failures)

if __name__ == "__main__":
    main()


FileNotFoundError: [Errno 2] No such file or directory: 'ecb_monetary_policy_statements_all_years (1).csv'

In [None]:
import pandas as pd
import re

# load raw CSV
# Load the scraped CSV with every column as text.
df = pd.read_csv("ecb_statement_qna_raw.csv", dtype=str)

# The CSV stores line breaks as the two characters backslash + n;
# restore them as real newlines (missing cells become "").
for col in ("statement_text", "qa_text"):
    escaped = df[col].fillna("")
    df[col] = escaped.str.replace(r"\n", "\n", regex=False)

# --- STATEMENT CLEANING ---
def clean_statement(s):
    """Strip each line, drop blank lines and remove generic header lines at the top.

    Header lines are removed only while they appear consecutively at the very
    start of the text; the same wording further down is kept.
    """
    # Generic title lines, matched case-insensitively against a whole line.
    header_re = re.compile(
        r"^(ecb|european central bank).*press conference$|"
        r"^monetary policy statement$|"
        r"^introductory statement$",
        re.I
    )

    stripped = (ln.strip() for ln in s.split("\n"))
    body = [ln for ln in stripped if ln]

    # Advance past the leading run of header lines.
    first_kept = 0
    while first_kept < len(body) and header_re.match(body[first_kept]):
        first_kept += 1

    return "\n".join(body[first_kept:])

# --- Q&A CLEANING ---
def clean_qa(q):
    """Strip each line, drop blank lines and drop "Transcript of the questions…" lines.

    A line is dropped when it starts (case-insensitively) with
    "transcript of the questions", wherever it occurs in the text.
    """
    kept = []
    for raw_line in q.split("\n"):
        ln = raw_line.strip()
        if not ln:
            continue
        if re.match(r"^transcript of the questions", ln, re.I):
            continue
        kept.append(ln)
    return "\n".join(kept)

# Derive the cleaned columns from the raw text columns.
df["statement_text_clean"] = df["statement_text"].apply(clean_statement)
df["qa_text_clean"] = df["qa_text"].apply(clean_qa)

# make Excel-safe again: real newlines become the two characters backslash + n
for col in ["statement_text_clean", "qa_text_clean"]:
    df[col] = df[col].str.replace("\n", r"\n", regex=False)

cleaned_path = "ecb_statement_qna_cleaned.xlsx"
df.to_excel(cleaned_path, index=False)

# Report the file that was actually written (an .xlsx workbook, not a .csv).
print("Saved:", cleaned_path)
print("Rows:", len(df))


In [None]:
import pandas as pd

hicp = pd.read_csv("hicp.tsv.gz", sep="\t")
# The first column holds the comma-joined series key; call it "meta".
hicp = hicp.rename(columns={hicp.columns[0]: "meta"})

# EXACT row: Monthly, YoY, All-items, Euro Area (EA19)
# NOTE(review): Eurostat's COICOP list normally uses CP00 for all-items HICP and
# AP for administered prices - confirm that "AP" is the intended series.
# The key is stripped before comparing so stray padding cannot hide the row.
ea19 = hicp[hicp["meta"].astype(str).str.strip() == "M,RCH_A,AP,EA19"]

# sanity check (must be exactly 1 row)
assert len(ea19) == 1

# wide → long
ea19 = ea19.melt(id_vars="meta", var_name="date", value_name="inflation_raw")

# clean values: drop ANY trailing letter flag (not only " d"), so flagged
# observations keep their number instead of being coerced to NaN and dropped.
# Missing-value cells (":" with or without a flag) still become NaN below.
ea19["inflation"] = (
    ea19["inflation_raw"]
    .astype(str)
    .str.strip()
    .str.replace(r"\s+[A-Za-z]+$", "", regex=True)
)

ea19["inflation"] = pd.to_numeric(ea19["inflation"], errors="coerce")
# Column headers are period labels in YYYY-MM form, possibly padded with spaces.
ea19["date"] = pd.to_datetime(ea19["date"].astype(str).str.strip(), format="%Y-%m")

ea19 = ea19[["date", "inflation"]].dropna()

ea19.to_csv("hicp_inflation_ea19.csv", index=False)

ea19.head(), ea19["date"].min(), ea19["date"].max()

In [None]:
# Same extraction as for EA19, but for the "EA" aggregate key.
ea = (
    hicp[hicp["meta"].astype(str).str.strip() == "M,RCH_A,AP,EA"]
    .melt(id_vars="meta", var_name="date", value_name="inflation_raw")
)

# Drop ANY trailing letter flag (not only " d") so flagged observations keep
# their value; missing-value cells (":") are coerced to NaN by to_numeric.
ea["inflation"] = (
    ea["inflation_raw"]
    .astype(str)
    .str.strip()
    .str.replace(r"\s+[A-Za-z]+$", "", regex=True)
)
ea["inflation"] = pd.to_numeric(ea["inflation"], errors="coerce")
ea["date"] = pd.to_datetime(ea["date"].astype(str).str.strip(), format="%Y-%m")

ea = ea[["date","inflation"]].dropna()
ea["date"].min(), ea.head()


In [None]:
import pandas as pd

# Country codes averaged to build the pre-EA19 proxy.
EA11_1999 = ["AT","BE","DE","ES","FI","FR","IE","IT","LU","NL","PT"]

hicp = pd.read_csv("hicp.tsv", sep="\t", dtype=str)
hicp.columns = hicp.columns.str.strip()
hicp = hicp.rename(columns={hicp.columns[0]: "meta"})

# FIX: strip geo codes
meta_clean = hicp["meta"].str.strip()
hicp["geo"] = meta_clean.str.split(",").str[-1].str.strip()

# Keep only the series that is also used for the EA19 aggregate
# ("M,RCH_A,AP,<geo>"). Filtering on geo alone would also pull in any other
# frequency/unit/COICOP rows the file holds for these countries and mix them
# into the average.
same_series = meta_clean.str.startswith("M,RCH_A,AP,")
df = hicp[same_series & hicp["geo"].isin(EA11_1999)].copy()
print("Rows kept:", len(df), "Unique geos:", sorted(df["geo"].unique()))

long = df.melt(id_vars=["meta","geo"], var_name="date", value_name="infl_raw")

long["infl"] = (
    long["infl_raw"].astype(str).str.strip()
    .str.replace(r"\s+[A-Za-z]+$", "", regex=True)   # drop flags like " d"
)

# Cells starting with ":" are missing values.
long.loc[long["infl"].str.match(r"^:\s*.*$"), "infl"] = pd.NA
long["infl"] = pd.to_numeric(long["infl"], errors="coerce")

# date col names are like "1999-01"
long["date"] = pd.to_datetime(long["date"].str.strip(), format="%Y-%m", errors="coerce")

# Simple unweighted mean across the available countries for each month.
# NOTE(review): an official aggregate would presumably weight countries - confirm
# that an equal-weight proxy is acceptable.
ea_proxy = (
    long.dropna(subset=["infl","date"])
        .groupby("date", as_index=False)["infl"]
        .mean()
        .rename(columns={"infl":"inflation"})
)

ea_proxy = ea_proxy[ea_proxy["date"] >= "1999-01-01"].reset_index(drop=True)

print("EA proxy rows:", len(ea_proxy))
print(ea_proxy.head(12))


In [None]:
import pandas as pd

# --- 1) Your proxy (already computed) ---
# ea_proxy has columns: ["date","inflation"]
# --- 1) Your proxy (already computed in an earlier cell) ---
# ea_proxy has columns: ["date","inflation"]
ea_proxy = ea_proxy.copy()
ea_proxy["date"] = pd.to_datetime(ea_proxy["date"])
ea_proxy = ea_proxy.sort_values("date")

# --- 2) Official EA19 from the same TSV ---
hicp = pd.read_csv("hicp.tsv", sep="\t", dtype=str)
hicp.columns = hicp.columns.str.strip()
hicp = hicp.rename(columns={hicp.columns[0]: "meta"})

ea19 = (
    hicp[hicp["meta"].str.strip() == "M,RCH_A,AP,EA19"]
    .melt(id_vars="meta", var_name="date", value_name="inflation_raw")
)

ea19["inflation"] = (
    ea19["inflation_raw"].astype(str).str.strip()
    .str.replace(r"\s+[A-Za-z]+$", "", regex=True)   # remove flags like " d"
)

# Cells starting with ":" are missing values.
ea19.loc[ea19["inflation"].str.match(r"^:\s*.*$"), "inflation"] = pd.NA
ea19["inflation"] = pd.to_numeric(ea19["inflation"], errors="coerce")
ea19["date"] = pd.to_datetime(ea19["date"].str.strip(), format="%Y-%m", errors="coerce")

ea19 = ea19[["date","inflation"]].dropna().sort_values("date")

# Without any EA19 observation the switch date would be NaT, every comparison
# below would be False, and an empty CSV would be written without any warning.
if ea19.empty:
    raise ValueError("No usable 'M,RCH_A,AP,EA19' observations found in hicp.tsv")

# --- 3) Stitch: proxy before EA19 starts, EA19 afterwards ---
switch_date = ea19["date"].min()   # should be 2001-12-01

inflation_1999_2025 = pd.concat([
    ea_proxy[ea_proxy["date"] < switch_date][["date","inflation"]],
    ea19[ea19["date"] >= switch_date][["date","inflation"]],
], ignore_index=True).sort_values("date")

# keep exactly 1999-01 .. 2025-12 (or last available month)
inflation_1999_2025 = inflation_1999_2025[
    (inflation_1999_2025["date"] >= "1999-01-01") &
    (inflation_1999_2025["date"] <= "2025-12-01")
].reset_index(drop=True)

# --- 4) Save ---
inflation_1999_2025.to_csv("inflation_1999_2025.csv", index=False)

print("Saved:", inflation_1999_2025["date"].min(), "→", inflation_1999_2025["date"].max(),
      "| rows:", len(inflation_1999_2025))
print(inflation_1999_2025.head())
print(inflation_1999_2025.tail())
