# Main Pipeline -- EV Adoption Analysis (2015–2024)

## Purpose
This notebook orchestrates the **core data analytics pipeline** of the project.
It loads the merged canton-level panel dataset and serves as the main entry point
for descriptive analysis, preprocessing and model execution.

## Key Steps
- Load the cleaned canton-year panel dataset
- Verify data integrity and structure
- Prepare features for downstream modeling
- Call modeling routines and aggregate results

## Inputs
- `data/intermediate/master_panel_2015_2024.csv`

## Outputs
- Model performance tables (R², RMSE, MAE)
- Figures saved to `data/outputs/figures/`
- Tables saved to `data/outputs/tables/`

## Execution
This notebook is executed automatically via `run_pipeline.py`.

In [11]:
from pathlib import Path

cwd = Path.cwd().resolve()

# case 1: launched from the root
if (cwd / "data" / "raw").exists():
    ROOT = cwd
# case 2: launched from notebooks/
elif (cwd.parent / "data" / "raw").exists():
    ROOT = cwd.parent
else:
    raise FileNotFoundError(
        f"Project root not found from cwd={cwd}. "
        "Expected to find data/raw in cwd or in its parent."
    )

DATA = ROOT / "data"
RAW = DATA / "raw"
ELCOM_RAW = RAW / "elcom"
INTER = DATA / "intermediate"
OUT = DATA / "outputs"

INTER.mkdir(parents=True, exist_ok=True)
OUT.mkdir(parents=True, exist_ok=True)

print("ROOT:", ROOT)
print("RAW :", RAW)
print("RAW files:", sorted([p.name for p in RAW.iterdir()])[:50])


ROOT: C:\Users\hamza\OneDrive\Desktop\projet-ada-hk
RAW : C:\Users\hamza\OneDrive\Desktop\projet-ada-hk\data\raw
RAW files: ['Bilan_pop_CH (old).xlsx', 'Bilan_pop_CH.xlsx', 'ElecProd_ByYear.xlsx', 'canton_climate_co2.xlsx', 'elcom', 'ev_registrations_per_canton.csv', 'gdp_per_capita.xlsx', 'motorisation_rate.xlsx', 'policy_parties_cantons.xlsx']


In [None]:
# ============================================================
# OUTPUT FOLDERS (FIGURES + TABLES)
# ============================================================
from typing import Optional

FIG_DIR = OUT / "figures"
TAB_DIR = OUT / "tables"
FIG_DIR.mkdir(parents=True, exist_ok=True)
TAB_DIR.mkdir(parents=True, exist_ok=True)

def save_fig(filename: str, dpi: int = 200):
    """Save current matplotlib figure to Outputs/figures/"""
    path = FIG_DIR / filename
    plt.tight_layout()
    plt.savefig(path, dpi=dpi, bbox_inches="tight")
    print(f" Figure saved: {path}")
    plt.close()

def save_table(df, filename_csv: str, filename_xlsx: Optional[str] = None):
    """Save a DataFrame to Outputs/tables/ as CSV (+ optional XLSX)."""
    csv_path = TAB_DIR / filename_csv
    df.to_csv(csv_path, index=False)
    print(f" Table saved: {csv_path}")
    if filename_xlsx:
        xlsx_path = TAB_DIR / filename_xlsx
        df.to_excel(xlsx_path, index=False)   
        print(f" Table saved: {xlsx_path}")

In [13]:
import pandas as pd
from pathlib import Path
import csv
xlsx_path = RAW / "ElecProd_ByYear.xlsx"
df = pd.read_excel(xlsx_path, dtype=str)

df = df.rename(columns={
    "Year": "year",
    "Production brute": "production_gross",
    "Production nette": "production_net"
})
years = []
for val in df["year"]:
    val = str(val)
    year = "".join([c for c in val if c.isdigit()])
    if len(year) == 4:
        years.append(int(year))
    else:
        years.append(None)
df["year"] = years
df = df[[y is not None for y in df["year"]]]  
def to_float(x):
    if pd.isna(x):
        return None
    x = str(x).replace(" ", "").replace("\xa0", "").replace(",", ".")
    try:
        return float(x)
    except:
        return None

df["production_gross"] = [to_float(x) for x in df["production_gross"]]
df["production_net"] = [to_float(x) for x in df["production_net"]]

import csv
df = df[pd.notna(df["year"])].copy()
df["year"] = df["year"].astype(int)
df = df.sort_values("year").reset_index(drop=True)
full_path = INTER / "elecprod_byyear_clean_full.csv"
subset_path = INTER / "elecprod_byyear_2015_2024.csv"
with open(full_path, "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["year", "production_gross", "production_net"])
    for row in df.itertuples(index=False):
        writer.writerow([row.year, row.production_gross, row.production_net])
with open(subset_path, "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["year", "production_gross", "production_net"])
    for row in df.itertuples(index=False):
        try:
            y = int(row.year)
            if 2015 <= y <= 2024:
                writer.writerow([row.year, row.production_gross, row.production_net])
        except:
            continue  

print("Here are the file path to the electricity production files used in our analysis, with only valid lines.")
print("-", full_path.name)
print("-", subset_path.name)
print("Years going from", df['year'].min(), "→", df['year'].max())
print("Total number of lines :", len(df))


Here are the file path to the electricity production files used in our analysis, with only valid lines.
- elecprod_byyear_clean_full.csv
- elecprod_byyear_2015_2024.csv
Years going from 1970 → 2024
Total number of lines : 55


In [14]:
csv_path = RAW / "ev_registrations_per_canton.csv"
df = pd.read_csv(csv_path, dtype=str)
df = df.rename(columns={
    "canton": "canton",
    "year": "year",
    "reg_total": "reg_total",
    "ev_reg_count": "ev_reg_count",
    "ev_reg_share": "ev_reg_share",
})

# canton: trim + multiple spaces -> simple

df["canton"] = df["canton"].astype(str).str.strip().str.replace(r"\s+", " ", regex=True)
years = []
for v in df["year"]:
    y = "".join(c for c in str(v) if c.isdigit())
    years.append(int(y) if len(y) == 4 else None)
df["year"] = years
df = df[[y is not None for y in df["year"]]].copy()
df["year"] = df["year"].astype(int)
def to_float(x):
    x = "" if pd.isna(x) else str(x)
    x = x.replace("\xa0", "").replace(" ", "").replace(",", ".")
    try:
        return float(x)
    except:
        return None

for col in ["reg_total", "ev_reg_count", "ev_reg_share"]:
    df[col] = [to_float(x) for x in df[col]]
df = df[["canton", "year", "reg_total", "ev_reg_count", "ev_reg_share"]].sort_values(["canton","year"]).reset_index(drop=True)

full_path   = INTER / "ev_registrations_per_canton_clean_full.csv"
subset_path = INTER / "ev_registrations_per_canton_2015_2024.csv"

with open(full_path, "w", newline="", encoding="utf-8") as f:
    w = csv.writer(f); w.writerow(df.columns)
    for r in df.itertuples(index=False):
        w.writerow([r.canton, r.year, r.reg_total, r.ev_reg_count, r.ev_reg_share])

with open(subset_path, "w", newline="", encoding="utf-8") as f:
    w = csv.writer(f); w.writerow(df.columns)
    for r in df.itertuples(index=False):
        if 2015 <= r.year <= 2024:
            w.writerow([r.canton, r.year, r.reg_total, r.ev_reg_count, r.ev_reg_share])

print("Saved:", full_path.name, "|", subset_path.name)
print("We have", df['canton'].nunique(), "cantons and", len(df), "rows.")
print("Years going from", df['year'].min(), "→", df['year'].max())


Saved: ev_registrations_per_canton_clean_full.csv | ev_registrations_per_canton_2015_2024.csv
We have 27 cantons and 540 rows.
Years going from 2005 → 2024


In [None]:
# ELCOM (from 2014 to 2025): column detection & cleaning
from pathlib import Path
import pandas as pd
import csv, re

ROOT = Path.cwd().resolve()
if ROOT.name == "notebooks":
    ROOT = ROOT.parent

DATA_SRC = ELCOM_RAW
DATA_INTER = ROOT / "data" / "intermediate"
DATA_INTER.mkdir(parents=True, exist_ok=True)
YEAR_MIN, YEAR_MAX = 2014, 2025

def norm(s):
    return re.sub(r"\s+", "", str(s).strip().lower())

def find_col(cols, patterns):
    """Returns the first column whose name matches one of the patterns (regex on normalised name)."""
    ncols = {norm(c): c for c in cols}
    for pat in patterns:
        rx = re.compile(pat)
        for n,c in ncols.items():
            if rx.search(n):
                return c
    return None

def find_price_col(cols):
    # variations/misspellings of "grid usage after discount (cts./kWh)"
    pats = [
        r"gridusage.*after.*discount.*cts.?/?.?kwh",
        r"after.*discount.*cts.?/?.?kwh",
        r"after.*discount",  # fallback
    ]
    return find_col(cols, pats)

def find_period_col(cols):
    return find_col(cols, [r"^period$"])

def find_operatorlabel_col(cols):
    # "operatorlabel", "operator label", etc.
    return find_col(cols, [r"operator.*label"])

def find_gridusagename_col(cols):
    # "gridusagename", "grid usage name", etc.
    return find_col(cols, [r"grid.*usage.*name", r"gridusagename"])

def to_float(x):
    x = "" if pd.isna(x) else str(x)
    x = x.replace("\xa0","").replace(" ","").replace(",", ".")
    try:
        return float(x)
    except:
        return None

rows = []  # [year, operatorLabel, gridusagename, price]
for path in sorted(DATA_SRC.glob("*.csv")):
    df = pd.read_csv(path, dtype=str)
    pc  = find_price_col(df.columns)
    per = find_period_col(df.columns)
    opl = find_operatorlabel_col(df.columns)
    gnm = find_gridusagename_col(df.columns)

    if pc is None or per is None:
        print(f"[WARN] {path.name}: missing key columns (price/period) → ignored")
        continue

    n = len(df)
    for i in range(n):
        # année
        y_raw = str(df.iloc[i][per]) if per in df.columns else ""
        y_digits = "".join(ch for ch in y_raw if ch.isdigit())
        year = int(y_digits) if len(y_digits) == 4 else None
        if year is None or not (YEAR_MIN <= year <= YEAR_MAX):
            continue

        op  = df.iloc[i][opl] if opl in df.columns else None
        grid= df.iloc[i][gnm] if gnm in df.columns else None
        pr  = to_float(df.iloc[i][pc])

        # only keeps plausible prices (0–100 pence/kWh)
        if pr is None or not (0 < pr < 100):
            continue

        rows.append([year, op, grid, pr])

print(f"Valid lines collected : {len(rows)}")

# Sorting & outputs
rows.sort(key=lambda r: (r[0], str(r[1]) if r[1] else "", str(r[2]) if r[2] else ""))

raw_out = DATA_INTER / "elcom_prices_raw_minimal_2014_2025.csv"
agg_out = DATA_INTER / "elcom_prices_by_operator_year_2014_2025.csv"

# (a) all the lines
with open(raw_out, "w", newline="", encoding="utf-8") as f:
    w = csv.writer(f)
    w.writerow(["year","operatorLabel","gridusagename","price_after_discount_cts_per_kwh"])
    w.writerows(rows)

# (b) Average per (year, operatorLabel)
agg = {}
for year, op, grid, price in rows:
    key = (year, op)
    if key not in agg:
        agg[key] = [0.0, 0]
    agg[key][0] += price
    agg[key][1] += 1

with open(agg_out, "w", newline="", encoding="utf-8") as f:
    w = csv.writer(f)
    w.writerow(["year","operatorLabel","avg_price_after_discount_cts_per_kwh","n_tariffs"])
    for (year, op), (s, c) in sorted(agg.items(), key=lambda x: (x[0][0], str(x[0][1]))):
        w.writerow([year, op, round(s/c, 6) if c else None, c])

print("Files :", raw_out.name, "|", agg_out.name)


 Valid lines collected : 143402
Files : elcom_prices_raw_minimal_2014_2025.csv | elcom_prices_by_operator_year_2014_2025.csv


In [16]:
# Motorisation rate (per 1000 inhabitants)
import re, csv
import pandas as pd
from pathlib import Path

YEAR_MIN, YEAR_MAX = 2015, 2024  # same analysis window as the rest

xlsx_path = RAW / "motorisation_rate.xlsx"
assert xlsx_path.exists(), f"File not found: {xlsx_path}"
df = pd.read_excel(xlsx_path, dtype=str)

assert not df.empty, "motorisation_rate.xlsx : blank or incorrect sheet."

# 1) Detection of columns
first_col = df.columns[0]
year_cols = []
for c in df.columns[1:]:
    s = re.sub(r"\D", "", str(c))  
    if len(s) == 4:
        year_cols.append(c)

assert year_cols, "No year column detected (with 4 numbers)."

# 2) Subset & melt (wide -> long)
df = df[[first_col] + year_cols].copy()
df = df.rename(columns={first_col: "unit"})

long_df = df.melt(id_vars=["unit"], value_vars=year_cols,
                  var_name="year", value_name="motorization_rate_raw")

# 3) Simple cleaning
def clean_text(x):
    return re.sub(r"\s+", " ", str(x).strip())

long_df["unit"] = long_df["unit"].apply(clean_text)
long_df["year"] = long_df["year"].apply(lambda x: int(re.sub(r"\D", "", str(x))) if re.sub(r"\D", "", str(x)) else None)

# Values: remove non-breaking spaces, replace decimal point, drop markers (*, …, .)
def to_float(x):
    if pd.isna(x): return None
    s = str(x).replace("\xa0","").replace(" ", "")
    s = s.replace("…","").replace("...","").replace("*","").replace(",", ".")
    s = re.sub(r"[^0-9.\-]", "", s)
    try:
        return float(s) if s != "" else None
    except:
        return None

long_df["motorization_rate_per_1000"] = long_df["motorization_rate_raw"].apply(to_float)

# 4) Remove aggregates (Total, Regions, Switzerland, etc.)
AGG_RX = re.compile(r"^(total|région|region|espace|suisse)\b", flags=re.IGNORECASE)
clean_df = long_df[~long_df["unit"].str.match(AGG_RX, na=False)].copy()

# 5) Mapping of cantons through ISO-2 code (with common variants)
CANTON_MAP = {
    "Zurich":"ZH","Zürich":"ZH",
    "Berne":"BE","Bern":"BE",
    "Lucerne":"LU","Luzern":"LU",
    "Uri":"UR",
    "Schwyz":"SZ",
    "Obwald":"OW","Obwalden":"OW",
    "Nidwald":"NW","Nidwalden":"NW",
    "Glaris":"GL","Glarus":"GL",
    "Zoug":"ZG","Zug":"ZG",
    "Fribourg":"FR","Freiburg":"FR",
    "Soleure":"SO","Solothurn":"SO",
    "Bâle-Ville":"BS","Basel-Stadt":"BS","Basel Stadt":"BS",
    "Bâle-Campagne":"BL","Basel-Landschaft":"BL","Basel Landschaft":"BL",
    "Schaffhouse":"SH","Schaffhausen":"SH",
    "Appenzell Rh.-Ext":"AR","Appenzell Ausserrhoden":"AR","Appenzell Rh.-Ext.":"AR",
    "Appenzell Rh.-Int":"AI","Appenzell Innerrhoden":"AI","Appenzell Rh.-Int.":"AI",
    "Saint-Gall":"SG","St. Gallen":"SG","Sankt Gallen":"SG",
    "Grisons":"GR","Graubünden":"GR","Grigioni":"GR",
    "Argovie":"AG","Aargau":"AG",
    "Thurgovie":"TG","Thurgau":"TG",
    "Tessin":"TI","Ticino":"TI",
    "Vaud":"VD",
    "Valais":"VS","Wallis":"VS",
    "Neuchâtel":"NE","Neuchatel":"NE",
    "Genève":"GE","Geneve":"GE","Genf":"GE",
    "Jura":"JU",
}

clean_df["canton"] = clean_df["unit"].map(lambda s: CANTON_MAP.get(s, s))
# canton_code if mapped, otherwise None (added later)
REV = {k:v for k,v in CANTON_MAP.items()}
def to_code(name):
    code = REV.get(name, None)  # if the name has a FR code
    if code: return code
    # if 'canton' already has a code (2 letters), keep it
    if isinstance(name, str) and re.fullmatch(r"[A-Z]{2}", name):
        return name
    # if name is a German form already mapped above, nothing to do here
    return CANTON_MAP.get(name, None)

# We want two columns: district (FR label as in the file), district_code (ISO-2)
clean_df["canton_code"] = clean_df["unit"].apply(lambda s: CANTON_MAP.get(s, None))

# Keep only lines with a year and a numerical value
clean_df = clean_df[clean_df["year"].notna()].copy()
clean_df = clean_df.dropna(subset=["motorization_rate_per_1000"])

# Exports
cols_out = ["canton_code","unit","year","motorization_rate_per_1000"]
full_out   = INTER / "motorization_rate_clean_full.csv"
subset_out = INTER / f"motorization_rate_{YEAR_MIN}_{YEAR_MAX}.csv"

# Full
with open(full_out, "w", newline="", encoding="utf-8") as f:
    w = csv.writer(f); w.writerow(["canton_code","canton","year","motorization_rate_per_1000"])
    for r in clean_df.itertuples(index=False):
        w.writerow([r.canton_code, r.unit, int(r.year), r.motorization_rate_per_1000])

# Window 2015–2024
subset = clean_df[(clean_df["year"]>=YEAR_MIN) & (clean_df["year"]<=YEAR_MAX)].copy()
with open(subset_out, "w", newline="", encoding="utf-8") as f:
    w = csv.writer(f); w.writerow(["canton_code","canton","year","motorization_rate_per_1000"])
    for r in subset.itertuples(index=False):
        w.writerow([r.canton_code, r.unit, int(r.year), r.motorization_rate_per_1000])

# Diagnostics
print("Saved as:", full_out.name, "|", subset_out.name)
print("Number of canton codes found in the data (not cleaned yet):", clean_df["canton_code"].notna().sum(), "/", clean_df["unit"].nunique())
print("Years (full):", int(clean_df["year"].min()), "→", int(clean_df["year"].max()))
if not subset.empty:
    print("Years (subset):", int(subset["year"].min()), "→", int(subset["year"].max()))
else:
    print("No points in the window", YEAR_MIN, "→", YEAR_MAX, "(it's fine if the file stops before that).")

Saved as: motorization_rate_clean_full.csv | motorization_rate_2015_2024.csv
Number of canton codes found in the data (not cleaned yet): 946 / 26
Years (full): 1970 → 2024
Years (subset): 2015 → 2024


In [17]:
# GDP per capita by Canton (2008–2022) -> we extrapolate for 2023–2025 because of the lack of available data
import pandas as pd, re, csv
from pathlib import Path
import numpy as np

YEAR_MIN, YEAR_MAX = 2015, 2024  
xlsx_path = RAW / "gdp_per_capita.xlsx"
assert xlsx_path.exists(), f"Fichier introuvable: {xlsx_path}"

df = pd.read_excel(xlsx_path, dtype=str)

# Column detection
first_col = df.columns[0]
year_cols = [c for c in df.columns[1:] if re.sub(r"\D", "", str(c)).isdigit()]

# Wide -> Long
df = df[[first_col] + year_cols].rename(columns={first_col: "unit"})
long_df = df.melt(id_vars=["unit"], value_vars=year_cols,
                  var_name="year", value_name="gdp_pc_raw")

# Cleaning
def to_float(x):
    if pd.isna(x): return None
    s = str(x).replace("\xa0", "").replace("\u202f","").replace(" ", "").replace(",", ".")
    s = re.sub(r"[^0-9.\-]", "", s)
    try:
        return float(s) if s else None
    except:
        return None

long_df["year"] = long_df["year"].apply(
    lambda x: int(re.sub(r"\D", "", str(x))) if re.sub(r"\D", "", str(x)) else None
)
long_df["gdp_per_capita_chf"] = long_df["gdp_pc_raw"].apply(to_float)
long_df = long_df.dropna(subset=["year", "gdp_per_capita_chf"])

# Remove the national aggregate "Switzerland"
long_df = long_df[~long_df["unit"].str.match(r"^suisse\b", case=False, na=False)].copy()

# Mapping cantons FR/DE -> codes ISO-2
CANTON_MAP = {
    "Zurich":"ZH","Zürich":"ZH",
    "Berne":"BE","Bern":"BE",
    "Lucerne":"LU","Luzern":"LU",
    "Uri":"UR",
    "Schwyz":"SZ",
    "Obwald":"OW","Obwalden":"OW",
    "Nidwald":"NW","Nidwalden":"NW",
    "Glaris":"GL","Glarus":"GL",
    "Zoug":"ZG","Zug":"ZG",
    "Fribourg":"FR","Freiburg":"FR",
    "Soleure":"SO","Solothurn":"SO",
    "Bâle-Ville":"BS","Basel-Stadt":"BS","Basel Stadt":"BS",
    "Bâle-Campagne":"BL","Basel-Landschaft":"BL","Basel Landschaft":"BL",
    "Schaffhouse":"SH","Schaffhausen":"SH",
    "Appenzell Rhodes-Extérieures":"AR","Appenzell Rhodes Extérieures":"AR","Appenzell Rh.-Ext.":"AR","Appenzell Ausserrhoden":"AR",
    "Appenzell Rhodes-Intérieures":"AI","Appenzell Rhodes Intérieures":"AI","Appenzell Rh.-Int.":"AI","Appenzell Innerrhoden":"AI",
    "Saint-Gall":"SG","St. Gallen":"SG","Sankt Gallen":"SG",
    "Grisons":"GR","Graubünden":"GR","Grigioni":"GR",
    "Argovie":"AG","Aargau":"AG",
    "Thurgovie":"TG","Thurgau":"TG",
    "Tessin":"TI","Ticino":"TI",
    "Vaud":"VD",
    "Valais":"VS","Wallis":"VS",
    "Neuchâtel":"NE","Neuchatel":"NE",
    "Genève":"GE","Geneve":"GE","Genf":"GE",
    "Jura":"JU"
}
long_df["canton"] = long_df["unit"].astype(str).str.strip()
long_df["canton_code"] = long_df["canton"].map(CANTON_MAP)

# Extrapolation for 2023–2025 (CAGR of 3 last years per canton, fallback of 2%)
extrapolated_rows = []
for c, g in long_df.groupby("canton"):
    g = g.sort_values("year")
    # Last 3 observed years that we have
    tail = g.tail(3)
    if len(tail) >= 2:
        # CAGR: (v_t / v_{t-k})**(1/k) - 1 ; if values are negative, fallback
        v0, vt = tail["gdp_per_capita_chf"].iloc[0], tail["gdp_per_capita_chf"].iloc[-1]
        k = (tail["year"].iloc[-1] - tail["year"].iloc[0]) or 1
        if v0 and v0 > 0 and vt and vt > 0:
            mean_growth = (vt / v0) ** (1 / k) - 1.0
        else:
            mean_growth = 0.02
    else:
        mean_growth = 0.02

    last_val = g["gdp_per_capita_chf"].iloc[-1]
    for y in [2023, 2024, 2025]:
        last_val = last_val * (1 + mean_growth)
        extrapolated_rows.append([c, g["canton_code"].iloc[0], y, last_val])

extra_df = pd.DataFrame(
    extrapolated_rows,
    columns=["canton", "canton_code", "year", "gdp_per_capita_chf"]
)

clean_df = pd.concat(
    [long_df[["canton","canton_code","year","gdp_per_capita_chf"]], extra_df],
    ignore_index=True
).sort_values(["canton","year"])

# Exports
full_out   = INTER / "gdp_per_capita_clean_full.csv"
subset_out = INTER / f"gdp_per_capita_{YEAR_MIN}_{YEAR_MAX}.csv"

with open(full_out, "w", newline="", encoding="utf-8") as f:
    w = csv.writer(f); w.writerow(["canton_code","canton","year","gdp_per_capita_chf"])
    for r in clean_df.itertuples(index=False):
        w.writerow([r.canton_code, r.canton, int(r.year), r.gdp_per_capita_chf])

subset = clean_df[(clean_df["year"]>=YEAR_MIN) & (clean_df["year"]<=YEAR_MAX)]
with open(subset_out, "w", newline="", encoding="utf-8") as f:
    w = csv.writer(f); w.writerow(["canton_code","canton","year","gdp_per_capita_chf"])
    for r in subset.itertuples(index=False):
        w.writerow([r.canton_code, r.canton, int(r.year), r.gdp_per_capita_chf])

print("Saved as:", full_out.name, "|", subset_out.name)
print("Years going from", int(clean_df["year"].min()), "→", int(clean_df["year"].max()))
print("Cantons:", clean_df["canton"].nunique())


Saved as: gdp_per_capita_clean_full.csv | gdp_per_capita_2015_2024.csv
Years going from 2008 → 2025
Cantons: 26


In [18]:
# Population balance by canton 
import pandas as pd, re, csv
from pathlib import Path

YEAR_MIN, YEAR_MAX = 2015, 2024
xlsx_path = RAW / "Bilan_pop_CH.xlsx"
print("Reading:", xlsx_path.resolve())
assert xlsx_path.exists(), f"Can't find: {xlsx_path}"

def norm_txt(s):
    if pd.isna(s): return ""
    s = str(s).replace("\xa0"," ").strip()
    return re.sub(r"\s+", " ", s)

def to_float(x):
    if pd.isna(x): return None
    s = str(x).replace("\u202f","").replace("\xa0","").replace(" ", "")
    s = s.replace("…","").replace("...","").replace("*","").replace(",", ".")
    s = re.sub(r"[^0-9.\-]", "", s)
    try: return float(s) if s else None
    except: return None

# Canton map (FR/DE/IT -> ISO-2)
CANTON_MAP = {
    "Zurich":"ZH","Zürich":"ZH",
    "Berne":"BE","Bern":"BE",
    "Lucerne":"LU","Luzern":"LU",
    "Uri":"UR",
    "Schwyz":"SZ",
    "Obwald":"OW","Obwalden":"OW","Obwald.":"OW",
    "Nidwald":"NW","Nidwalden":"NW","Nidwald.":"NW",
    "Glaris":"GL","Glarus":"GL",
    "Zoug":"ZG","Zug":"ZG",
    "Fribourg":"FR","Freiburg":"FR",
    "Soleure":"SO","Solothurn":"SO",
    "Bâle-Ville":"BS","Basel-Stadt":"BS","Basel Stadt":"BS",
    "Bâle-Campagne":"BL","Basel-Landschaft":"BL","Basel Landschaft":"BL",
    "Schaffhouse":"SH","Schaffhausen":"SH",
    "Appenzell Rhodes-Extérieures":"AR","Appenzell Rh.-Ext.":"AR","Appenzell Ausserrhoden":"AR",
    "Appenzell Rhodes-Intérieures":"AI","Appenzell Rh.-Int.":"AI","Appenzell Innerrhoden":"AI",
    "Saint-Gall":"SG","St. Gallen":"SG","Sankt Gallen":"SG",
    "Grisons":"GR","Graubünden":"GR","Grigioni":"GR",
    "Argovie":"AG","Aargau":"AG",
    "Thurgovie":"TG","Thurgau":"TG",
    "Tessin":"TI","Ticino":"TI",
    "Vaud":"VD",
    "Valais":"VS","Wallis":"VS",
    "Neuchâtel":"NE","Neuchatel":"NE",
    "Genève":"GE","Geneve":"GE","Genf":"GE",
    "Jura":"JU",
}
CANTON_SET = set(CANTON_MAP.keys())

# Target columns -> regex (merger/variant tolerant)
COL_PATTERNS = {
    "pop_jan1":        re.compile(r"(état|etat).*(1er|premier).*(janvier|jan)\b", re.I),
    "births":          re.compile(r"naissance", re.I),
    "deaths":          re.compile(r"d[ée]c[èe]s|deces", re.I),
    "natural_increase":re.compile(r"accroissement.*naturel", re.I),
    "arrivals":        re.compile(r"arriv", re.I),
    "departures":      re.compile(r"d[ée]part", re.I),
    "net_migration":   re.compile(r"solde.*migr", re.I),
    "pop_dec31":       re.compile(r"(état|etat).*(d[ée]cembre|dec)\b", re.I),
    "variation_abs":   re.compile(r"variation.*nombres.*absolus|variation\s*en\s*nombres", re.I),
    "variation_pct":   re.compile(r"(variation.*%|en\s*%)", re.I),
}

xls = pd.ExcelFile(xlsx_path)
sheet_years = sorted(
    [(sh, int(m.group(1))) for sh in xls.sheet_names
     if (m := re.match(r"^Cantons\s*\((\d{4})\)", sh))],
    key=lambda x: x[1]
)

rows = []

for sheet_name, year in sheet_years:
    df0 = pd.read_excel(xlsx_path, sheet_name=sheet_name, header=None, dtype=str)
    df0 = df0.applymap(norm_txt)

    # Builds a concatenated header with the first 5 lines (merge safe)
    hdr_n = min(5, len(df0))
    head = df0.iloc[:hdr_n, :].fillna("")
    col_labels = []
    for c in range(df0.shape[1]):
        tokens = [t for t in head.iloc[:, c].tolist() if t]
        col_labels.append(" | ".join(tokens))

    # Canton column = the one containing "Canton" in the header, otherwise the first one
    unit_col = 0
    for j, lab in enumerate(col_labels):
        if re.search(r"\bcanton[s]?\b", lab, re.I):
            unit_col = j; break

    # maps numeric columns via patterns
    col_map = {"unit": unit_col}
    for key, rx in COL_PATTERNS.items():
        for j, lab in enumerate(col_labels):
            if rx.search(lab):
                col_map[key] = j
                break

    data = df0.iloc[hdr_n:, :].reset_index(drop=True)

    # keeps only the 26 cantons (exact match on source wording)
    for i in range(len(data)):
        unit = data.iat[i, unit_col] if unit_col < data.shape[1] else ""
        if not unit or unit not in CANTON_SET:
            continue
        canton = unit
        code = CANTON_MAP[canton]

        def getv(k):
            j = col_map.get(k, None)
            if j is None or j >= data.shape[1]: return None
            return to_float(data.iat[i, j])

        rows.append([
            code, canton, year,
            getv("pop_jan1"),
            getv("births"),
            getv("deaths"),
            getv("natural_increase"),
            getv("arrivals"),
            getv("departures"),
            getv("net_migration"),
            getv("pop_dec31"),
            getv("variation_abs"),
            getv("variation_pct"),
        ])

cols = ["canton_code","canton","year","pop_jan1","births","deaths","natural_increase",
        "arrivals","departures","net_migration","pop_dec31","variation_abs","variation_pct"]
out = pd.DataFrame(rows, columns=cols)

# Security here: valid values and years only
measures = cols[3:]
out = out.dropna(subset=measures, how="all")
out = out.dropna(subset=["year"]).copy()
out["year"] = out["year"].astype(int)
out = out.sort_values(["canton","year"]).reset_index(drop=True)

full_out   = INTER / "population_balance_cantons_clean_full.csv"
subset_out = INTER / f"population_balance_cantons_{YEAR_MIN}_{YEAR_MAX}.csv"

with open(full_out, "w", newline="", encoding="utf-8") as f:
    w = csv.writer(f); w.writerow(cols)
    for r in out.itertuples(index=False):
        w.writerow(list(r))

subset = out[(out["year"]>=YEAR_MIN) & (out["year"]<=YEAR_MAX)]
with open(subset_out, "w", newline="", encoding="utf-8") as f:
    w = csv.writer(f); w.writerow(cols)
    for r in subset.itertuples(index=False):
        w.writerow(list(r))

print("Saved as:", full_out.name, "|", subset_out.name)
print("Parsed sheets:", [y for _, y in sheet_years])
print("Years going from", out["year"].min(), "→", out["year"].max(), "| Cantons:", out["canton"].nunique(), "| Lignes:", len(out))
print("Cover 2015–2024:", subset["year"].min() if not subset.empty else None, "→", subset["year"].max() if not subset.empty else None)

Reading: C:\Users\hamza\OneDrive\Desktop\projet-ada-hk\data\raw\Bilan_pop_CH.xlsx


  df0 = df0.applymap(norm_txt)
  df0 = df0.applymap(norm_txt)
  df0 = df0.applymap(norm_txt)
  df0 = df0.applymap(norm_txt)
  df0 = df0.applymap(norm_txt)
  df0 = df0.applymap(norm_txt)
  df0 = df0.applymap(norm_txt)
  df0 = df0.applymap(norm_txt)
  df0 = df0.applymap(norm_txt)
  df0 = df0.applymap(norm_txt)
  df0 = df0.applymap(norm_txt)
  df0 = df0.applymap(norm_txt)
  df0 = df0.applymap(norm_txt)
  df0 = df0.applymap(norm_txt)
  df0 = df0.applymap(norm_txt)
  df0 = df0.applymap(norm_txt)
  df0 = df0.applymap(norm_txt)
  df0 = df0.applymap(norm_txt)
  df0 = df0.applymap(norm_txt)
  df0 = df0.applymap(norm_txt)
  df0 = df0.applymap(norm_txt)
  df0 = df0.applymap(norm_txt)
  df0 = df0.applymap(norm_txt)
  df0 = df0.applymap(norm_txt)


Saved as: population_balance_cantons_clean_full.csv | population_balance_cantons_2015_2024.csv
Parsed sheets: [1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024]
Years going from 1999 → 2024 | Cantons: 26 | Lignes: 676
Cover 2015–2024: 2015 → 2024


  df0 = df0.applymap(norm_txt)
  df0 = df0.applymap(norm_txt)


In [19]:
# Cantonal parliaments: seats by party & sex (extract Greens) 
import pandas as pd, re, csv
from pathlib import Path

YEAR_MIN, YEAR_MAX = 2014, 2025
xlsx_path = RAW / "policy_parties_cantons.xlsx"
print("Reading:", xlsx_path.resolve())
assert xlsx_path.exists(), f"Can't find: {xlsx_path}"

def norm_txt(s):
    if pd.isna(s): return ""
    s = str(s).replace("\xa0"," ").strip()
    return re.sub(r"\s+", " ", s)

def to_int_or_none(x):
    if pd.isna(x): return None
    s = str(x)
    s = s.replace("\u202f","").replace("\xa0","").replace(" ", "")
    s = s.replace("…","").replace("...","").replace("*","").replace(",", ".")
    s = re.sub(r"[^0-9.\-]", "", s)
    if s == "": return None
    try:
        # some tables indicate absence with ".", I have removed it
        return int(float(s))
    except:
        return None

# 26 cantons (FR/DE/IT -> ISO-2)
CANTON_MAP = {
    "Zurich":"ZH","Zürich":"ZH",
    "Berne":"BE","Bern":"BE",
    "Lucerne":"LU","Luzern":"LU",
    "Uri":"UR",
    "Schwyz":"SZ",
    "Obwald":"OW","Obwalden":"OW",
    "Nidwald":"NW","Nidwalden":"NW",
    "Glaris":"GL","Glarus":"GL",
    "Zoug":"ZG","Zug":"ZG",
    "Fribourg":"FR","Freiburg":"FR",
    "Soleure":"SO","Solothurn":"SO",
    "Bâle-Ville":"BS","Basel-Stadt":"BS","Basel Stadt":"BS",
    "Bâle-Campagne":"BL","Basel-Landschaft":"BL","Basel Landschaft":"BL",
    "Schaffhouse":"SH","Schaffhausen":"SH",
    "Appenzell Rh.-Ext.":"AR","Appenzell Ausserrhoden":"AR","Appenzell Rhodes-Extérieures":"AR",
    "Appenzell Rh.-Int.":"AI","Appenzell Innerrhoden":"AI","Appenzell Rhodes-Intérieures":"AI",
    "Saint-Gall":"SG","St. Gallen":"SG","Sankt Gallen":"SG",
    "Grisons":"GR","Graubünden":"GR","Grigioni":"GR",
    "Argovie":"AG","Aargau":"AG",
    "Thurgovie":"TG","Thurgau":"TG",
    "Tessin":"TI","Ticino":"TI",
    "Vaud":"VD",
    "Valais":"VS","Wallis":"VS",
    "Neuchâtel":"NE","Neuchatel":"NE",
    "Genève":"GE","Geneve":"GE","Genf":"GE",
    "Jura":"JU",
}
CANTON_SET = set(CANTON_MAP.keys())

# regex to identify columns
RX_CANTON = re.compile(r"\bcanton[s]?\b", re.I)
RX_YEAR   = re.compile(r"ann[ée]e.*[ée]lectorale|ann[ée]e\s*électorale|année électorale", re.I)
# variations of “Vert-e-s” or "Greens"
RX_GREENS = re.compile(r"\b(vert[\-\s]*e[\-\s]*s|les\s*verts|verts|pes)\b", re.I)
RX_SUB_F  = re.compile(r"\bF\b", re.I)
RX_SUB_H  = re.compile(r"\bH\b", re.I)

xls = pd.ExcelFile(xlsx_path)

def build_col_labels(df, n_header=6):
    n = min(n_header, len(df))
    head = df.iloc[:n, :].fillna("")
    labels = []
    for c in range(df.shape[1]):
        toks = [norm_txt(t) for t in head.iloc[:, c].tolist() if norm_txt(t)]
        labels.append(" | ".join(toks))
    return labels

rows = []

for sheet in xls.sheet_names:
    df0 = pd.read_excel(xlsx_path, sheet_name=sheet, header=None, dtype=str)
    df0 = df0.applymap(norm_txt)
    if df0.empty: 
        continue

    labels = build_col_labels(df0, n_header=6)

    # Find column Cantons and Year
    canton_col, year_col = None, None
    for j, lab in enumerate(labels):
        if canton_col is None and RX_CANTON.search(lab):
            canton_col = j
        if year_col is None and RX_YEAR.search(lab):
            year_col = j
    # If there are no clear clues, we try heuristics: district = column 0
    if canton_col is None: canton_col = 0

    # Colonnes Greens F/H
    greens_F_idx, greens_H_idx = None, None
    all_party_indices = []  # to calculate total seats (sum of all F/M)
    for j, lab in enumerate(labels):
        # identify any F/H subheading column linked to a party
        if RX_SUB_F.search(lab) or RX_SUB_H.search(lab):
            all_party_indices.append(j)
        # F/M marker under Greens
        if RX_GREENS.search(lab) and RX_SUB_F.search(lab):
            greens_F_idx = j
        if RX_GREENS.search(lab) and RX_SUB_H.search(lab):
            greens_H_idx = j

    # data = lines under the header (the raw data has some text and images in the first cells of the excel, that we need to skip)
    start = 6  # skip the 6 merged header lines
    data = df0.iloc[start:, :].reset_index(drop=True)

    for i in range(len(data)):
        canton_label = data.iat[i, canton_col] if canton_col < data.shape[1] else ""
        if not canton_label or canton_label not in CANTON_SET:
            continue
        code = CANTON_MAP[canton_label]

        # year of mandate: if there is a dedicated column, otherwise attempt to extract the first year present on the line
        year_val = None
        if year_col is not None and year_col < data.shape[1]:
            yraw = data.iat[i, year_col]
            ydigits = re.findall(r"\d{4}", yraw) if isinstance(yraw, str) else []
            year_val = int(ydigits[0]) if ydigits else None

        # "greens" seats
        gF = to_int_or_none(data.iat[i, greens_F_idx]) if greens_F_idx is not None and greens_F_idx < data.shape[1] else None
        gH = to_int_or_none(data.iat[i, greens_H_idx]) if greens_H_idx is not None and greens_H_idx < data.shape[1] else None
        greens_seats = (gF or 0) + (gH or 0)

        # total seats (sum of all F/H columns of parties)
        total_seats = 0
        for j in all_party_indices:
            if j < data.shape[1]:
                v = to_int_or_none(data.iat[i, j])
                if v is not None:
                    total_seats += v

        if year_val is None:
            continue  # we only keep it if we have the year

        rows.append([code, canton_label, year_val, greens_seats, total_seats])

# Exports
out = pd.DataFrame(rows, columns=["canton_code","canton","year","greens_seats","total_seats"])
out = out.dropna(subset=["year"]).copy()
out["year"] = out["year"].astype(int)
out = out[(out["year"]>=YEAR_MIN) & (out["year"]<=YEAR_MAX)]
out["greens_share"] = out.apply(lambda r: (r.greens_seats / r.total_seats) if r.total_seats else None, axis=1)
out = out.sort_values(["canton","year"]).reset_index(drop=True)

full_out   = INTER / "cantonal_parliaments_greens_clean_full.csv"
subset_out = INTER / "cantonal_parliaments_greens_2015_2024.csv"  # pour ton panel

# full (2014–2025)
with open(full_out, "w", newline="", encoding="utf-8") as f:
    w = csv.writer(f); w.writerow(out.columns)
    for r in out.itertuples(index=False):
        w.writerow(list(r))

# subset (2015–2024)
subset = out[(out["year"]>=2015) & (out["year"]<=2024)]
with open(subset_out, "w", newline="", encoding="utf-8") as f:
    w = csv.writer(f); w.writerow(out.columns)
    for r in subset.itertuples(index=False):
        w.writerow(list(r))

print("Saved as:", full_out.name, "|", subset_out.name)
print("Years (full):", out["year"].min() if not out.empty else None, "→", out["year"].max() if not out.empty else None)
print("Cantons:", out["canton"].nunique(), "| Lines:", len(out))


Reading: C:\Users\hamza\OneDrive\Desktop\projet-ada-hk\data\raw\policy_parties_cantons.xlsx


  df0 = df0.applymap(norm_txt)
  df0 = df0.applymap(norm_txt)
  df0 = df0.applymap(norm_txt)
  df0 = df0.applymap(norm_txt)
  df0 = df0.applymap(norm_txt)
  df0 = df0.applymap(norm_txt)
  df0 = df0.applymap(norm_txt)
  df0 = df0.applymap(norm_txt)
  df0 = df0.applymap(norm_txt)
  df0 = df0.applymap(norm_txt)
  df0 = df0.applymap(norm_txt)
  df0 = df0.applymap(norm_txt)
  df0 = df0.applymap(norm_txt)
  df0 = df0.applymap(norm_txt)
  df0 = df0.applymap(norm_txt)
  df0 = df0.applymap(norm_txt)
  df0 = df0.applymap(norm_txt)
  df0 = df0.applymap(norm_txt)
  df0 = df0.applymap(norm_txt)
  df0 = df0.applymap(norm_txt)
  df0 = df0.applymap(norm_txt)
  df0 = df0.applymap(norm_txt)
  df0 = df0.applymap(norm_txt)
  df0 = df0.applymap(norm_txt)
  df0 = df0.applymap(norm_txt)
  df0 = df0.applymap(norm_txt)
  df0 = df0.applymap(norm_txt)
  df0 = df0.applymap(norm_txt)
  df0 = df0.applymap(norm_txt)
  df0 = df0.applymap(norm_txt)
  df0 = df0.applymap(norm_txt)
  df0 = df0.applymap(norm_txt)


Saved as: cantonal_parliaments_greens_clean_full.csv | cantonal_parliaments_greens_2015_2024.csv
Years (full): 2014 → 2025
Cantons: 26 | Lines: 221


  df0 = df0.applymap(norm_txt)


In [20]:
# Canton-level climate & CO2 (static) -> tidy + panel (2015–2024)
import pandas as pd, re, csv

YEAR_MIN, YEAR_MAX = 2015, 2024
xlsx_path = RAW / "canton_climate_co2.xlsx"
print("Lecture:", xlsx_path.resolve())
assert xlsx_path.exists(), f"Introuvable: {xlsx_path}"

def norm_txt(s):
    if pd.isna(s): return ""
    s = str(s).strip().replace("\xa0"," ")
    return re.sub(r"\s+", " ", s)

def to_float(x):
    if pd.isna(x): return None
    s = str(x).replace("\u202f","").replace("\xa0","").replace(" ", "").replace(",", ".")
    s = re.sub(r"[^0-9.\-]", "", s)
    try: return float(s) if s else None
    except: return None

# Mapping canton FR/DE/IT -> code ISO-2
CANTON_MAP = {
    "Zurich":"ZH","Zürich":"ZH",
    "Berne":"BE","Bern":"BE",
    "Lucerne":"LU","Luzern":"LU",
    "Uri":"UR",
    "Schwyz":"SZ",
    "Obwald":"OW","Obwalden":"OW","Obwald.":"OW",
    "Nidwald":"NW","Nidwalden":"NW","Nidwald.":"NW",
    "Glaris":"GL","Glarus":"GL",
    "Zug":"ZG","Zoug":"ZG",
    "Fribourg":"FR","Freiburg":"FR",
    "Soleure":"SO","Solothurn":"SO",
    "Bâle-Ville":"BS","Basel-Stadt":"BS","Basel Stadt":"BS","Basel-Stadt":"BS",
    "Bâle-Campagne":"BL","Basel-Landschaft":"BL","Basel Landschaft":"BL","Basel-Landschaft":"BL",
    "Schaffhouse":"SH","Schaffhausen":"SH",
    "Appenzell Rhodes-Extérieures":"AR","Appenzell Rh.-Ext.":"AR","Appenzell Ausserrhoden":"AR","Appenzell A. Rh. Ext.":"AR",
    "Appenzell Rhodes-Intérieures":"AI","Appenzell Rh.-Int.":"AI","Appenzell Innerrhoden":"AI","Appenzell I. Rh. Int.":"AI",
    "Saint-Gall":"SG","St. Gallen":"SG","Sankt Gallen":"SG","St. Gall":"SG",
    "Grisons":"GR","Graubünden":"GR","Graubuenden":"GR","Grigioni":"GR","Graubünde":"GR",
    "Argovie":"AG","Aargau":"AG",
    "Thurgovie":"TG","Thurgau":"TG",
    "Tessin":"TI","Ticino":"TI",
    "Vaud":"VD",
    "Valais":"VS","Wallis":"VS",
    "Neuchâtel":"NE","Neuchatel":"NE",
    "Genève":"GE","Geneve":"GE","Genf":"GE","Geneva":"GE",
    "Jura":"JU",
}

raw = pd.read_excel(xlsx_path, dtype=str)
raw.columns = [norm_txt(c) for c in raw.columns]

# detect columns by approximate name (in case the headings change slightly)
col_canton = next((c for c in raw.columns if re.search(r"\bcanton\b", c, re.I)), raw.columns[0])
col_summer = next((c for c in raw.columns if re.search(r"summer|été|ete", c, re.I)), None)
col_winter = next((c for c in raw.columns if re.search(r"winter|hiver", c, re.I)), None)
col_co2    = next((c for c in raw.columns if re.search(r"co2|émission|emission", c, re.I)), None)

df = raw[[col_canton, col_summer, col_winter, col_co2]].copy()
df.rename(columns={
    col_canton: "canton",
    col_summer: "summer_temp_c",
    col_winter: "winter_temp_c",
    col_co2:    "co2_emissions_mt"
}, inplace=True)

# cleaning the values
df["canton"] = df["canton"].map(norm_txt)
df["summer_temp_c"] = df["summer_temp_c"].apply(to_float)
df["winter_temp_c"] = df["winter_temp_c"].apply(to_float)
df["co2_emissions_mt"] = df["co2_emissions_mt"].apply(to_float)

# map codes & filter for the 26 cantons
df["canton_code"] = df["canton"].map(CANTON_MAP)
df = df[df["canton_code"].notna()].copy()

# static export (canton level)
static_out = DATA_INTER / "canton_climate_co2_clean.csv"
with open(static_out, "w", newline="", encoding="utf-8") as f:
    w = csv.writer(f); w.writerow(["canton_code","canton","summer_temp_c","winter_temp_c","co2_emissions_mt"])
    for r in df.itertuples(index=False):
        w.writerow([r.canton_code, r.canton, r.summer_temp_c, r.winter_temp_c, r.co2_emissions_mt])

# Panel 2015–2024 (constant values per canton)
years = pd.DataFrame({"year": list(range(YEAR_MIN, YEAR_MAX+1))})
panel = (df.merge(years, how="cross")[["canton_code","canton","year","summer_temp_c","winter_temp_c","co2_emissions_mt"]]
           .sort_values(["canton","year"]).reset_index(drop=True))

panel_out = DATA_INTER / f"canton_climate_co2_panel_{YEAR_MIN}_{YEAR_MAX}.csv"
with open(panel_out, "w", newline="", encoding="utf-8") as f:
    w = csv.writer(f); w.writerow(panel.columns)
    for r in panel.itertuples(index=False):
        w.writerow(list(r))

print("Saved as", static_out.name, "|", panel_out.name)
print("Cantons:", df['canton'].nunique(), "| Years covered:", YEAR_MIN, "→", YEAR_MAX)


Lecture: C:\Users\hamza\OneDrive\Desktop\projet-ada-hk\data\raw\canton_climate_co2.xlsx
Saved as canton_climate_co2_clean.csv | canton_climate_co2_panel_2015_2024.csv
Cantons: 26 | Years covered: 2015 → 2024
