## Baseline Models for Unemployment Prediction

To benchmark the performance of our custom simulation-based pipeline, we first evaluate several standard supervised models trained directly on auxiliary features (e.g., voting data) to predict unemployment rates.

The baseline models include:
- **Decision Tree**
- **Linear Regression**
- **Random Forest Regressor**
- **XGBoost**

These models are trained using region-level (Kraj) unemployment rates (`Kraj_Unemployment`) as targets and are evaluated against the real district-level (Okres) unemployment rates (`Real_Unemployment`); they do not rely on clustering or simulated labels.

This baseline setup represents a fully supervised learning scenario and allows us to assess whether our method adds value when direct fine-grained supervision is not feasible or available.


In [1]:
import pandas as pd
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error

# === Decision Tree baseline: train on Kraj-level data, predict per Okres ===

# Column names as they are spelled in the input files.
# NOTE(review): these literals look mis-encoded (UTF-8 decoded as cp1252);
# confirm they match the headers of the data files before changing them.
PARTY_COL = "VolebnÃ­ strana"
RATE_COL = "PodÃ­l nezamÄ›stnanÃ½ch osob [%]"
VOTES_COL = "Hlasy abs."


def _normalize_names(names):
    """Return region names lower-cased, without spaces and without diacritics.

    Applied to the "Okres" and "Kraj" columns of every input frame so that
    the same region compares equal across files when merging.
    """
    return (
        names.str.lower()
        .str.replace(" ", "", regex=False)
        .str.normalize("NFKD")
        .str.encode("ascii", errors="ignore")
        .str.decode("utf-8")
    )


# === Load Data ===
df_voting = pd.read_excel("./DATA/combined_with_kraj_okres.xlsx")
df_real_okres = pd.read_csv("./DATA/Formatted_Okres_Data.csv")
df_kraj_unemployment = pd.read_csv("./DATA/nezamestnanost.csv")

# === Normalize names ===
for df in [df_voting, df_real_okres, df_kraj_unemployment]:
    for key in ("Okres", "Kraj"):
        if key in df.columns:
            df[key] = _normalize_names(df[key])

# === Clean voting data ===
# Strip non-breaking spaces and switch decimal commas to dots before casting.
df_voting[VOTES_COL] = (
    df_voting[VOTES_COL]
    .astype(str)
    .str.replace("\xa0", "", regex=False)
    .str.replace(",", ".", regex=False)
    .astype(float)
)

# === Pivot to wide format (one row per Okres, one column per party) ===
df_voting_pivot = df_voting.pivot_table(
    index=["Kraj", "Okres"],
    columns=PARTY_COL,
    values=VOTES_COL,
    aggfunc="sum",
    fill_value=0,
).reset_index()

# === Aggregate KRAJ-level data (one row per Kraj, one column per party) ===
df_kraj_voting = df_voting.groupby(["Kraj", PARTY_COL])[VOTES_COL].sum().reset_index()
df_kraj_wide = df_kraj_voting.pivot_table(
    index="Kraj",
    columns=PARTY_COL,
    values=VOTES_COL,
    aggfunc="sum",
    fill_value=0,
).reset_index()

df_kraj_merged = pd.merge(df_kraj_wide, df_kraj_unemployment, on="Kraj", how="left")
df_kraj_merged.rename(columns={RATE_COL: "Kraj_Unemployment"}, inplace=True)
# The left join leaves a NaN target for any Kraj without an unemployment
# record, and fitting on NaN targets raises; drop those rows instead.
df_kraj_merged = df_kraj_merged.dropna(subset=["Kraj_Unemployment"])

# === Prepare training data ===
X_kraj = df_kraj_merged.drop(columns=["Kraj", "Kraj_Unemployment"])
y_kraj = df_kraj_merged["Kraj_Unemployment"]

# Scale features
# NOTE(review): the scaler is fitted on Kraj-level vote totals but later
# applied to Okres-level counts, which are on a smaller scale — consider
# vote shares instead; left unchanged to keep the reported numbers.
scaler = StandardScaler()
X_kraj_scaled = scaler.fit_transform(X_kraj)

# === Train model ===
model = DecisionTreeRegressor(random_state=42)
model.fit(X_kraj_scaled, y_kraj)

# === Prepare OKRES data ===
# Align to the training columns: same order, columns absent at Okres level
# are filled with 0, columns unknown to the model are dropped.
X_okres = df_voting_pivot.drop(columns=["Kraj", "Okres"]).reindex(
    columns=X_kraj.columns, fill_value=0
)

X_okres_scaled = scaler.transform(X_okres)
df_voting_pivot["Predicted_Unemployment"] = model.predict(X_okres_scaled)

# === Merge with real OKRES unemployment for evaluation ===
df_eval = pd.merge(
    df_voting_pivot,
    df_real_okres[["Okres", RATE_COL]],
    on="Okres",
    how="left",
)
df_eval.rename(columns={RATE_COL: "Real_Unemployment"}, inplace=True)

# === Compute MAE (only districts that have a real rate) ===
df_eval_clean = df_eval.dropna(subset=["Real_Unemployment"])
mae = mean_absolute_error(df_eval_clean["Real_Unemployment"], df_eval_clean["Predicted_Unemployment"])
print(f"ðŸŒ³ Decision Tree MAE (trained on Kraj, predicted on Okres): {mae:.4f}")

# === Save results ===
df_output = df_eval_clean[["Okres", "Real_Unemployment", "Predicted_Unemployment"]]
# df_output.to_csv("baseline_dt_kraj_to_okres_predictions.csv", index=False)
# print("âœ… Results saved to 'baseline_dt_kraj_to_okres_predictions.csv'")


🌳 Decision Tree MAE (trained on Kraj, predicted on Okres): 1.1307
✅ Results saved to 'baseline_dt_kraj_to_okres_predictions.csv'


In [2]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error

# --- Linear Regression baseline: fit on Kraj totals, predict each Okres ---

# Input locations
voting_file = "./DATA/combined_with_kraj_okres.xlsx"
real_okres_file = "./DATA/Formatted_Okres_Data.csv"
kraj_unemployment_file = "./DATA/nezamestnanost.csv"

# Column names as spelled in the input files.
# NOTE(review): these look mis-encoded — confirm against the file headers.
party_col = "VolebnÃ­ strana"
rate_col = "PodÃ­l nezamÄ›stnanÃ½ch osob [%]"

df_voting = pd.read_excel(voting_file)
df_real_okres = pd.read_csv(real_okres_file)
df_kraj_unemployment = pd.read_csv(kraj_unemployment_file)

# Bring "Okres"/"Kraj" to a common ASCII, lower-case, space-free form so the
# frames can be merged on them.
for df in [df_voting, df_real_okres, df_kraj_unemployment]:
    for region_col in ("Okres", "Kraj"):
        if region_col not in df.columns:
            continue
        compact = df[region_col].str.lower().str.replace(" ", "")
        df[region_col] = (
            compact.str.normalize("NFKD")
            .str.encode("ascii", errors="ignore")
            .str.decode("utf-8")
        )

# Vote counts: remove non-breaking spaces, use "." as decimal mark, cast.
votes_text = df_voting["Hlasy abs."].astype(str)
votes_text = votes_text.str.replace("\xa0", "", regex=True)
votes_text = votes_text.str.replace(",", ".", regex=True)
df_voting["Hlasy abs."] = votes_text.astype(float)

# Both pivots share everything except the index level(s).
pivot_options = dict(
    columns=party_col,
    values="Hlasy abs.",
    aggfunc="sum",
    fill_value=0,
)

# Training table: per-Kraj party totals joined with the Kraj unemployment rate.
df_kraj_votes = df_voting.pivot_table(index="Kraj", **pivot_options).reset_index()
df_kraj_train = df_kraj_votes.merge(df_kraj_unemployment, on="Kraj", how="inner")
df_kraj_train = df_kraj_train.rename(columns={rate_col: "Kraj_Unemployment"})

y_kraj = df_kraj_train["Kraj_Unemployment"]
X_kraj = df_kraj_train.drop(columns=["Kraj", "Kraj_Unemployment"])

# NOTE(review): the scaler learns Kraj-level totals and is reused on
# Okres-level counts further down — confirm this is intended.
scaler = StandardScaler()
X_kraj_scaled = scaler.fit_transform(X_kraj)

model = LinearRegression()
model.fit(X_kraj_scaled, y_kraj)

# Prediction table: per-Okres party totals.
df_okres_votes = df_voting.pivot_table(index=["Kraj", "Okres"], **pivot_options).reset_index()
X_okres = df_okres_votes.drop(columns=["Kraj", "Okres"])

# Give the Okres frame exactly the training columns, in training order;
# anything the Okres pivot lacks is added as a constant 0.
for absent in [c for c in X_kraj.columns if c not in X_okres.columns]:
    X_okres[absent] = 0
X_okres = X_okres[X_kraj.columns]

X_okres_scaled = scaler.transform(X_okres)
df_okres_votes["LR_Predicted"] = model.predict(X_okres_scaled)

# Attach the real Okres-level rates and keep only rows that have one.
df_eval = df_okres_votes.merge(df_real_okres, on="Okres", how="left")
df_eval = df_eval.rename(columns={rate_col: "Real_Unemployment"})
df_eval_clean = df_eval[df_eval["Real_Unemployment"].notna()]

mae = mean_absolute_error(df_eval_clean["Real_Unemployment"], df_eval_clean["LR_Predicted"])
print(f"ðŸ“‰ MAE (Kraj-trained Linear Regression â†’ Okres prediction): {mae:.4f}")

# # Save output
# df_output = df_eval_clean[["Okres", "Real_Unemployment", "LR_Predicted"]]
# df_output.to_csv("baseline_lr_kraj_to_okres.csv", index=False)
# print("âœ… Results saved to 'baseline_lr_kraj_to_okres.csv'")

📉 MAE (Kraj-trained Linear Regression → Okres prediction): 0.8497


In [3]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error

# --- Random Forest baseline: Okres rows labelled with their Kraj's rate ---

# Input locations
voting_file = "./DATA/combined_with_kraj_okres.xlsx"
unemployment_file = "./DATA/nezamestnanost.csv"
real_okres_file = "./DATA/Formatted_Okres_Data.csv"

# Column names as spelled in the input files.
# NOTE(review): these look mis-encoded — confirm against the file headers.
party_col = "VolebnÃ­ strana"
rate_col = "PodÃ­l nezamÄ›stnanÃ½ch osob [%]"

df_voting = pd.read_excel(voting_file)
df_real = pd.read_csv(real_okres_file)
df_kraj = pd.read_csv(unemployment_file)


def _to_join_key(series):
    """Lower-case, remove spaces and strip diacritics from a name column."""
    compact = series.str.lower().str.replace(" ", "")
    return compact.str.normalize("NFKD").str.encode("ascii", errors="ignore").str.decode("utf-8")


# Make "Okres"/"Kraj" comparable across the three frames.
for df in [df_voting, df_real, df_kraj]:
    for region_col in ("Okres", "Kraj"):
        if region_col in df.columns:
            df[region_col] = _to_join_key(df[region_col])

# Vote counts: remove non-breaking spaces, use "." as decimal mark, cast.
votes_text = df_voting["Hlasy abs."].astype(str)
votes_text = votes_text.str.replace("\xa0", "", regex=True)
votes_text = votes_text.str.replace(",", ".", regex=True)
df_voting["Hlasy abs."] = votes_text.astype(float)

# One row per Okres, one column per party, then attach the Kraj-level rate
# to every Okres row of that Kraj.
okres_votes_wide = df_voting.pivot_table(
    index=["Kraj", "Okres"],
    columns=party_col,
    values="Hlasy abs.",
    aggfunc="sum",
    fill_value=0,
).reset_index()
df_voting_pivot = okres_votes_wide.merge(df_kraj, on="Kraj", how="left")
df_voting_pivot = df_voting_pivot.rename(columns={rate_col: "Kraj_Unemployment"})

# Features are every column except the identifiers and the target.
exclude_columns = ["Kraj", "Okres", "Kraj_Unemployment"]
y = df_voting_pivot["Kraj_Unemployment"]
X = df_voting_pivot.drop(columns=exclude_columns)

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

model = RandomForestRegressor(random_state=42)
model.fit(X_scaled, y)

# NOTE(review): predictions are made on the same rows the model was fitted
# on (targets are Kraj-level, evaluation below is Okres-level).
df_voting_pivot["Predicted_Unemployment"] = model.predict(X_scaled)

# Attach the real Okres-level rates and keep only rows that have one.
df_eval = df_voting_pivot.merge(df_real[["Okres", rate_col]], on="Okres", how="left")
df_eval = df_eval.rename(columns={rate_col: "Real_Unemployment"})
df_eval_clean = df_eval[df_eval["Real_Unemployment"].notna()]

mae = mean_absolute_error(df_eval_clean["Real_Unemployment"], df_eval_clean["Predicted_Unemployment"])
print(f"ðŸŒ² Random Forest MAE (Kraj â†’ Okres): {mae:.4f}")

# # Save results
# df_output = df_eval_clean[["Okres", "Real_Unemployment", "Predicted_Unemployment"]]
# df_output.to_csv("baseline_rf_kraj_to_okres.csv", index=False)
# print("âœ… Results saved to 'baseline_rf_kraj_to_okres.csv'")

🌲 Random Forest MAE (Kraj → Okres): 0.7186


In [4]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error
from xgboost import XGBRegressor

# === XGBoost baseline: train on Kraj-level data, predict per Okres ===

# Column names as they are spelled in the input files.
# NOTE(review): these literals look mis-encoded (UTF-8 decoded as cp1252);
# confirm they match the headers of the data files before changing them.
PARTY_COL = "VolebnÃ­ strana"
RATE_COL = "PodÃ­l nezamÄ›stnanÃ½ch osob [%]"
VOTES_COL = "Hlasy abs."


def _normalize_names(names):
    """Return region names lower-cased, without spaces and without diacritics.

    Applied to the "Okres" and "Kraj" columns of every input frame so that
    the same region compares equal across files when merging.
    """
    return (
        names.str.lower()
        .str.replace(" ", "", regex=False)
        .str.normalize("NFKD")
        .str.encode("ascii", errors="ignore")
        .str.decode("utf-8")
    )


# === Load Data ===
df_voting = pd.read_excel("./DATA/combined_with_kraj_okres.xlsx")
df_real_okres = pd.read_csv("./DATA/Formatted_Okres_Data.csv")
df_kraj_unemployment = pd.read_csv("./DATA/nezamestnanost.csv")

# === Normalize names ===
for df in [df_voting, df_real_okres, df_kraj_unemployment]:
    for key in ("Okres", "Kraj"):
        if key in df.columns:
            df[key] = _normalize_names(df[key])

# === Clean voting data ===
# Strip non-breaking spaces and switch decimal commas to dots before casting.
df_voting[VOTES_COL] = (
    df_voting[VOTES_COL]
    .astype(str)
    .str.replace("\xa0", "", regex=False)
    .str.replace(",", ".", regex=False)
    .astype(float)
)

# === Pivot to wide format (one row per Okres, one column per party) ===
df_voting_pivot = df_voting.pivot_table(
    index=["Kraj", "Okres"],
    columns=PARTY_COL,
    values=VOTES_COL,
    aggfunc="sum",
    fill_value=0,
).reset_index()

# === Aggregate KRAJ-level data (one row per Kraj, one column per party) ===
df_kraj_voting = df_voting.groupby(["Kraj", PARTY_COL])[VOTES_COL].sum().reset_index()
df_kraj_wide = df_kraj_voting.pivot_table(
    index="Kraj",
    columns=PARTY_COL,
    values=VOTES_COL,
    aggfunc="sum",
    fill_value=0,
).reset_index()

df_kraj_merged = pd.merge(df_kraj_wide, df_kraj_unemployment, on="Kraj", how="left")
df_kraj_merged.rename(columns={RATE_COL: "Kraj_Unemployment"}, inplace=True)
# The left join leaves a NaN target for any Kraj without an unemployment
# record, and fitting on NaN targets raises; drop those rows instead.
df_kraj_merged = df_kraj_merged.dropna(subset=["Kraj_Unemployment"])

# === Prepare training data ===
X_kraj = df_kraj_merged.drop(columns=["Kraj", "Kraj_Unemployment"])
y_kraj = df_kraj_merged["Kraj_Unemployment"]

# Scale features
# NOTE(review): the scaler is fitted on Kraj-level vote totals but later
# applied to Okres-level counts, which are on a smaller scale — consider
# vote shares instead; left unchanged to keep the reported numbers.
scaler = StandardScaler()
X_kraj_scaled = scaler.fit_transform(X_kraj)

# === Train XGBoost model ===
model = XGBRegressor(random_state=42)
model.fit(X_kraj_scaled, y_kraj)

# === Prepare OKRES data ===
# Align to the training columns: same order, columns absent at Okres level
# are filled with 0, columns unknown to the model are dropped.
X_okres = df_voting_pivot.drop(columns=["Kraj", "Okres"]).reindex(
    columns=X_kraj.columns, fill_value=0
)

X_okres_scaled = scaler.transform(X_okres)
df_voting_pivot["Predicted_Unemployment"] = model.predict(X_okres_scaled)

# === Merge with real OKRES unemployment for evaluation ===
df_eval = pd.merge(
    df_voting_pivot,
    df_real_okres[["Okres", RATE_COL]],
    on="Okres",
    how="left",
)
df_eval.rename(columns={RATE_COL: "Real_Unemployment"}, inplace=True)

# === Compute MAE (only districts that have a real rate) ===
df_eval_clean = df_eval.dropna(subset=["Real_Unemployment"])
mae = mean_absolute_error(df_eval_clean["Real_Unemployment"], df_eval_clean["Predicted_Unemployment"])
print(f"âš¡ XGBoost MAE (trained on Kraj, predicted on Okres): {mae:.4f}")

# # === Save results ===
# df_output = df_eval_clean[["Okres", "Real_Unemployment", "Predicted_Unemployment"]]
# df_output.to_csv("baseline_xgb_kraj_to_okres_predictions.csv", index=False)
# print("âœ… Results saved to 'baseline_xgb_kraj_to_okres_predictions.csv'")

⚡ XGBoost MAE (trained on Kraj, predicted on Okres): 1.2393
