## Baseline Models for Unemployment Prediction

To benchmark the performance of our custom simulation-based pipeline, we first evaluate several standard supervised models trained directly on auxiliary features (e.g., voting data) to predict unemployment rates.

The baseline models include:
- **Decision Tree**
- **Linear Regression**
- **Random Forest Regressor**
- **XGBoost**

These models are trained using the real district-level unemployment rates (`Real_Unemployment`) as targets and do not rely on clustering or simulated labels.

This baseline setup represents a fully supervised learning scenario and allows us to assess whether our method adds value when direct fine-grained supervision is not feasible or available.


In [11]:
import pandas as pd
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error

# === Load Data ===
df_voting = pd.read_excel("./DATA/combined_with_kraj_okres.xlsx")
df_real_okres = pd.read_csv("./DATA/Formatted_Okres_Data.csv")
df_kraj_unemployment = pd.read_csv("./DATA/nezamestnanost.csv")

# === Normalize names ===
for df in [df_voting, df_real_okres, df_kraj_unemployment]:
    if "Okres" in df.columns:
        df["Okres"] = (
            df["Okres"]
            .str.lower()
            .str.replace(" ", "")
            .str.normalize("NFKD").str.encode("ascii", errors="ignore").str.decode("utf-8")
        )
    if "Kraj" in df.columns:
        df["Kraj"] = (
            df["Kraj"]
            .str.lower()
            .str.replace(" ", "")
            .str.normalize("NFKD").str.encode("ascii", errors="ignore").str.decode("utf-8")
        )

# === Clean voting data ===
df_voting["Hlasy abs."] = (
    df_voting["Hlasy abs."]
    .astype(str)
    .str.replace("\xa0", "", regex=True)
    .str.replace(",", ".", regex=True)
    .astype(float)
)

# === Pivot to wide format ===
df_voting_pivot = df_voting.pivot_table(
    index=["Kraj", "Okres"],
    columns="Volebn√≠ strana",
    values="Hlasy abs.",
    aggfunc="sum",
    fill_value=0
).reset_index()

# === Aggregate KRAJ-level data ===
df_kraj_voting = df_voting.groupby(["Kraj", "Volebn√≠ strana"])["Hlasy abs."].sum().reset_index()
df_kraj_wide = df_kraj_voting.pivot_table(
    index="Kraj",
    columns="Volebn√≠ strana",
    values="Hlasy abs.",
    fill_value=0
).reset_index()

df_kraj_merged = pd.merge(df_kraj_wide, df_kraj_unemployment, on="Kraj", how="left")
df_kraj_merged.rename(columns={"Pod√≠l nezamƒõstnan√Ωch osob [%]": "Kraj_Unemployment"}, inplace=True)

# === Prepare training data ===
X_kraj = df_kraj_merged.drop(columns=["Kraj", "Kraj_Unemployment"])
y_kraj = df_kraj_merged["Kraj_Unemployment"]

# Scale features
scaler = StandardScaler()
X_kraj_scaled = scaler.fit_transform(X_kraj)

# === Train model ===
model = DecisionTreeRegressor(random_state=42)
model.fit(X_kraj_scaled, y_kraj)

# === Prepare OKRES data ===
X_okres = df_voting_pivot.drop(columns=["Kraj", "Okres"])

# Align features
missing_cols = set(X_kraj.columns) - set(X_okres.columns)
for col in missing_cols:
    X_okres[col] = 0
X_okres = X_okres[X_kraj.columns]

X_okres_scaled = scaler.transform(X_okres)
df_voting_pivot["Predicted_Unemployment"] = model.predict(X_okres_scaled)

# === Merge with real OKRES unemployment for evaluation ===
df_eval = pd.merge(
    df_voting_pivot,
    df_real_okres[["Okres", "Pod√≠l nezamƒõstnan√Ωch osob [%]"]],
    on="Okres",
    how="left"
)
df_eval.rename(columns={"Pod√≠l nezamƒõstnan√Ωch osob [%]": "Real_Unemployment"}, inplace=True)

# === Compute MAE ===
df_eval_clean = df_eval.dropna(subset=["Real_Unemployment"])
mae = mean_absolute_error(df_eval_clean["Real_Unemployment"], df_eval_clean["Predicted_Unemployment"])
print(f"üå≥ Decision Tree MAE (trained on Kraj, predicted on Okres): {mae:.4f}")

# === Save results ===
df_output = df_eval_clean[["Okres", "Real_Unemployment", "Predicted_Unemployment"]]
# df_output.to_csv("baseline_dt_kraj_to_okres_predictions.csv", index=False)
# print("‚úÖ Results saved to 'baseline_dt_kraj_to_okres_predictions.csv'")


üå≥ Decision Tree MAE (trained on Kraj, predicted on Okres): 1.1307


In [12]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error

# Load data
voting_file = "./DATA/combined_with_kraj_okres.xlsx"
real_okres_file = "./DATA/Formatted_Okres_Data.csv"
kraj_unemployment_file = "./DATA/nezamestnanost.csv"

# Read files
df_voting = pd.read_excel(voting_file)
df_real_okres = pd.read_csv(real_okres_file)
df_kraj_unemployment = pd.read_csv(kraj_unemployment_file)

# Normalize names
for df in [df_voting, df_real_okres, df_kraj_unemployment]:
    if "Okres" in df.columns:
        df["Okres"] = (
            df["Okres"].str.lower().str.replace(" ", "")
            .str.normalize("NFKD").str.encode("ascii", errors="ignore").str.decode("utf-8")
        )
    if "Kraj" in df.columns:
        df["Kraj"] = (
            df["Kraj"].str.lower().str.replace(" ", "")
            .str.normalize("NFKD").str.encode("ascii", errors="ignore").str.decode("utf-8")
        )

# Clean voting numbers
df_voting["Hlasy abs."] = (
    df_voting["Hlasy abs."].astype(str)
    .str.replace("\xa0", "", regex=True)
    .str.replace(",", ".", regex=True)
    .astype(float)
)

# Pivot voting data at Kraj level for training
df_kraj_votes = df_voting.pivot_table(
    index="Kraj",
    columns="Volebn√≠ strana",
    values="Hlasy abs.",
    aggfunc="sum",
    fill_value=0
).reset_index()

# Merge with Kraj-level unemployment
df_kraj_train = pd.merge(df_kraj_votes, df_kraj_unemployment, on="Kraj", how="inner")
df_kraj_train.rename(columns={"Pod√≠l nezamƒõstnan√Ωch osob [%]": "Kraj_Unemployment"}, inplace=True)

# Prepare features and target for training
X_kraj = df_kraj_train.drop(columns=["Kraj", "Kraj_Unemployment"])
y_kraj = df_kraj_train["Kraj_Unemployment"]



scaler = StandardScaler()

X_kraj_scaled = scaler.fit_transform(X_kraj)

# Train model
model = LinearRegression()
model.fit(X_kraj_scaled, y_kraj)

# Prepare Okres-level features for prediction
df_okres_votes = df_voting.pivot_table(
    index=["Kraj", "Okres"],
    columns="Volebn√≠ strana",
    values="Hlasy abs.",
    aggfunc="sum",
    fill_value=0
).reset_index()

X_okres = df_okres_votes.drop(columns=["Kraj", "Okres"])
# üîß Ensure Okres-level data has same features as Kraj-level training
missing_cols = set(X_kraj.columns) - set(X_okres.columns)
for col in missing_cols:
    X_okres[col] = 0
X_okres = X_okres[X_kraj.columns]

X_okres_scaled = scaler.transform(X_okres)

# Predict for Okres-level
df_okres_votes["LR_Predicted"] = model.predict(X_okres_scaled)

# Merge with real unemployment for evaluation
df_eval = pd.merge(df_okres_votes, df_real_okres, on="Okres", how="left")
df_eval.rename(columns={"Pod√≠l nezamƒõstnan√Ωch osob [%]": "Real_Unemployment"}, inplace=True)

# Drop rows without real data
df_eval_clean = df_eval.dropna(subset=["Real_Unemployment"])

# Compute MAE
mae = mean_absolute_error(df_eval_clean["Real_Unemployment"], df_eval_clean["LR_Predicted"])
print(f"üìâ MAE (Kraj-trained Linear Regression ‚Üí Okres prediction): {mae:.4f}")

# # Save output
# df_output = df_eval_clean[["Okres", "Real_Unemployment", "LR_Predicted"]]
# df_output.to_csv("baseline_lr_kraj_to_okres.csv", index=False)
# print("‚úÖ Results saved to 'baseline_lr_kraj_to_okres.csv'")

üìâ MAE (Kraj-trained Linear Regression ‚Üí Okres prediction): 0.8497


In [13]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error

# Load data
voting_file = "./DATA/combined_with_kraj_okres.xlsx"
unemployment_file = "./DATA/nezamestnanost.csv"
real_okres_file = "./DATA/Formatted_Okres_Data.csv"

# Read files
df_voting = pd.read_excel(voting_file)
df_real = pd.read_csv(real_okres_file)
df_kraj = pd.read_csv(unemployment_file)

# Normalize names
for df in [df_voting, df_real, df_kraj]:
    if "Okres" in df.columns:
        df["Okres"] = (
            df["Okres"].str.lower().str.replace(" ", "")
            .str.normalize("NFKD").str.encode("ascii", errors="ignore").str.decode("utf-8")
        )
    if "Kraj" in df.columns:
        df["Kraj"] = (
            df["Kraj"].str.lower().str.replace(" ", "")
            .str.normalize("NFKD").str.encode("ascii", errors="ignore").str.decode("utf-8")
        )

# Clean voting numbers
df_voting["Hlasy abs."] = (
    df_voting["Hlasy abs."].astype(str)
    .str.replace("\xa0", "", regex=True)
    .str.replace(",", ".", regex=True)
    .astype(float)
)

# Pivot voting data
df_voting_pivot = df_voting.pivot_table(
    index=["Kraj", "Okres"],
    columns="Volebn√≠ strana",
    values="Hlasy abs.",
    aggfunc="sum",
    fill_value=0
).reset_index()

# Merge Kraj unemployment rate to each row
df_voting_pivot = pd.merge(df_voting_pivot, df_kraj, on="Kraj", how="left")
df_voting_pivot.rename(columns={"Pod√≠l nezamƒõstnan√Ωch osob [%]": "Kraj_Unemployment"}, inplace=True)

# Prepare features
exclude_columns = ["Kraj", "Okres", "Kraj_Unemployment"]
X = df_voting_pivot.drop(columns=exclude_columns)

# Standardize features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
y = df_voting_pivot["Kraj_Unemployment"]

# Train Random Forest model
model = RandomForestRegressor(random_state=42)
model.fit(X_scaled, y)

# Predict for Okres level
df_voting_pivot["Predicted_Unemployment"] = model.predict(X_scaled)

# Merge with real unemployment for evaluation
df_eval = pd.merge(
    df_voting_pivot,
    df_real[["Okres", "Pod√≠l nezamƒõstnan√Ωch osob [%]"]],
    on="Okres",
    how="left"
)
df_eval.rename(columns={"Pod√≠l nezamƒõstnan√Ωch osob [%]": "Real_Unemployment"}, inplace=True)

# Drop NAs
df_eval_clean = df_eval.dropna(subset=["Real_Unemployment"])

# MAE
mae = mean_absolute_error(df_eval_clean["Real_Unemployment"], df_eval_clean["Predicted_Unemployment"])
print(f"üå≤ Random Forest MAE (Kraj ‚Üí Okres): {mae:.4f}")

# # Save results
# df_output = df_eval_clean[["Okres", "Real_Unemployment", "Predicted_Unemployment"]]
# df_output.to_csv("baseline_rf_kraj_to_okres.csv", index=False)
# print("‚úÖ Results saved to 'baseline_rf_kraj_to_okres.csv'")

üå≤ Random Forest MAE (Kraj ‚Üí Okres): 0.7186


In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor

# Load data
voting_file = "./DATA/combined_with_kraj_okres.xlsx"
unemployment_file = "./DATA/nezamestnanost.csv"
real_okres_file = "./DATA/Formatted_Okres_Data.csv"

# Read files
df_voting = pd.read_excel(voting_file)
df_real = pd.read_csv(real_okres_file)
df_kraj = pd.read_csv(unemployment_file)

# Normalize names
for df in [df_voting, df_real, df_kraj]:
    if "Okres" in df.columns:
        df["Okres"] = (
            df["Okres"].str.lower().str.replace(" ", "")
            .str.normalize("NFKD").str.encode("ascii", errors="ignore").str.decode("utf-8")
        )
    if "Kraj" in df.columns:
        df["Kraj"] = (
            df["Kraj"].str.lower().str.replace(" ", "")
            .str.normalize("NFKD").str.encode("ascii", errors="ignore").str.decode("utf-8")
        )

# Clean voting numbers
df_voting["Hlasy abs."] = (
    df_voting["Hlasy abs."].astype(str)
    .str.replace("\xa0", "", regex=True)
    .str.replace(",", ".", regex=True)
    .astype(float)
)

# Pivot voting data
df_voting_pivot = df_voting.pivot_table(
    index=["Kraj", "Okres"],
    columns="Volebn√≠ strana",
    values="Hlasy abs.",
    aggfunc="sum",
    fill_value=0
).reset_index()

# Merge Kraj unemployment rate to each row
df_voting_pivot = pd.merge(df_voting_pivot, df_kraj, on="Kraj", how="left")
df_voting_pivot.rename(columns={"Pod√≠l nezamƒõstnan√Ωch osob [%]": "Kraj_Unemployment"}, inplace=True)

# Prepare features
exclude_columns = ["Kraj", "Okres", "Kraj_Unemployment"]
X = df_voting_pivot.drop(columns=exclude_columns)

# Standardize features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
y = df_voting_pivot["Kraj_Unemployment"]

# Split data
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42
)

# Create Gradient Boosting model (instead of XGBoost)
model = GradientBoostingRegressor(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=5,
    random_state=42
)
model.fit(X_train, y_train)

# Predict
y_pred = model.predict(X_test)

# Predict for full dataset
df_voting_pivot["Predicted_Unemployment"] = model.predict(X_scaled)

# Merge with real unemployment for evaluation
df_eval = pd.merge(
    df_voting_pivot,
    df_real[["Okres", "Pod√≠l nezamƒõstnan√Ωch osob [%]"]],
    on="Okres",
    how="left"
)
df_eval.rename(columns={"Pod√≠l nezamƒõstnan√Ωch osob [%]": "Real_Unemployment"}, inplace=True)

# Drop NAs
df_eval_clean = df_eval.dropna(subset=["Real_Unemployment"])

# MAE
mae = mean_absolute_error(
    df_eval_clean["Real_Unemployment"],
    df_eval_clean["Predicted_Unemployment"]
)
print(f"üå≤ Gradient Boosting MAE (Kraj ‚Üí Okres): {mae:.4f}")

# Feature importance
feature_importance = pd.DataFrame({
    "feature": X.columns,
    "importance": model.feature_importances_
}).sort_values("importance", ascending=False)

print("\nTop 10 Important Features:")
print(feature_importance.head(10))

# Optional: Save results
# df_output = df_eval_clean[["Okres", "Real_Unemployment", "Predicted_Unemployment"]]
# df_output.to_csv("baseline_gradientboosting_kraj_to_okres.csv", index=False)
# print("‚úÖ Results saved to 'baseline_gradientboosting_kraj_to_okres.csv'")


üå≤ Gradient Boosting MAE (Kraj ‚Üí Okres): 0.7109

Top 10 Important Features:
                                               feature  importance
391                            Poƒçet uchazeƒç≈Ø na 1 VPM    0.956628
31                                             KDU-ƒåSL    0.011775
301                                  Sdru≈æen√≠ STAN, NK    0.007241
269                                Sdru≈æen√≠ Pir√°ti, NK    0.003770
220                               Sdru≈æen√≠ KDU-ƒåSL, NK    0.002707
343  Sdru≈æen√≠ nez√°visl√Ωch kandid√°t≈Ø - m√≠stn√≠ sdru≈æe...    0.002500
318                                Sdru≈æen√≠ TOP 09, NK    0.001940
192                             STAROSTOV√â A NEZ√ÅVISL√ç    0.001870
247                                   Sdru≈æen√≠ NEZ, NK    0.001678
287                                Sdru≈æen√≠ SOCDEM, NK    0.001438


In [4]:
import pandas as pd
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error

# === Load Data ===
df_voting = pd.read_excel("./DATA/combined_with_kraj_okres.xlsx")
df_real_okres = pd.read_csv("./DATA/Formatted_Okres_Data.csv")
df_kraj_unemployment = pd.read_csv("./DATA/nezamestnanost.csv")

# === Normalize names ===
for df in [df_voting, df_real_okres, df_kraj_unemployment]:
    if "Okres" in df.columns:
        df["Okres"] = (
            df["Okres"]
            .str.lower()
            .str.replace(" ", "")
            .str.normalize("NFKD").str.encode("ascii", errors="ignore").str.decode("utf-8")
        )
    if "Kraj" in df.columns:
        df["Kraj"] = (
            df["Kraj"]
            .str.lower()
            .str.replace(" ", "")
            .str.normalize("NFKD").str.encode("ascii", errors="ignore").str.decode("utf-8")
        )

# === Clean voting data ===
df_voting["Hlasy abs."] = (
    df_voting["Hlasy abs."]
    .astype(str)
    .str.replace("\xa0", "", regex=True)
    .str.replace(",", ".", regex=True)
    .astype(float)
)

# === Pivot voting data at KRAJ level for training ===
df_kraj_votes = df_voting.pivot_table(
    index="Kraj",
    columns="Volebn√≠ strana",
    values="Hlasy abs.",
    aggfunc="sum",
    fill_value=0
).reset_index()

# Merge with Kraj-level unemployment
df_kraj_train = pd.merge(df_kraj_votes, df_kraj_unemployment, on="Kraj", how="inner")
df_kraj_train.rename(columns={"Pod√≠l nezamƒõstnan√Ωch osob [%]": "Kraj_Unemployment"}, inplace=True)

# Prepare training features and target
X_kraj = df_kraj_train.drop(columns=["Kraj", "Kraj_Unemployment"])
y_kraj = df_kraj_train["Kraj_Unemployment"]

# Scale features
scaler = StandardScaler()
X_kraj_scaled = scaler.fit_transform(X_kraj)

# === Prepare OKRES-level data for prediction ===
df_okres_votes = df_voting.pivot_table(
    index=["Kraj", "Okres"],
    columns="Volebn√≠ strana",
    values="Hlasy abs.",
    aggfunc="sum",
    fill_value=0
).reset_index()

X_okres = df_okres_votes.drop(columns=["Kraj", "Okres"])

# Align features with training data
missing_cols = set(X_kraj.columns) - set(X_okres.columns)
for col in missing_cols:
    X_okres[col] = 0
X_okres = X_okres[X_kraj.columns]

X_okres_scaled = scaler.transform(X_okres)

# === Initialize results dataframe ===
results = df_okres_votes[["Okres"]].copy()

# Merge with actual unemployment values
results = pd.merge(
    results,
    df_real_okres[["Okres", "Pod√≠l nezamƒõstnan√Ωch osob [%]"]],
    on="Okres",
    how="left"
)
results.rename(columns={"Pod√≠l nezamƒõstnan√Ωch osob [%]": "Actual_Unemployment"}, inplace=True)

# === Define all 4 models ===
models = {
    "Decision_Tree": DecisionTreeRegressor(random_state=42),
    "Linear_Regression": LinearRegression(),
    "Random_Forest": RandomForestRegressor(random_state=42),
    "Gradient_Boosting": GradientBoostingRegressor(
        n_estimators=100,
        learning_rate=0.1,
        max_depth=5,
        random_state=42
    )
}

# === Train and predict with each model ===
for model_name, model in models.items():
    print(f"Training {model_name}...")
    model.fit(X_kraj_scaled, y_kraj)
    predictions = model.predict(X_okres_scaled)
    results[f"Predicted_{model_name}"] = predictions

# === Calculate absolute errors for each model ===
for model_name in models.keys():
    results[f"AbsError_{model_name}"] = abs(
        results["Actual_Unemployment"] - results[f"Predicted_{model_name}"]
    )

# === Save to CSV ===
output_file = "okres_unemployment_predictions_all_models.csv"
results.to_csv(output_file, index=False)
print(f"\n‚úÖ Results saved to {output_file}")
print(f"Total okresy: {len(results)}")
print(f"\nColumns: {list(results.columns)}")

# === Display summary statistics ===
print("\n" + "="*60)
print("SUMMARY - Mean Absolute Error by Model:")
print("="*60)
for model_name in models.keys():
    valid_results = results.dropna(subset=["Actual_Unemployment", f"Predicted_{model_name}"])
    mae = valid_results[f"AbsError_{model_name}"].mean()
    print(f"{model_name:20s}: {mae:.4f}")

print("\n" + "="*60)
print("Preview of results:")
print("="*60)
print(results.head(10).to_string())

Training Decision_Tree...
Training Linear_Regression...
Training Random_Forest...
Training Gradient_Boosting...

‚úÖ Results saved to okres_unemployment_predictions_all_models.csv
Total okresy: 77

Columns: ['Okres', 'Actual_Unemployment', 'Predicted_Decision_Tree', 'Predicted_Linear_Regression', 'Predicted_Random_Forest', 'Predicted_Gradient_Boosting', 'AbsError_Decision_Tree', 'AbsError_Linear_Regression', 'AbsError_Random_Forest', 'AbsError_Gradient_Boosting']

SUMMARY - Mean Absolute Error by Model:
Decision_Tree       : 1.1307
Linear_Regression   : 0.8497
Random_Forest       : 0.8924
Gradient_Boosting   : 1.0435

Preview of results:
              Okres  Actual_Unemployment  Predicted_Decision_Tree  Predicted_Linear_Regression  Predicted_Random_Forest  Predicted_Gradient_Boosting  AbsError_Decision_Tree  AbsError_Linear_Regression  AbsError_Random_Forest  AbsError_Gradient_Boosting
0  hlavnimestopraha                  3.1                      3.1                     3.088383       