# Logistische Regression: Beispielvisualisierung

Vergleich eines erfolgreichen und eines gescheiterten logistischen Regressionsmodells für die Semesterarbeit.

In [None]:
import sqlite3
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from sklearn.metrics import accuracy_score, roc_curve, auc, confusion_matrix
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

# Plot appearance: apply the seaborn theme first, then override the
# matplotlib defaults that every figure in this notebook shares.
sns.set_theme(style="whitegrid", palette="muted")
plt.rcParams.update({
    'figure.figsize': (10, 6),
    'font.size': 11,
    'axes.titlesize': 13,
    'axes.labelsize': 11,
})

# Locations: the SQLite database that is read and the folder that receives
# the exported figures (created on first run, reused afterwards).
DB_PATH = Path('../../data/processed/swiss_votings.db')
OUTPUT_DIR = Path('output/beispiele')
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

In [None]:
# Load the municipality-level results of the two example proposals joined with
# the municipality features, plus the lookup table for feature code labels.
conn = sqlite3.connect(DB_PATH)
try:
    df = pd.read_sql_query("""
    SELECT 
        vr.proposal_id,
        vr.voting_date,
        vr.title_de,
        vr.municipality_id,
        vr.municipality_name,
        vr.ja_prozent,
        mf.sprachgebiete,
        mf.stadt_land_typologie,
        mf.grossregionen_der_schweiz,
        mf.urbanisierungsgrad_degurba_eurostat,
        mf.berggebiete,
        mf.agglomerationsgroessenklasse
    FROM v_voting_results_analysis vr
    INNER JOIN municipality_features_2024 mf ON vr.municipality_id = mf.bfs_nr
    WHERE vr.municipality_id < 9000
      AND vr.proposal_id IN (138, 1)
""", conn)

    labels_df = pd.read_sql_query("SELECT * FROM feature_labels_2024", conn)
finally:
    # Release the database handle even when one of the queries raises.
    conn.close()

# Binary target: 1 when strictly more than 50% voted yes, otherwise 0
# (a municipality at exactly 50% therefore counts as 0).
df['ja_mehrheit'] = (df['ja_prozent'] > 50).astype(int)

# Feature config: the categorical municipality attributes used as predictors
feature_cols = ['sprachgebiete', 'stadt_land_typologie', 'grossregionen_der_schweiz',
                'urbanisierungsgrad_degurba_eurostat', 'berggebiete', 'agglomerationsgroessenklasse']

def get_label(feature, code):
    """Return the label stored for a feature code.

    Searches the module-level ``labels_df`` for rows whose ``feature_name``
    equals ``feature`` and whose ``code`` equals ``code``.

    Returns:
        The ``label`` value of the first matching row, or ``str(code)`` when
        no row matches.
    """
    is_match = (labels_df['feature_name'] == feature) & (labels_df['code'] == code)
    matching_labels = labels_df.loc[is_match, 'label']
    if matching_labels.empty:
        return str(code)
    return matching_labels.iloc[0]

---
## 1. Gescheitertes Modell: Bundesbeschluss Justizreform (2000)

**Grund für das Scheitern:** 82.4% durchschnittliche Zustimmung → fast alle Gemeinden stimmten Ja → kaum Varianz im binären Outcome.

In [None]:
# Rows of the proposal whose model could not be fitted (proposal_id 1);
# title and date are taken from the first row of that subset.
is_failed_proposal = df['proposal_id'] == 1
df_failed = df.loc[is_failed_proposal].copy()
first_failed_row = df_failed.iloc[0]
title_failed = first_failed_row['title_de']
date_failed = first_failed_row['voting_date']

# Collect the summary lines first and emit them in one go.
summary_lines = [
    f"Abstimmung: {date_failed}",
    f"Titel: {title_failed}",
    f"\nGemeinden: {len(df_failed)}",
    f"Durchschnitt Ja: {df_failed['ja_prozent'].mean():.1f}%",
    f"Min/Max: {df_failed['ja_prozent'].min():.1f}% / {df_failed['ja_prozent'].max():.1f}%",
    f"\nJa-Mehrheit (>50%): {df_failed['ja_mehrheit'].sum()} ({100*df_failed['ja_mehrheit'].mean():.1f}%)",
    f"Nein-Mehrheit: {(df_failed['ja_mehrheit']==0).sum()} ({100*(1-df_failed['ja_mehrheit'].mean()):.1f}%)",
]
print("\n".join(summary_lines))

In [None]:
# Visualization: why the model failed.
# Two panels: the distribution of the yes share and the resulting class
# balance of the binary target. The figure is saved to OUTPUT_DIR.
fig, axes = plt.subplots(1, 2, figsize=(13, 5))

# Left: histogram of ja_prozent with the 50% threshold and the mean marked
ax1 = axes[0]
sns.histplot(df_failed['ja_prozent'], bins=30, color='#3498db', edgecolor='white', ax=ax1)
ax1.axvline(50, color='#e74c3c', linestyle='--', linewidth=2.5, label='50% Schwelle')
ax1.axvline(df_failed['ja_prozent'].mean(), color='#27ae60', linestyle='-', linewidth=2.5, 
            label=f'Mittelwert: {df_failed["ja_prozent"].mean():.1f}%')
ax1.set_xlabel('Ja-Anteil (%)')
ax1.set_ylabel('Anzahl Gemeinden')
ax1.set_title('Verteilung der Ja-Stimmen', fontweight='bold')
ax1.legend(loc='upper left')
ax1.set_xlim(0, 100)
ax1.spines['top'].set_visible(False)
ax1.spines['right'].set_visible(False)

# Right: binary outcome distribution.
# value_counts() omits a class that never occurs, so force both classes
# (0 = no majority, 1 = yes majority) to be present. Otherwise a single count
# would be broadcast onto both bars and one bar would carry the wrong label.
ax2 = axes[1]
outcome_counts = df_failed['ja_mehrheit'].value_counts().reindex([0, 1], fill_value=0)
colors = ['#e74c3c', '#27ae60']
bars = ax2.bar(['Nein-Mehrheit\n(<50%)', 'Ja-Mehrheit\n(>50%)'], 
               outcome_counts.values, color=colors, edgecolor='white', width=0.6)

# Add count and percentage labels slightly above each bar
for bar, count in zip(bars, outcome_counts.values):
    pct = 100 * count / len(df_failed)
    ax2.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 20, 
             f'{count}\n({pct:.1f}%)', ha='center', va='bottom', fontsize=12, fontweight='bold')

ax2.set_ylabel('Anzahl Gemeinden')
ax2.set_title('Binaeres Outcome (Zielvariable)', fontweight='bold')
# Leave headroom above the tallest bar for its text label
ax2.set_ylim(0, max(outcome_counts.values) * 1.2)
ax2.spines['top'].set_visible(False)
ax2.spines['right'].set_visible(False)

plt.suptitle(f'Gescheitertes Modell: {title_failed[:50]}...\n({date_failed})', 
             fontsize=14, fontweight='bold', y=1.02)
plt.tight_layout()
plt.savefig(OUTPUT_DIR / 'failed_model_justizreform.png', dpi=150, facecolor='white', bbox_inches='tight')
plt.show()

print(f"\n→ Problem: Nur {(df_failed['ja_mehrheit']==0).sum()} von {len(df_failed)} Gemeinden mit Nein-Mehrheit.")
print("→ Logistische Regression benoetigt Varianz in der Zielvariable.")
print("→ Fehler: 'Singular matrix' (perfekte Separation)")

---
## 2. Erfolgreiches Modell: Erbschaftssteuer-Initiative (2015)

**Bestes Modell:** AUC = 0.991, Pseudo R² = 0.50

In [None]:
# Rows of the proposal whose model could be fitted (proposal_id 138);
# title and date are taken from the first row of that subset.
is_success_proposal = df['proposal_id'] == 138
df_success = df.loc[is_success_proposal].copy()
first_success_row = df_success.iloc[0]
title_success = first_success_row['title_de']
date_success = first_success_row['voting_date']

# Collect the summary lines first and emit them in one go.
summary_lines = [
    f"Abstimmung: {date_success}",
    f"Titel: {title_success}",
    f"\nGemeinden: {len(df_success)}",
    f"Durchschnitt Ja: {df_success['ja_prozent'].mean():.1f}%",
    f"\nJa-Mehrheit (>50%): {df_success['ja_mehrheit'].sum()} ({100*df_success['ja_mehrheit'].mean():.1f}%)",
    f"Nein-Mehrheit: {(df_success['ja_mehrheit']==0).sum()} ({100*(1-df_success['ja_mehrheit'].mean()):.1f}%)",
]
print("\n".join(summary_lines))

In [None]:
# Fit the logistic regression model.
# Each categorical feature is one-hot encoded; drop_first=True removes one
# level per feature so the dummies are not collinear with the intercept.
X = pd.get_dummies(df_success[feature_cols], columns=feature_cols, drop_first=True).astype(float)
y = df_success['ja_mehrheit'].astype(float)

# add_constant supplies the intercept column; disp=0 silences the optimizer log
X_sm = sm.add_constant(X)
model = sm.Logit(y, X_sm).fit(disp=0)

# Predictions on the same rows the model was fitted on (in-sample),
# classified as 1 when the predicted probability exceeds 0.5
y_pred_prob = model.predict(X_sm)
y_pred = (y_pred_prob > 0.5).astype(int)

# Metrics (all in-sample; the thresholds returned by roc_curve are not needed)
accuracy = accuracy_score(y, y_pred)
fpr, tpr, _ = roc_curve(y, y_pred_prob)
roc_auc = auc(fpr, tpr)

print(f"Modell-Performance:")
print(f"  Accuracy: {accuracy:.1%}")
print(f"  AUC: {roc_auc:.3f}")
print(f"  Pseudo R²: {model.prsquared:.3f}")

In [None]:
# Comprehensive visualization for the successful model.
# 2x3 grid: histogram, class balance, ROC curve on the top row; confusion
# matrix bottom-left. The remaining bottom cells are filled further below.
fig = plt.figure(figsize=(14, 10))

# Create grid
gs = fig.add_gridspec(2, 3, hspace=0.3, wspace=0.3)

# 1. Histogram of ja_prozent with the 50% threshold and the mean marked
ax1 = fig.add_subplot(gs[0, 0])
sns.histplot(df_success['ja_prozent'], bins=30, color='#3498db', edgecolor='white', ax=ax1)
ax1.axvline(50, color='#e74c3c', linestyle='--', linewidth=2, label='50% Schwelle')
ax1.axvline(df_success['ja_prozent'].mean(), color='#27ae60', linestyle='-', linewidth=2,
            label=f'Mittelwert: {df_success["ja_prozent"].mean():.1f}%')
ax1.set_xlabel('Ja-Anteil (%)')
ax1.set_ylabel('Anzahl Gemeinden')
ax1.set_title('Verteilung Ja-Stimmen', fontweight='bold')
ax1.legend(fontsize=9)
ax1.set_xlim(0, 100)
ax1.spines['top'].set_visible(False)
ax1.spines['right'].set_visible(False)

# 2. Binary outcome.
# value_counts() omits a class that never occurs; reindex guarantees one
# count per label (0 = Nein, 1 = Ja) so bars and labels always line up.
ax2 = fig.add_subplot(gs[0, 1])
outcome_counts = df_success['ja_mehrheit'].value_counts().reindex([0, 1], fill_value=0)
colors = ['#e74c3c', '#27ae60']
bars = ax2.bar(['Nein', 'Ja'], outcome_counts.values, color=colors, edgecolor='white', width=0.5)
for bar, count in zip(bars, outcome_counts.values):
    ax2.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 10,
             f'{count}', ha='center', fontsize=11, fontweight='bold')
ax2.set_ylabel('Anzahl Gemeinden')
ax2.set_title('Binaeres Outcome', fontweight='bold')
ax2.spines['top'].set_visible(False)
ax2.spines['right'].set_visible(False)

# 3. ROC curve with the chance diagonal as reference
ax3 = fig.add_subplot(gs[0, 2])
ax3.plot(fpr, tpr, color='#3498db', linewidth=2.5, label=f'ROC (AUC = {roc_auc:.3f})')
ax3.plot([0, 1], [0, 1], color='#95a5a6', linestyle='--', linewidth=1.5, label='Zufall')
ax3.fill_between(fpr, tpr, alpha=0.2, color='#3498db')
ax3.set_xlabel('False Positive Rate')
ax3.set_ylabel('True Positive Rate')
ax3.set_title('ROC-Kurve', fontweight='bold')
ax3.legend(loc='lower right')
ax3.set_xlim(0, 1)
ax3.set_ylim(0, 1)
ax3.spines['top'].set_visible(False)
ax3.spines['right'].set_visible(False)

# 4. Confusion matrix (rows: true class, columns: predicted class)
ax4 = fig.add_subplot(gs[1, 0])
cm = confusion_matrix(y, y_pred)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax4, 
            xticklabels=['Pred: Nein', 'Pred: Ja'],
            yticklabels=['True: Nein', 'True: Ja'],
            annot_kws={'size': 14, 'fontweight': 'bold'})
ax4.set_title('Konfusionsmatrix', fontweight='bold')

# 5. Odds ratios (significant only), drawn across the two remaining grid cells
ax5 = fig.add_subplot(gs[1, 1:])

# Exclude the intercept by label instead of by position, so a real feature is
# never dropped by accident if the design matrix has no leading 'const' column.
params = model.params.drop('const', errors='ignore')
conf_int = model.conf_int().loc[params.index]
pvalues = model.pvalues.loc[params.index]

# Coefficients are log-odds; exponentiate them and their confidence bounds
# (conf_int columns 0 and 1 are the lower and upper bound).
odds_ratios = np.exp(params)
ci_lower = np.exp(conf_int[0])
ci_upper = np.exp(conf_int[1])

coef_df = pd.DataFrame({
    'feature': params.index,
    'odds_ratio': odds_ratios.values,
    'ci_lower': ci_lower.values,
    'ci_upper': ci_upper.values,
    'pvalue': pvalues.values,
    'significant': pvalues.values < 0.05
})

# Only significant, sorted so the largest odds ratio ends up at the top
sig_df = coef_df[coef_df['significant']].sort_values('odds_ratio', ascending=True)

if len(sig_df) > 0:
    y_pos = np.arange(len(sig_df))
    colors = ['#27ae60' if or_ > 1 else '#e74c3c' for or_ in sig_df['odds_ratio']]
    
    # Bars start at 1 (no effect) and extend to the odds ratio
    ax5.barh(y_pos, sig_df['odds_ratio'] - 1, left=1, color=colors, alpha=0.7, height=0.6)
    # Asymmetric error bars: distance from the odds ratio to each CI bound
    ax5.errorbar(sig_df['odds_ratio'], y_pos,
                 xerr=[sig_df['odds_ratio'] - sig_df['ci_lower'],
                       sig_df['ci_upper'] - sig_df['odds_ratio']],
                 fmt='none', color='#2c3e50', capsize=3)
    ax5.axvline(1, color='#2c3e50', linestyle='--', linewidth=1.5)
    ax5.set_yticks(y_pos)
    ax5.set_yticklabels(sig_df['feature'], fontsize=9)
    ax5.set_xlabel('Odds Ratio')
    ax5.set_title(f'Signifikante Koeffizienten (p<0.05, n={len(sig_df)})', fontweight='bold')
    ax5.spines['top'].set_visible(False)
    ax5.spines['right'].set_visible(False)

plt.suptitle(f'Erfolgreiches Modell: Erbschaftssteuer-Initiative\n({date_success})',
             fontsize=15, fontweight='bold', y=1.01)
plt.savefig(OUTPUT_DIR / 'successful_model_erbschaftssteuer.png', dpi=150, facecolor='white', bbox_inches='tight')
plt.show()

In [None]:
# Detailed coefficient interpretation: list every significant coefficient,
# largest odds ratio first, with its confidence interval and p-value.
print("Interpretation der signifikanten Koeffizienten:")
print("="*60)
print("(Odds Ratio > 1: hoehere Wahrscheinlichkeit fuer Ja-Mehrheit)")
print("(Odds Ratio < 1: tiefere Wahrscheinlichkeit fuer Ja-Mehrheit)")
print()

for row in sig_df.sort_values('odds_ratio', ascending=False).itertuples(index=False):
    # The arrow marks whether the feature raises or lowers the odds of a yes majority
    direction = "↑" if row.odds_ratio > 1 else "↓"
    print(f"{direction} {row.feature}")
    print(f"  OR = {row.odds_ratio:.2f} [{row.ci_lower:.2f}, {row.ci_upper:.2f}]")
    print(f"  p = {row.pvalue:.4f}")
    print()

---
## 3. Direkter Vergleich

In [None]:
# Side-by-side comparison of the class balance of both target variables.
# The figure is saved to OUTPUT_DIR and a short conclusion is printed.
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
colors1 = ['#e74c3c', '#27ae60']

# Failed model.
# value_counts() omits a class that never occurs; reindex keeps both classes
# (0 = Nein, 1 = Ja) so the two pie labels and the lookups below stay valid.
ax1 = axes[0]
outcome1 = df_failed['ja_mehrheit'].value_counts().reindex([0, 1], fill_value=0)
ax1.pie(outcome1.values, labels=['Nein', 'Ja'], colors=colors1,
        autopct='%1.1f%%', startangle=90,
        explode=[0.05, 0], textprops={'fontsize': 12})
ax1.set_title(f'Justizreform 2000\n(Modell gescheitert)\n\nMittelwert: {df_failed["ja_prozent"].mean():.1f}%',
              fontsize=12, fontweight='bold')

# Successful model; the title reports the AUC computed for the fitted model
# rather than a hard-coded number, so it cannot drift from the actual result.
ax2 = axes[1]
outcome2 = df_success['ja_mehrheit'].value_counts().reindex([0, 1], fill_value=0)
ax2.pie(outcome2.values, labels=['Nein', 'Ja'], colors=colors1,
        autopct='%1.1f%%', startangle=90,
        explode=[0, 0.05], textprops={'fontsize': 12})
ax2.set_title(f'Erbschaftssteuer 2015\n(AUC = {roc_auc:.3f})\n\nMittelwert: {df_success["ja_prozent"].mean():.1f}%',
              fontsize=12, fontweight='bold')

plt.suptitle('Vergleich: Varianz in der Zielvariable', fontsize=14, fontweight='bold', y=1.02)
plt.tight_layout()
plt.savefig(OUTPUT_DIR / 'comparison_variance.png', dpi=150, facecolor='white', bbox_inches='tight')
plt.show()

print("Fazit:")
print(f"  - Justizreform: {outcome1.loc[0]} Nein vs. {outcome1.loc[1]} Ja → Zu wenig Varianz")
print(f"  - Erbschaftssteuer: {outcome2.loc[0]} Nein vs. {outcome2.loc[1]} Ja → Genuegend Varianz")

In [None]:
# List the PNG files present in the output folder.
print("="*60)
print("GESPEICHERTE DATEIEN")
print("="*60)
# glob() yields entries in arbitrary order; sort for a reproducible listing
for f in sorted(OUTPUT_DIR.glob('*.png')):
    print(f"  - {f}")