# Logistische Regression: Abstimmungsverhalten

Analyse des Abstimmungsverhaltens (Ja/Nein) basierend auf Gemeindemerkmalen.

**Outcome:** Binaer - Hat die Gemeinde mehrheitlich Ja gestimmt? (ja_prozent > 50)

**Praediktoren:** Kategoriale Gemeindemerkmale (Sprachgebiet, Stadt/Land, etc.)

In [None]:
import sqlite3
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_curve, auc
import statsmodels.api as sm
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

# Input database and output directory (relative paths)
DB_PATH = Path('../../data/processed/swiss_votings.db')
OUTPUT_DIR = Path('output')
OUTPUT_DIR.mkdir(exist_ok=True)  # no error when the directory already exists

# Plot defaults: seaborn theme first, then the matplotlib overrides in one call
sns.set_theme(style="whitegrid", palette="muted")
plt.rcParams.update({
    'figure.figsize': (10, 6),
    'font.size': 11,
})

# Echo the configured locations
print(f"Database: {DB_PATH}")
print(f"Output: {OUTPUT_DIR}")

## 1. Daten laden und vorbereiten

In [None]:
# Voting results joined with municipality features. The INNER JOIN keeps only
# municipalities that have a row in municipality_features_2024.
# NOTE(review): the `municipality_id < 9000` filter presumably excludes
# non-municipality pseudo entries - confirm against the data dictionary.
VOTING_RESULTS_QUERY = """
    SELECT 
        vr.proposal_id,
        vr.voting_date,
        vr.title_de,
        vr.municipality_id,
        vr.municipality_name,
        vr.ja_prozent,
        vr.angenommen,
        mf.sprachgebiete,
        mf.stadt_land_typologie,
        mf.grossregionen_der_schweiz,
        mf.urbanisierungsgrad_degurba_eurostat,
        mf.berggebiete,
        mf.agglomerationsgroessenklasse,
        mf.gemeindetypologie_9_typen
    FROM v_voting_results_analysis vr
    INNER JOIN municipality_features_2024 mf ON vr.municipality_id = mf.bfs_nr
    WHERE vr.municipality_id < 9000
    ORDER BY vr.voting_date, vr.proposal_id
"""

# Fail early when the database is missing: sqlite3.connect() would otherwise
# silently create an empty database file at DB_PATH, and the first query would
# then fail with a misleading "no such table" error.
if not DB_PATH.exists():
    raise FileNotFoundError(f"Database not found: {DB_PATH}")

conn = sqlite3.connect(DB_PATH)
try:
    # Load voting results with features
    df = pd.read_sql_query(VOTING_RESULTS_QUERY, conn)

    # Load feature labels
    labels_df = pd.read_sql_query("SELECT * FROM feature_labels_2024", conn)
finally:
    # Release the connection even when one of the queries raises
    conn.close()

# Binary outcome: 1 when the yes-share is strictly above 50 percent, else 0.
# A missing ja_prozent compares as False and therefore also ends up as 0.
df['ja_mehrheit'] = (df['ja_prozent'] > 50).astype(int)

print(f"Datensaetze: {len(df):,}")
print(f"Abstimmungen: {df['proposal_id'].nunique()}")
print(f"Gemeinden: {df['municipality_id'].nunique()}")
print(f"\nJa-Mehrheit: {df['ja_mehrheit'].sum():,} ({100*df['ja_mehrheit'].mean():.1f}%)")

In [None]:
# One row per proposal: metadata comes from the first row of each group, and
# mean_ja is the unweighted mean of the municipal yes-shares.
proposals = (
    df.groupby('proposal_id')
    .agg(
        voting_date=('voting_date', 'first'),
        title_de=('title_de', 'first'),
        angenommen=('angenommen', 'first'),
        mean_ja=('ja_prozent', 'mean'),
    )
    .reset_index()
    .sort_values('voting_date')
    .reset_index(drop=True)
)

print(f"Anzahl Abstimmungen: {len(proposals)}")
proposals.head()

In [None]:
# Display names of the categorical predictors, keyed by column name.
# The insertion order of this dict fixes the order of feature_cols below.
feature_names_display = {
    'sprachgebiete': 'Sprachgebiet',
    'stadt_land_typologie': 'Stadt/Land',
    'grossregionen_der_schweiz': 'Grossregion',
    'urbanisierungsgrad_degurba_eurostat': 'Urbanisierung',
    'berggebiete': 'Berggebiet',
    'agglomerationsgroessenklasse': 'Agglomerationsgroesse',
}

# Predictor columns used by the regressions (same names, same order)
feature_cols = list(feature_names_display)


def get_label(feature_name, code):
    """Return the label stored in labels_df for a feature code.

    Falls back to ``str(code)`` when labels_df has no row matching both
    ``feature_name`` and ``code``; the first match wins otherwise.
    """
    is_match = labels_df['feature_name'].eq(feature_name) & labels_df['code'].eq(code)
    matching_labels = labels_df.loc[is_match, 'label']
    if matching_labels.empty:
        return str(code)
    return matching_labels.iloc[0]


# Report how many distinct categories each predictor has in the data
print("Features fuer Regression:")
category_counts = df[feature_cols].nunique()
for col, n_cat in category_counts.items():
    print(f"  - {feature_names_display[col]}: {n_cat} Kategorien")

## 2. Logistische Regression pro Abstimmung

In [None]:
def run_logistic_regression(df_vote, feature_cols):
    """
    Fit a logistic regression of the yes-majority outcome for a single voting.

    Args:
        df_vote: Rows of one proposal. Must contain the columns listed in
            ``feature_cols`` and the binary column ``ja_mehrheit``.
        feature_cols: Names of the categorical predictor columns. Each one is
            dummy-encoded with its first category dropped as reference level.

    Returns:
        A tuple ``(model, metrics, error)``. On success ``model`` is the fitted
        statsmodels Logit result, ``metrics`` is a dict with the keys
        ``accuracy``, ``auc``, ``pseudo_r2``, ``n``, ``n_ja`` and ``pct_ja``,
        and ``error`` is None. On failure ``model`` and ``metrics`` are None
        and ``error`` holds a message string.
    """
    # Prepare data with dummy encoding
    X = pd.get_dummies(df_vote[feature_cols], columns=feature_cols, drop_first=True)
    X = X.astype(float)  # Convert to float for statsmodels
    y = df_vote['ja_mehrheit'].astype(float)

    # A model cannot be fitted when every municipality voted the same way
    if y.nunique() < 2:
        return None, None, "Keine Varianz im Outcome"

    # Statsmodels for detailed results (coefficients, p-values, pseudo R2)
    X_sm = sm.add_constant(X)
    try:
        model = sm.Logit(y, X_sm).fit(disp=0, maxiter=100)
    except Exception as e:
        # Report the fit failure to the caller instead of aborting the batch
        return None, None, str(e)

    # In-sample predictions, classified at the 0.5 probability threshold
    y_pred_prob = model.predict(X_sm)
    y_pred = (y_pred_prob > 0.5).astype(int)

    # Metrics
    accuracy = accuracy_score(y, y_pred)

    # ROC AUC is best-effort: a failure leaves it as NaN. Catch Exception
    # rather than using a bare except so that KeyboardInterrupt and SystemExit
    # still propagate.
    try:
        fpr, tpr, _ = roc_curve(y, y_pred_prob)
        roc_auc = auc(fpr, tpr)
    except Exception:
        roc_auc = np.nan

    metrics = {
        'accuracy': accuracy,
        'auc': roc_auc,
        'pseudo_r2': model.prsquared,
        'n': len(y),
        'n_ja': int(y.sum()),
        'pct_ja': 100 * y.mean()
    }

    return model, metrics, None

In [None]:
# Run logistic regression for all votings, in the order of `proposals`
results = []
models = {}

# Group the rows once; filtering the full frame with a boolean mask inside the
# loop would rescan every row for every proposal.
votes_by_proposal = df.groupby('proposal_id')

for _, prop in proposals.iterrows():
    proposal_id = prop['proposal_id']
    df_vote = votes_by_proposal.get_group(proposal_id).copy()

    model, metrics, error = run_logistic_regression(df_vote, feature_cols)

    # Metric fields default to NaN so failed fits still yield a complete row
    result = {
        'proposal_id': proposal_id,
        'voting_date': prop['voting_date'],
        'title_de': prop['title_de'],
        'angenommen': prop['angenommen'],
        'mean_ja': prop['mean_ja'],
        'error': error,
        'accuracy': np.nan,
        'auc': np.nan,
        'pseudo_r2': np.nan,
        'n': np.nan,
        'n_ja': np.nan,
        'pct_ja': np.nan
    }

    # metrics is None exactly when the fit failed or was skipped
    if metrics is not None:
        result.update(metrics)
        models[proposal_id] = model

    results.append(result)

results_df = pd.DataFrame(results)
print(f"Modelle erfolgreich: {len(models)} / {len(proposals)}")
print(f"Fehler: {results_df['error'].notna().sum()}")

In [None]:
# Keep only the runs that reported no error and produced an accuracy value
has_model = results_df['error'].isna() & results_df['accuracy'].notna()
successful = results_df.loc[has_model].copy()

accuracy_scores = successful['accuracy']
auc_scores = successful['auc']
pseudo_r2_scores = successful['pseudo_r2']

print("Modell-Performance (Zusammenfassung):")
print(f"  Accuracy - Mean: {accuracy_scores.mean():.3f}, Median: {accuracy_scores.median():.3f}")
print(f"  AUC - Mean: {auc_scores.mean():.3f}, Median: {auc_scores.median():.3f}")
print(f"  Pseudo R2 - Mean: {pseudo_r2_scores.mean():.3f}, Median: {pseudo_r2_scores.median():.3f}")

# Persist the full result table, including the failed runs
results_path = OUTPUT_DIR / 'logistic_regression_results.csv'
results_df.to_csv(results_path, index=False)
print(f"\nResultate gespeichert: {results_path}")

## 3. Visualisierungen erstellen

In [None]:
# Sub-directory of the output folder that receives the per-proposal plots
plots_dir = OUTPUT_DIR.joinpath('plots')
plots_dir.mkdir(exist_ok=True)

def create_coefficient_plot(model, proposal_id, voting_date, title_de, output_path, n_extreme=10):
    """
    Create and save a coefficient plot (odds ratios) for one logistic regression.

    Args:
        model: Fitted statsmodels result exposing ``params``, ``conf_int()``
            and ``pvalues`` as pandas objects. The first entry of each is
            treated as the constant and left out of the plot.
        proposal_id: Identifier of the proposal. Not used for drawing; kept so
            existing callers keep working.
        voting_date: Voting date shown at the start of the plot title.
        title_de: Proposal title; shortened to 60 characters in the plot title.
            A missing title is rendered as an empty string.
        output_path: Destination of the PNG file.
        n_extreme: When the model has more than ``2 * n_extreme`` coefficients,
            only the ``n_extreme`` smallest and ``n_extreme`` largest odds
            ratios are drawn. Must be positive.
    """
    from matplotlib.patches import Patch

    # Extract coefficients (exclude constant)
    params = model.params.iloc[1:]
    conf_int = model.conf_int().iloc[1:]
    pvalues = model.pvalues.iloc[1:]

    # Exponentiate coefficients and interval bounds to obtain odds ratios
    coef_df = pd.DataFrame({
        'feature': params.index,
        'odds_ratio': np.exp(params).values,
        'ci_lower': np.exp(conf_int[0]).values,
        'ci_upper': np.exp(conf_int[1]).values,
        'pvalue': pvalues.values
    })

    # Sort by odds ratio
    coef_df = coef_df.sort_values('odds_ratio', ascending=True)

    # Limit to the n_extreme smallest and largest odds ratios for readability
    if len(coef_df) > 2 * n_extreme:
        top = coef_df.nlargest(n_extreme, 'odds_ratio')
        bottom = coef_df.nsmallest(n_extreme, 'odds_ratio')
        # drop_duplicates guards against a row being selected on both sides
        coef_df = pd.concat([bottom, top]).drop_duplicates()
        coef_df = coef_df.sort_values('odds_ratio', ascending=True)

    # Figure height grows with the number of coefficients shown
    fig_height = max(5, len(coef_df) * 0.35)
    fig, ax = plt.subplots(figsize=(10, fig_height))

    try:
        # Colors based on significance
        colors = ['#27ae60' if p < 0.05 else '#95a5a6' for p in coef_df['pvalue']]

        # Bars start at the reference value OR=1 and extend to the odds ratio
        y_pos = range(len(coef_df))
        ax.barh(y_pos, coef_df['odds_ratio'] - 1, left=1, color=colors, alpha=0.7, height=0.6)

        # Error bars for CI (asymmetric distances below and above the estimate)
        ax.errorbar(coef_df['odds_ratio'], y_pos,
                    xerr=[coef_df['odds_ratio'] - coef_df['ci_lower'],
                          coef_df['ci_upper'] - coef_df['odds_ratio']],
                    fmt='none', color='#2c3e50', capsize=3, capthick=1)

        # Reference line at OR=1
        ax.axvline(1, color='#e74c3c', linestyle='--', linewidth=2, alpha=0.7)

        # Labels
        ax.set_yticks(y_pos)
        ax.set_yticklabels(coef_df['feature'], fontsize=9)
        ax.set_xlabel('Odds Ratio', fontsize=11)

        # Title (truncated); tolerate a missing title instead of failing on len()
        title_text = '' if pd.isna(title_de) else str(title_de)
        title_short = title_text[:60] + '...' if len(title_text) > 60 else title_text
        ax.set_title(f'{voting_date}: {title_short}', fontsize=11, fontweight='bold', pad=10)

        # Clean up
        ax.spines['top'].set_visible(False)
        ax.spines['right'].set_visible(False)

        # Legend
        legend_elements = [
            Patch(facecolor='#27ae60', alpha=0.7, label='Signifikant (p<0.05)'),
            Patch(facecolor='#95a5a6', alpha=0.7, label='Nicht signifikant'),
            plt.Line2D([0], [0], color='#e74c3c', linestyle='--', linewidth=2, label='OR = 1 (kein Effekt)')
        ]
        ax.legend(handles=legend_elements, loc='lower right', fontsize=9)

        fig.tight_layout()
        fig.savefig(output_path, dpi=100, facecolor='white', bbox_inches='tight')
    finally:
        # Always close this specific figure, also when drawing or saving fails,
        # so figures do not accumulate when the function is called in a loop.
        plt.close(fig)

print("Erstelle Koeffizienten-Plots...")

In [None]:
# Render one coefficient plot per successfully fitted model.
# Index the proposal metadata by id once so each lookup is a direct .loc access.
proposals_by_id = proposals.set_index('proposal_id')

for proposal_id, model in models.items():
    prop = proposals_by_id.loc[proposal_id]
    output_path = plots_dir / f'coef_{proposal_id:03d}.png'
    create_coefficient_plot(model, proposal_id, prop['voting_date'], prop['title_de'], output_path)

print(f"Gespeichert: {len(models)} Koeffizienten-Plots in {plots_dir}")

## 4. Uebersichtsgrafiken

In [None]:
# Model performance overview: one histogram per metric with its median marked
fig, axes = plt.subplots(1, 3, figsize=(14, 5))
ax1, ax2, ax3 = axes

# (column in `successful`, x-axis label, panel title, bar color)
panels = [
    ('accuracy', 'Accuracy', 'Verteilung Accuracy', 'steelblue'),
    ('auc', 'AUC', 'Verteilung AUC', '#27ae60'),
    ('pseudo_r2', 'McFadden Pseudo R²', 'Verteilung Pseudo R²', '#9b59b6'),
]

for ax, (column, xlabel, panel_title, color) in zip((ax1, ax2, ax3), panels):
    values = successful[column].dropna()
    median = values.median()
    sns.histplot(values, bins=20, color=color, edgecolor='white', ax=ax)
    ax.axvline(median, color='#e74c3c', linestyle='--', linewidth=2,
               label=f'Median: {median:.3f}')
    ax.set_xlabel(xlabel)
    ax.set_ylabel('Anzahl Abstimmungen')
    ax.set_title(panel_title, fontweight='bold')
    ax.legend()
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)

# Derive the number of votings from the data instead of hard-coding it, and
# say so when some proposals did not yield a model.
n_models = len(successful)
n_total = len(proposals)
if n_models == n_total:
    scope = f'alle {n_models} Abstimmungen'
else:
    scope = f'{n_models} von {n_total} Abstimmungen'

plt.suptitle(f'Modell-Performance: Logistische Regression\n({scope})',
             fontsize=14, fontweight='bold', y=1.02)
plt.tight_layout()
plt.savefig(OUTPUT_DIR / 'model_performance_overview.png', dpi=150, facecolor='white', bbox_inches='tight')
plt.show()

In [None]:
# List the five models with the highest and the five with the lowest AUC
ranked = successful.dropna(subset=['auc'])

if ranked.empty:
    print("Keine AUC-Werte verfuegbar.")
else:
    shown_cols = ['voting_date', 'title_de', 'auc', 'pseudo_r2', 'accuracy']
    top5 = ranked.nlargest(5, 'auc')[shown_cols]
    bottom5 = ranked.nsmallest(5, 'auc')[shown_cols]

    listings = (
        ("Top 5 Modelle (hoechste AUC):", top5),
        ("\nBottom 5 Modelle (tiefste AUC):", bottom5),
    )
    for header, table in listings:
        print(header)
        for _, row in table.iterrows():
            print(f"  AUC={row['auc']:.3f}: {row['voting_date']} - {row['title_de'][:50]}...")

In [None]:
# Example: print the statsmodels summary of the model with the highest AUC
if not models or not successful['auc'].notna().any():
    print("Keine erfolgreichen Modelle vorhanden.")
else:
    best_idx = successful['auc'].idxmax()
    best_id = successful.at[best_idx, 'proposal_id']
    best_model = models[best_id]
    is_best = proposals['proposal_id'] == best_id
    best_prop = proposals.loc[is_best].iloc[0]

    print(f"Bestes Modell: {best_prop['voting_date']} - {best_prop['title_de'][:60]}...")
    print("\nModell-Summary:")
    print(best_model.summary())

## 5. Zusammenfassung

In [None]:
# Final summary of the run: counts, predictors, mean performance, output files
separator = "=" * 60

# The mean is NaN when no model was fitted successfully, and int(NaN) raises
# ValueError; print a placeholder in that case instead of crashing.
mean_n = successful['n'].mean()
gemeinden_text = f"~{int(mean_n)}" if pd.notna(mean_n) else "n/a"

print(separator)
print("LOGISTISCHE REGRESSION - ZUSAMMENFASSUNG")
print(separator)
print(f"Anzahl Abstimmungen: {len(proposals)}")
print(f"Erfolgreiche Modelle: {len(models)}")
print(f"Gemeinden pro Modell: {gemeinden_text}")
print()
print("Features (Praediktoren):")
for col in feature_cols:
    print(f"  - {feature_names_display[col]}")
print()
print("Modell-Performance:")
print(f"  Accuracy: {successful['accuracy'].mean():.1%} (Mean)")
print(f"  AUC: {successful['auc'].mean():.3f} (Mean)")
print(f"  Pseudo R²: {successful['pseudo_r2'].mean():.3f} (Mean)")
print()
print("Gespeicherte Dateien:")
print(f"  - {OUTPUT_DIR / 'logistic_regression_results.csv'}")
print(f"  - {OUTPUT_DIR / 'model_performance_overview.png'}")
print(f"  - {plots_dir}/ ({len(models)} Koeffizienten-Plots)")
print(separator)