# EDA: Voting Results

Explorative Datenanalyse der Abstimmungsergebnisse (ja_prozent) pro Vorlage.

**Inhalt:**
- Deskriptive Statistik (min, max, mean, median, std) pro Vorlage
- Normalverteilungstest (Shapiro-Wilk) pro Vorlage
- Histogramme und Boxplots (einzeln)
- Überlagerndes Histogramm der extremsten Vorlagen
- Boxplot mit den zwei extremsten Vorlagen

In [None]:
import sqlite3
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from pathlib import Path
import textwrap
import warnings
warnings.filterwarnings('ignore')

# Paths
DB_PATH = Path('../data/processed/swiss_votings.db')
OUTPUT_DIR = Path('Votings')
OUTPUT_DIR.mkdir(exist_ok=True)

# Seaborn style
sns.set_theme(style="whitegrid", palette="muted")
plt.rcParams['figure.figsize'] = (10, 6)
plt.rcParams['font.size'] = 11
plt.rcParams['axes.titlesize'] = 12
plt.rcParams['axes.labelsize'] = 11

print(f"Database: {DB_PATH}")
print(f"Output directory: {OUTPUT_DIR}")

## 1. Daten laden

In [2]:
conn = sqlite3.connect(DB_PATH)

# Load voting results
df = pd.read_sql_query("""
    SELECT 
        proposal_id,
        voting_date,
        title_de,
        municipality_id,
        municipality_name,
        ja_prozent,
        stimmbeteiligung,
        angenommen
    FROM v_voting_results_analysis
    WHERE municipality_id < 9000  -- Exclude expat votes
    ORDER BY voting_date, proposal_id
""", conn)

conn.close()

print(f"Total records: {len(df):,}")
print(f"Proposals: {df['proposal_id'].nunique()}")
print(f"Municipalities: {df['municipality_id'].nunique()}")
print(f"Date range: {df['voting_date'].min()} to {df['voting_date'].max()}")

Total records: 454,134
Proposals: 223
Municipalities: 2038
Date range: 20000312 to 20250928


In [3]:
# Get unique proposals
proposals = df.groupby('proposal_id').agg({
    'voting_date': 'first',
    'title_de': 'first',
    'angenommen': 'first'
}).reset_index()
proposals = proposals.sort_values('voting_date')

print(f"Number of proposals: {len(proposals)}")
proposals.head(10)

Number of proposals: 223


Unnamed: 0,proposal_id,voting_date,title_de,angenommen
0,1,20000312,Bundesbeschluss über die Justizreform,1
1,2,20000312,Volksinitiative «für Beschleunigung der direkt...,0
2,3,20000312,Volksinitiative «für eine gerechte Vertretung ...,0
3,4,20000312,Volksinitiative «zum Schutze des Menschen vor ...,0
4,5,20000312,Volksinitiative «für die Halbierung des motori...,0
5,6,20000521,Bundesbeschluss über die Genehmigung der sekto...,1
6,7,20000924,Volksinitiative «für einen Solarrappen»,0
7,8,20000924,Verfassungsartikel über eine Förderabgabe für ...,0
8,9,20000924,Solarinitiative und Gegenvorschlag: Stichfrage,0
9,10,20000924,Verfassungsartikel über eine Energielenkungsab...,0


## 2. Deskriptive Statistik pro Vorlage

In [4]:
def compute_descriptive_stats(group):
    """Compute descriptive statistics for ja_prozent."""
    values = group['ja_prozent'].dropna()
    
    if len(values) < 3:
        return pd.Series({
            'n': len(values),
            'min': np.nan,
            'max': np.nan,
            'mean': np.nan,
            'median': np.nan,
            'std': np.nan,
            'q25': np.nan,
            'q75': np.nan
        })
    
    return pd.Series({
        'n': len(values),
        'min': values.min(),
        'max': values.max(),
        'mean': values.mean(),
        'median': values.median(),
        'std': values.std(),
        'q25': values.quantile(0.25),
        'q75': values.quantile(0.75)
    })

# Compute stats for each proposal
desc_stats = df.groupby('proposal_id').apply(compute_descriptive_stats).reset_index()

# Merge with proposal info
desc_stats = desc_stats.merge(
    proposals[['proposal_id', 'voting_date', 'title_de', 'angenommen']], 
    on='proposal_id'
)

# Reorder columns
desc_stats = desc_stats[['proposal_id', 'voting_date', 'title_de', 'angenommen', 
                          'n', 'min', 'max', 'mean', 'median', 'std', 'q25', 'q75']]

print(f"Descriptive statistics computed for {len(desc_stats)} proposals")
desc_stats.head(10)

Descriptive statistics computed for 223 proposals


Unnamed: 0,proposal_id,voting_date,title_de,angenommen,n,min,max,mean,median,std,q25,q75
0,1,20000312,Bundesbeschluss über die Justizreform,1,2036.0,30.99,100.0,82.435319,84.125,8.636842,78.4175,88.5025
1,2,20000312,Volksinitiative «für Beschleunigung der direkt...,0,2036.0,0.0,79.17,29.511017,29.495,7.05573,25.29,33.79
2,3,20000312,Volksinitiative «für eine gerechte Vertretung ...,0,2036.0,0.0,85.71,15.182677,14.345,5.888617,11.32,18.42
3,4,20000312,Volksinitiative «zum Schutze des Menschen vor ...,0,2036.0,0.0,72.73,26.068522,27.035,9.12518,19.635,31.9925
4,5,20000312,Volksinitiative «für die Halbierung des motori...,0,2036.0,0.0,70.69,15.826567,15.325,6.41396,11.36,19.64
5,6,20000521,Bundesbeschluss über die Genehmigung der sekto...,1,2036.0,12.12,96.0,62.815309,63.94,14.2874,53.25,73.9325
6,7,20000924,Volksinitiative «für einen Solarrappen»,0,2036.0,0.0,78.72,26.740521,26.11,8.250693,21.305,31.4525
7,8,20000924,Verfassungsartikel über eine Förderabgabe für ...,0,2036.0,12.5,100.0,42.062426,41.685,9.09526,35.9775,47.575
8,9,20000924,Solarinitiative und Gegenvorschlag: Stichfrage,0,2036.0,0.0,75.0,30.999042,31.0,6.701161,27.0775,34.74
9,10,20000924,Verfassungsartikel über eine Energielenkungsab...,0,2036.0,6.67,81.82,38.634995,38.495,9.891268,31.93,45.2525


## 3. Normalverteilungstest (Shapiro-Wilk)

In [5]:
def shapiro_test(group):
    """Perform Shapiro-Wilk normality test."""
    values = group['ja_prozent'].dropna()
    
    if len(values) < 3:
        return pd.Series({
            'shapiro_stat': np.nan,
            'shapiro_pvalue': np.nan,
            'is_normal_alpha05': np.nan
        })
    
    # Shapiro-Wilk test (sample max 5000 for performance)
    if len(values) > 5000:
        values = values.sample(5000, random_state=42)
    
    try:
        stat, pvalue = stats.shapiro(values)
        return pd.Series({
            'shapiro_stat': stat,
            'shapiro_pvalue': pvalue,
            'is_normal_alpha05': pvalue > 0.05
        })
    except:
        return pd.Series({
            'shapiro_stat': np.nan,
            'shapiro_pvalue': np.nan,
            'is_normal_alpha05': np.nan
        })

# Perform normality test for each proposal
normality_tests = df.groupby('proposal_id').apply(shapiro_test).reset_index()

# Merge with descriptive stats
eda_results = desc_stats.merge(normality_tests, on='proposal_id')

# Summary
n_normal = eda_results['is_normal_alpha05'].sum()
n_total = len(eda_results)
print(f"Normality test results (alpha=0.05):")
print(f"  Normal: {n_normal} ({100*n_normal/n_total:.1f}%)")
print(f"  Not normal: {n_total - n_normal} ({100*(n_total-n_normal)/n_total:.1f}%)")

Normality test results (alpha=0.05):
  Normal: 3 (1.3%)
  Not normal: 220 (98.7%)


## 4. Resultate speichern

In [6]:
# Save to CSV
csv_path = OUTPUT_DIR / 'voting_eda_results.csv'
eda_results.to_csv(csv_path, index=False)
print(f"Results saved to: {csv_path}")

# Display full results
eda_results.round(3)

Results saved to: Votings/voting_eda_results.csv


Unnamed: 0,proposal_id,voting_date,title_de,angenommen,n,min,max,mean,median,std,q25,q75,shapiro_stat,shapiro_pvalue,is_normal_alpha05
0,1,20000312,Bundesbeschluss über die Justizreform,1,2036.0,30.99,100.00,82.435,84.125,8.637,78.418,88.502,0.912,0.0,False
1,2,20000312,Volksinitiative «für Beschleunigung der direkt...,0,2036.0,0.00,79.17,29.511,29.495,7.056,25.290,33.790,0.984,0.0,False
2,3,20000312,Volksinitiative «für eine gerechte Vertretung ...,0,2036.0,0.00,85.71,15.183,14.345,5.889,11.320,18.420,0.931,0.0,False
3,4,20000312,Volksinitiative «zum Schutze des Menschen vor ...,0,2036.0,0.00,72.73,26.069,27.035,9.125,19.635,31.992,0.984,0.0,False
4,5,20000312,Volksinitiative «für die Halbierung des motori...,0,2036.0,0.00,70.69,15.827,15.325,6.414,11.360,19.640,0.968,0.0,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
218,219,20241124,Änderung des Obligationenrechts (Mietrecht: Kü...,0,2038.0,10.53,86.21,52.802,53.420,8.376,48.040,58.370,0.981,0.0,False
219,220,20241124,Änderung des Bundesgesetzes über die Krankenve...,1,2038.0,16.67,80.00,53.443,55.805,10.217,46.515,61.040,0.959,0.0,False
220,221,20250209,Volksinitiative «Für eine verantwortungsvolle ...,0,2038.0,0.00,60.11,24.667,23.645,8.692,18.268,30.452,0.986,0.0,False
221,222,20250928,Bundesbeschluss über die kantonalen Liegenscha...,1,2038.0,17.14,88.64,58.886,62.090,14.448,46.058,71.090,0.946,0.0,False


## 5. Histogramme (einzeln pro Vorlage)

In [None]:
# Create histogram directory
hist_dir = OUTPUT_DIR / 'histograms'
hist_dir.mkdir(exist_ok=True)

# Generate histogram for each proposal with seaborn styling
for _, row in proposals.iterrows():
    proposal_id = row['proposal_id']
    title = row['title_de'][:50] + '...' if len(row['title_de']) > 50 else row['title_de']
    date = row['voting_date']
    
    values = df[df['proposal_id'] == proposal_id]['ja_prozent'].dropna()
    
    fig, ax = plt.subplots(figsize=(9, 5))
    
    # Seaborn histogram
    sns.histplot(values, bins=30, color='steelblue', edgecolor='white', alpha=0.8, ax=ax)
    
    # Mean and median lines
    ax.axvline(values.mean(), color='#e74c3c', linestyle='--', linewidth=2, 
               label=f'Mean: {values.mean():.1f}%')
    ax.axvline(values.median(), color='#27ae60', linestyle='--', linewidth=2, 
               label=f'Median: {values.median():.1f}%')
    ax.axvline(50, color='#f39c12', linestyle='-', linewidth=2, alpha=0.6, 
               label='50% Schwelle')
    
    ax.set_xlabel('Ja-Anteil (%)')
    ax.set_ylabel('Anzahl Gemeinden')
    ax.set_title(f'{date}: {title}', fontweight='bold')
    ax.legend(loc='upper right', framealpha=0.9)
    ax.set_xlim(0, 100)
    
    # Clean up
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    
    plt.tight_layout()
    plt.savefig(hist_dir / f'hist_{proposal_id:03d}.png', dpi=100, facecolor='white')
    plt.close()

print(f"Saved {len(proposals)} histograms to {hist_dir}")

## 6. Boxplots (einzeln pro Vorlage)

In [None]:
# Create boxplot directory
box_dir = OUTPUT_DIR / 'boxplots'
box_dir.mkdir(exist_ok=True)

# Generate boxplot for each proposal with seaborn styling
for _, row in proposals.iterrows():
    proposal_id = row['proposal_id']
    title = row['title_de'][:50] + '...' if len(row['title_de']) > 50 else row['title_de']
    date = row['voting_date']
    
    values = df[df['proposal_id'] == proposal_id]['ja_prozent'].dropna()
    
    fig, ax = plt.subplots(figsize=(9, 3.5))
    
    # Seaborn boxplot (horizontal)
    sns.boxplot(x=values, color='steelblue', width=0.5, ax=ax,
                flierprops={'marker': 'o', 'markersize': 4, 'alpha': 0.5})
    
    ax.axvline(50, color='#f39c12', linestyle='-', linewidth=2, alpha=0.6)
    ax.set_xlabel('Ja-Anteil (%)')
    ax.set_title(f'{date}: {title}', fontweight='bold')
    ax.set_xlim(0, 100)
    
    # Clean up
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    ax.spines['left'].set_visible(False)
    ax.set_yticks([])
    
    plt.tight_layout()
    plt.savefig(box_dir / f'box_{proposal_id:03d}.png', dpi=100, facecolor='white')
    plt.close()

print(f"Saved {len(proposals)} boxplots to {box_dir}")

## 7. Überlagerndes Histogramm der extremsten Vorlagen

In [None]:
# Find the two most extreme proposals (highest and lowest mean ja_prozent)
highest = eda_results.loc[eda_results['mean'].idxmax()]
lowest = eda_results.loc[eda_results['mean'].idxmin()]

# Get data for extreme proposals
values_highest = df[df['proposal_id'] == highest['proposal_id']]['ja_prozent'].dropna()
values_lowest = df[df['proposal_id'] == lowest['proposal_id']]['ja_prozent'].dropna()

# Create overlapping histogram with seaborn styling
fig, ax = plt.subplots(figsize=(12, 7))

# Plot histograms with seaborn colors
sns.histplot(values_lowest, bins=30, alpha=0.6, color='#e74c3c', edgecolor='#c0392b',
             label='_nolegend_', ax=ax)
sns.histplot(values_highest, bins=30, alpha=0.6, color='#27ae60', edgecolor='#1e8449',
             label='_nolegend_', ax=ax)

ax.axvline(50, color='#f39c12', linestyle='-', linewidth=2.5, alpha=0.8)

ax.set_xlabel('Ja-Anteil (%)', fontsize=12)
ax.set_ylabel('Anzahl Gemeinden', fontsize=12)
ax.set_title('Vergleich: Extremste Abstimmungsergebnisse', fontsize=14, fontweight='bold', pad=15)
ax.set_xlim(0, 100)

# Clean up spines
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)

# Create custom legend with full titles (wrapped)
from matplotlib.patches import Patch
legend_elements = [
    Patch(facecolor='#e74c3c', alpha=0.6, edgecolor='#c0392b',
          label=f"Tiefste Zustimmung ({lowest['mean']:.1f}%):\n{lowest['voting_date']}: {lowest['title_de']}"),
    Patch(facecolor='#27ae60', alpha=0.6, edgecolor='#1e8449',
          label=f"Hoechste Zustimmung ({highest['mean']:.1f}%):\n{highest['voting_date']}: {highest['title_de']}"),
    plt.Line2D([0], [0], color='#f39c12', linewidth=2.5, label='50% Schwelle')
]

# Legend below the plot
ax.legend(handles=legend_elements, loc='upper center', bbox_to_anchor=(0.5, -0.12),
          ncol=1, fontsize=10, frameon=True, fancybox=True, shadow=False)

plt.tight_layout()
plt.subplots_adjust(bottom=0.28)
plt.savefig(OUTPUT_DIR / 'histogram_overlay_extreme.png', dpi=150, facecolor='white', 
            bbox_inches='tight')
plt.show()
print(f"Saved overlay histogram to {OUTPUT_DIR / 'histogram_overlay_extreme.png'}")

## 8. Boxplot mit den zwei extremsten Vorlagen

In [10]:
# Find the two most extreme proposals (highest and lowest mean ja_prozent)
highest = eda_results.loc[eda_results['mean'].idxmax()]
lowest = eda_results.loc[eda_results['mean'].idxmin()]

print("Höchste Zustimmung:")
print(f"  {highest['voting_date']}: {highest['title_de'][:60]}...")
print(f"  Mean: {highest['mean']:.1f}%, Median: {highest['median']:.1f}%")
print()
print("Tiefste Zustimmung:")
print(f"  {lowest['voting_date']}: {lowest['title_de'][:60]}...")
print(f"  Mean: {lowest['mean']:.1f}%, Median: {lowest['median']:.1f}%")

Höchste Zustimmung:
  20140518: Bundesbeschluss über die medizinische Grundversorgung (direk...
  Mean: 87.3%, Median: 87.8%

Tiefste Zustimmung:
  20150308: Volksinitiative «Energie- statt Mehrwertsteuer»...
  Mean: 6.1%, Median: 5.8%


In [None]:
# Prepare data for vertical boxplot
boxplot_data = pd.DataFrame({
    'Ja-Anteil (%)': pd.concat([values_lowest, values_highest]),
    'Vorlage': ['Tiefste'] * len(values_lowest) + ['Hoechste'] * len(values_highest)
})

# Create vertical boxplot with seaborn
fig, ax = plt.subplots(figsize=(8, 8))

# Vertical boxplot
palette = {'Tiefste': '#e74c3c', 'Hoechste': '#27ae60'}
sns.boxplot(data=boxplot_data, x='Vorlage', y='Ja-Anteil (%)', 
            palette=palette, width=0.5, ax=ax,
            flierprops={'marker': 'o', 'markersize': 4, 'alpha': 0.5})

# 50% threshold line
ax.axhline(50, color='#f39c12', linestyle='-', linewidth=2.5, alpha=0.8, zorder=0)

ax.set_ylabel('Ja-Anteil (%)', fontsize=12)
ax.set_xlabel('')
ax.set_ylim(0, 100)
ax.set_title('Vergleich: Extremste Abstimmungsergebnisse', fontsize=14, fontweight='bold', pad=15)

# Remove x-axis labels (will be in legend)
ax.set_xticklabels(['', ''])

# Clean up spines
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)

# Create custom legend with full titles below the plot
from matplotlib.patches import Patch
legend_elements = [
    Patch(facecolor='#e74c3c', edgecolor='darkred',
          label=f"Tiefste Zustimmung (Mean: {lowest['mean']:.1f}%):\n{lowest['voting_date']}: {lowest['title_de']}"),
    Patch(facecolor='#27ae60', edgecolor='darkgreen',
          label=f"Hoechste Zustimmung (Mean: {highest['mean']:.1f}%):\n{highest['voting_date']}: {highest['title_de']}"),
    plt.Line2D([0], [0], color='#f39c12', linewidth=2.5, label='50% Schwelle')
]

ax.legend(handles=legend_elements, loc='upper center', bbox_to_anchor=(0.5, -0.08),
          ncol=1, fontsize=10, frameon=True, fancybox=True, shadow=False)

plt.tight_layout()
plt.subplots_adjust(bottom=0.30)
plt.savefig(OUTPUT_DIR / 'boxplot_extreme_comparison.png', dpi=150, facecolor='white',
            bbox_inches='tight')
plt.show()
print(f"Saved extreme comparison boxplot to {OUTPUT_DIR / 'boxplot_extreme_comparison.png'}")

## 9. Zusammenfassung

In [12]:
print("="*60)
print("EDA ZUSAMMENFASSUNG")
print("="*60)
print(f"Anzahl Vorlagen: {len(proposals)}")
print(f"Anzahl Gemeinden: {df['municipality_id'].nunique()}")
print(f"Zeitraum: {df['voting_date'].min()} bis {df['voting_date'].max()}")
print()
print("Deskriptive Statistik (ja_prozent über alle Vorlagen):")
print(f"  Mittlerer Durchschnitt: {eda_results['mean'].mean():.1f}%")
print(f"  Min (Vorlage): {eda_results['mean'].min():.1f}%")
print(f"  Max (Vorlage): {eda_results['mean'].max():.1f}%")
print()
print("Normalverteilung (Shapiro-Wilk, alpha=0.05):")
print(f"  Normalverteilt: {int(eda_results['is_normal_alpha05'].sum())} Vorlagen")
print(f"  Nicht normalverteilt: {int((~eda_results['is_normal_alpha05']).sum())} Vorlagen")
print()
print("Gespeicherte Dateien:")
print(f"  - {OUTPUT_DIR / 'voting_eda_results.csv'}")
print(f"  - {OUTPUT_DIR / 'histograms/'} ({len(proposals)} Dateien)")
print(f"  - {OUTPUT_DIR / 'boxplots/'} ({len(proposals)} Dateien)")
print(f"  - {OUTPUT_DIR / 'histogram_overlay_extreme.png'}")
print(f"  - {OUTPUT_DIR / 'boxplot_extreme_comparison.png'}")
print("="*60)

EDA ZUSAMMENFASSUNG
Anzahl Vorlagen: 223
Anzahl Gemeinden: 2038
Zeitraum: 20000312 bis 20250928

Deskriptive Statistik (ja_prozent über alle Vorlagen):
  Mittlerer Durchschnitt: 45.9%
  Min (Vorlage): 6.1%
  Max (Vorlage): 87.3%

Normalverteilung (Shapiro-Wilk, alpha=0.05):
  Normalverteilt: 3 Vorlagen
  Nicht normalverteilt: 220 Vorlagen

Gespeicherte Dateien:
  - Votings/voting_eda_results.csv
  - Votings/histograms (223 Dateien)
  - Votings/boxplots (223 Dateien)
  - Votings/histogram_overlay_extreme.png
  - Votings/boxplot_extreme_comparison.png
