# Swiss Voting Data - Factor Analysis

This notebook performs factor analysis on Swiss federal voting data (2000-2025) to create a spatial representation of the political landscape of Swiss municipalities.

## Goal
- Extract 2-3 latent dimensions from voting patterns
- Label dimensions (e.g., left-right, conservative-liberal)
- Position municipalities in the resulting political space

## Data Source
- 223 federal votes from 2000-2025
- 2,121 municipalities (analysis-ready structure with mergers handled)
- Data exported from `v_voting_results_analysis` view

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import FactorAnalysis
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
import warnings
# Silence only deprecation-style noise from the plotting/analysis libraries.
# A blanket filterwarnings('ignore') would also hide scikit-learn's
# ConvergenceWarning, which is the only signal that a FactorAnalysis fit
# stopped at max_iter without converging.
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=DeprecationWarning)

# Set plotting style (notebook-wide defaults; individual cells override figsize)
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 8)
plt.rcParams['font.size'] = 10

## 1. Load and Prepare Data

In [None]:
# Read the exported voting results into a single wide dataframe.
df = pd.read_csv('data/voting_results_export.csv')

# Quick sanity check on what was loaded: size and the leading column names.
leading_columns = df.columns[:10].tolist()
print(f"Total rows loaded: {df.shape[0]}")
print(f"Total columns: {df.shape[1]}")
print(f"\nFirst few columns: {leading_columns}")

In [None]:
# Keep municipality-level rows only; district and canton rows are just counted.
geo_type = df['geo_type']
df_muni = df.loc[geo_type == 'municipality'].copy()

print(f"Municipalities: {len(df_muni)}")
print(f"Districts: {(geo_type == 'district').sum()}")
print(f"Cantons: {(geo_type == 'canton').sum()}")

In [None]:
# Select the columns whose name ends in '_pct' (the Yes-vote percentages),
# preserving their order in the dataframe.
pct_cols = list(filter(lambda name: name.endswith('_pct'), df_muni.columns))

print(f"Number of voting proposals with percentage data: {len(pct_cols)}")
print(f"\nExample columns: {pct_cols[:5]}")

In [None]:
# Split the municipality rows into identifiers and the vote-percentage matrix.
# Both are independent copies that keep df_muni's row index.
municipality_info = df_muni.loc[:, ['geo_id', 'geo_name']].copy()
X = df_muni.loc[:, pct_cols].copy()

# Report how much data is missing before any rows are dropped.
missing_cells = X.isna()
print(f"Shape before removing NULLs: {X.shape}")
print(f"Number of NaN values: {missing_cells.sum().sum()}")
print(f"Rows with any NaN: {missing_cells.any(axis=1).sum()}")

In [None]:
# Listwise deletion: keep only municipalities with a value for every vote.
# The same boolean mask is applied to the identifier frame so the two stay
# row-aligned.
mask_complete = X.notna().all(axis=1)
X_clean = X.loc[mask_complete].copy()
municipality_info_clean = municipality_info.loc[mask_complete].copy()

print(f"Shape after removing NULLs: {X_clean.shape}")
print(f"Municipalities retained: {len(X_clean)}")
print(f"Municipalities removed: {len(X) - len(X_clean)}")
print(f"Percentage retained: {len(X_clean)/len(X)*100:.1f}%")

## 2. Data Exploration

In [None]:
# Per-vote descriptive statistics of the complete-case matrix.
summary_stats = X_clean.describe()
print("Summary Statistics of Voting Percentages:")
print(summary_stats)

In [None]:
# Two histograms side by side: every (municipality, vote) cell on the left,
# per-municipality averages on the right.
fig, (ax_all, ax_muni) = plt.subplots(1, 2, figsize=(15, 5))

# Left panel: pooled distribution, with the 50% line marked.
ax_all.hist(X_clean.values.flatten(), bins=50, edgecolor='black', alpha=0.7)
ax_all.set(
    xlabel='Yes Vote Percentage',
    ylabel='Frequency',
    title='Distribution of Yes Votes Across All Municipalities and Votings',
)
ax_all.axvline(50, color='red', linestyle='--', label='50% threshold')
ax_all.legend()

# Right panel: mean Yes share per municipality, with the mean of those means.
mean_yes = X_clean.mean(axis=1)
overall_mean = mean_yes.mean()
ax_muni.hist(mean_yes, bins=30, edgecolor='black', alpha=0.7, color='green')
ax_muni.set(
    xlabel='Average Yes Vote Percentage',
    ylabel='Number of Municipalities',
    title='Average Yes Vote % per Municipality',
)
ax_muni.axvline(overall_mean, color='red', linestyle='--', label=f'Overall mean: {overall_mean:.1f}%')
ax_muni.legend()

fig.tight_layout()
plt.show()

print(f"Overall mean Yes percentage: {X_clean.values.mean():.2f}%")
print(f"Std deviation: {X_clean.values.std():.2f}%")

## 3. Standardize Data

In [None]:
# Z-score every vote column so each one has mean 0 and unit variance.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_clean)

# The mean and std printed here are taken over the whole scaled matrix.
scaled_mean = X_scaled.mean()
scaled_std = X_scaled.std()
print(f"Scaled data shape: {X_scaled.shape}")
print(f"Mean after scaling: {scaled_mean:.6f}")
print(f"Std after scaling: {scaled_std:.6f}")

## 4. Factor Analysis - 2 Dimensions

In [None]:
# Fit a 2-factor model on the standardized vote matrix and get the factor
# scores (one row per municipality, one column per factor).
fa_2d = FactorAnalysis(n_components=2, random_state=42, max_iter=1000)
factors_2d = fa_2d.fit_transform(X_scaled)

# Get factor loadings. components_ is (n_factors, n_votes); transpose it so
# each row is a vote and each column a factor.
loadings_2d = fa_2d.components_.T

print(f"2D Factor Analysis Complete")
print(f"Factor scores shape: {factors_2d.shape}")
print(f"Loadings shape: {loadings_2d.shape}")
# noise_variance_ is the per-vote variance the factors do NOT account for, so
# it is reported as unexplained variance rather than "variance explained".
print(f"\nMean noise variance (unexplained; lower is better): {fa_2d.noise_variance_.mean():.4f}")
# Communality = sum of squared loadings per vote, i.e. the share of a
# standardized vote's variance that the factors reproduce.
print(f"Mean communality (variance captured by factors): {(loadings_2d ** 2).sum(axis=1).mean():.4f}")

In [None]:
# Scatter of the 2-factor scores, one point per municipality, with zero
# reference lines on both axes.
fig, ax = plt.subplots(figsize=(14, 10))
ax.scatter(factors_2d[:, 0], factors_2d[:, 1], alpha=0.5, s=20)
ax.set_xlabel('Factor 1', fontsize=14)
ax.set_ylabel('Factor 2', fontsize=14)
ax.set_title('Swiss Municipalities in 2D Political Space\n(Factor Analysis on 223 Federal Votes, 2000-2025)', fontsize=16)
ax.axhline(0, color='gray', linestyle='--', alpha=0.5)
ax.axvline(0, color='gray', linestyle='--', alpha=0.5)
ax.grid(True, alpha=0.3)
fig.tight_layout()
plt.show()

# Report the spread of scores along each factor.
factor_1_scores = factors_2d[:, 0]
factor_2_scores = factors_2d[:, 1]
print(f"Factor 1 range: [{factor_1_scores.min():.2f}, {factor_1_scores.max():.2f}]")
print(f"Factor 2 range: [{factor_2_scores.min():.2f}, {factor_2_scores.max():.2f}]")

## 5. Factor Analysis - 3 Dimensions

In [None]:
# Fit a 3-factor model on the same standardized vote matrix and get the
# factor scores (one row per municipality, one column per factor).
fa_3d = FactorAnalysis(n_components=3, random_state=42, max_iter=1000)
factors_3d = fa_3d.fit_transform(X_scaled)

# Get factor loadings. components_ is (n_factors, n_votes); transpose it so
# each row is a vote and each column a factor.
loadings_3d = fa_3d.components_.T

print(f"3D Factor Analysis Complete")
print(f"Factor scores shape: {factors_3d.shape}")
print(f"Loadings shape: {loadings_3d.shape}")
# noise_variance_ is the per-vote variance the factors do NOT account for, so
# it is reported as unexplained variance rather than "variance explained".
print(f"\nMean noise variance (unexplained; lower is better): {fa_3d.noise_variance_.mean():.4f}")
# Communality = sum of squared loadings per vote, i.e. the share of a
# standardized vote's variance that the factors reproduce.
print(f"Mean communality (variance captured by factors): {(loadings_3d ** 2).sum(axis=1).mean():.4f}")

In [None]:
# Pairwise 2D projections of the 3-factor scores, one panel per factor pair.
fig, axes = plt.subplots(1, 3, figsize=(18, 5))

# Each entry: (x column, x label, y column, y label, extra scatter kwargs).
# The first panel passes no colour and so uses matplotlib's default.
projection_panels = [
    (0, 'Factor 1', 1, 'Factor 2', {}),
    (0, 'Factor 1', 2, 'Factor 3', {'color': 'green'}),
    (1, 'Factor 2', 2, 'Factor 3', {'color': 'red'}),
]

for panel_ax, (x_col, x_label, y_col, y_label, extra_kwargs) in zip(axes, projection_panels):
    panel_ax.scatter(factors_3d[:, x_col], factors_3d[:, y_col], alpha=0.5, s=20, **extra_kwargs)
    panel_ax.set_xlabel(x_label)
    panel_ax.set_ylabel(y_label)
    panel_ax.set_title(f'{x_label} vs {y_label}')
    panel_ax.axhline(0, color='gray', linestyle='--', alpha=0.5)
    panel_ax.axvline(0, color='gray', linestyle='--', alpha=0.5)
    panel_ax.grid(True, alpha=0.3)

fig.tight_layout()
plt.show()

In [None]:
# 3D scatter of the 3-factor scores, coloured by the Factor 1 score.
# The Axes3D import is kept exactly as in the original cell.
from mpl_toolkits.mplot3d import Axes3D

score_f1 = factors_3d[:, 0]
score_f2 = factors_3d[:, 1]
score_f3 = factors_3d[:, 2]

fig = plt.figure(figsize=(12, 10))
ax = fig.add_subplot(111, projection='3d')

ax.scatter(score_f1, score_f2, score_f3,
           alpha=0.5, s=20, c=score_f1, cmap='RdBu_r')
ax.set_xlabel('Factor 1', fontsize=12)
ax.set_ylabel('Factor 2', fontsize=12)
ax.set_zlabel('Factor 3', fontsize=12)
ax.set_title('Swiss Municipalities in 3D Political Space', fontsize=14)

fig.tight_layout()
plt.show()

## 6. Compare 2D vs 3D Models

In [None]:
# Compare the two fits by their mean per-vote noise variance, and express the
# reduction from 2 to 3 factors relative to the 2-factor value.
noise_2d = fa_2d.noise_variance_.mean()
noise_3d = fa_3d.noise_variance_.mean()
relative_gain_pct = (noise_2d - noise_3d) / noise_2d * 100

print("Model Comparison:")
print("=" * 60)
print(f"2D Model - Average noise variance: {noise_2d:.4f}")
print(f"3D Model - Average noise variance: {noise_3d:.4f}")
print(f"\nImprovement with 3D: {relative_gain_pct:.2f}%")
print("\nLower noise variance = better model fit")

## 7. Analyze Factor Loadings

Identify which votes load most heavily on each factor, in order to interpret the dimensions.

In [None]:
# Create dataframe of loadings for 2D model: one row per vote column
# (indexed by the '_pct' column name), one column per factor.
loadings_2d_df = pd.DataFrame(
    loadings_2d,
    columns=['Factor_1', 'Factor_2'],
    index=pct_cols
)

# Extract proposal IDs (the digits right before '_pct') from column names.
# expand=False returns an Index in row order, which is assigned positionally.
# The default expand=True returns a frame on a fresh 0..n-1 index; assigning
# its column to this name-indexed frame aligns on labels and leaves every
# proposal_id NaN.
loadings_2d_df['proposal_id'] = loadings_2d_df.index.str.extract(r'(\d+)_pct', expand=False)

print("Top 10 Votings Loading on Factor 1 (Positive):")
print(loadings_2d_df.nlargest(10, 'Factor_1')[['Factor_1', 'Factor_2']])
print("\nTop 10 Votings Loading on Factor 1 (Negative):")
print(loadings_2d_df.nsmallest(10, 'Factor_1')[['Factor_1', 'Factor_2']])

In [None]:
# Ten strongest positive and negative Factor 2 loadings, shown with both
# factor columns for context.
factor_columns = ['Factor_1', 'Factor_2']
top_positive_f2 = loadings_2d_df.nlargest(10, 'Factor_2')
top_negative_f2 = loadings_2d_df.nsmallest(10, 'Factor_2')

print("\nTop 10 Votings Loading on Factor 2 (Positive):")
print(top_positive_f2[factor_columns])
print("\nTop 10 Votings Loading on Factor 2 (Negative):")
print(top_negative_f2[factor_columns])

In [None]:
# Histogram of loading values for each factor of the 2-factor model, with a
# vertical line at zero.
fig, axes = plt.subplots(1, 2, figsize=(16, 6))

# Each entry: (loading column, panel title, extra hist kwargs).
# The first panel passes no colour and so uses matplotlib's default.
histogram_panels = [
    (0, 'Distribution of Factor 1 Loadings', {}),
    (1, 'Distribution of Factor 2 Loadings', {'color': 'green'}),
]

for panel_ax, (factor_col, panel_title, extra_kwargs) in zip(axes, histogram_panels):
    panel_ax.hist(loadings_2d[:, factor_col], bins=30, edgecolor='black', alpha=0.7, **extra_kwargs)
    panel_ax.set_xlabel('Loading Value')
    panel_ax.set_ylabel('Number of Votings')
    panel_ax.set_title(panel_title)
    panel_ax.axvline(0, color='red', linestyle='--')

fig.tight_layout()
plt.show()

In [None]:
# One point per vote: its Factor 1 loading against its Factor 2 loading.
fig, ax = plt.subplots(figsize=(12, 10))
ax.scatter(loadings_2d[:, 0], loadings_2d[:, 1], alpha=0.6, s=50)
ax.set_xlabel('Factor 1 Loading', fontsize=12)
ax.set_ylabel('Factor 2 Loading', fontsize=12)
ax.set_title('Factor Loadings: How Each Voting Contributes to Each Dimension', fontsize=14)
ax.axhline(0, color='gray', linestyle='--', alpha=0.5)
ax.axvline(0, color='gray', linestyle='--', alpha=0.5)
ax.grid(True, alpha=0.3)
fig.tight_layout()
plt.show()

## 8. Label Dimensions Based on Key Votings

Connect to database to get actual voting titles for interpretation

In [None]:
import sqlite3

# Connect to database.
# NOTE: sqlite3.connect() creates an empty file if the path does not exist; in
# that case the query below fails with "no such table".
conn = sqlite3.connect('data/swiss_votings.db')

# Get proposal information (one row per proposal, joined to its voting date).
# The connection is closed in `finally` so the file handle is released even
# when the query raises.
try:
    proposals_df = pd.read_sql_query("""
    SELECT 
        p.proposal_id,
        p.vorlage_id,
        p.title_de,
        p.angenommen,
        v.voting_date
    FROM proposals p
    JOIN votings v ON p.voting_id = v.voting_id
    ORDER BY v.voting_date
""", conn)
finally:
    conn.close()

print(f"Loaded {len(proposals_df)} proposals")
print(proposals_df.head())

In [None]:
# Merge loadings with proposal titles
loadings_2d_df_merged = loadings_2d_df.copy()

# Derive proposal_id directly from the vote column names held in the index.
# expand=False keeps the result in row order so it is assigned positionally;
# taking it from the index means this cell does not depend on the
# proposal_id column built earlier. Names without digits before '_pct'
# become NaN.
loadings_2d_df_merged['proposal_id'] = pd.to_numeric(
    loadings_2d_df_merged.index.str.extract(r'(\d+)_pct', expand=False),
    errors='coerce'
)

# Remove rows with NaN proposal_id, then cast to int for the join.
# NOTE(review): assumes proposals_df['proposal_id'] is an integer column - confirm.
loadings_2d_df_merged = loadings_2d_df_merged[loadings_2d_df_merged['proposal_id'].notna()].copy()
loadings_2d_df_merged['proposal_id'] = loadings_2d_df_merged['proposal_id'].astype(int)

# Merge with proposal information. A left join keeps every loading row, so the
# row count says nothing about matches; the indicator column records which
# rows actually found a proposal.
loadings_2d_df_merged = loadings_2d_df_merged.merge(
    proposals_df[['proposal_id', 'title_de', 'voting_date', 'angenommen']], 
    on='proposal_id', 
    how='left',
    indicator=True
)
n_matched = int((loadings_2d_df_merged['_merge'] == 'both').sum())
loadings_2d_df_merged = loadings_2d_df_merged.drop(columns='_merge')

print(f"Successfully matched {n_matched} of {len(loadings_2d_df_merged)} proposals")
print("\n=== FACTOR 1: Top Positive Loadings ===")
print(loadings_2d_df_merged.nlargest(10, 'Factor_1')[['voting_date', 'title_de', 'Factor_1', 'angenommen']])

print("\n=== FACTOR 1: Top Negative Loadings ===")
print(loadings_2d_df_merged.nsmallest(10, 'Factor_1')[['voting_date', 'title_de', 'Factor_1', 'angenommen']])

In [None]:
# Ten strongest positive and negative Factor 2 loadings, with vote date,
# German title and the 'angenommen' column.
factor_2_view = ['voting_date', 'title_de', 'Factor_2', 'angenommen']
strongest_positive_f2 = loadings_2d_df_merged.nlargest(10, 'Factor_2')
strongest_negative_f2 = loadings_2d_df_merged.nsmallest(10, 'Factor_2')

print("\n=== FACTOR 2: Top Positive Loadings ===")
print(strongest_positive_f2[factor_2_view])

print("\n=== FACTOR 2: Top Negative Loadings ===")
print(strongest_negative_f2[factor_2_view])

## 9. Create Final Political Space Visualization

Plot municipalities with dimension labels based on interpretation of factor loadings

In [None]:
# Final 2D figure of the municipality factor scores.
# Axis and quadrant labels are generic placeholders; replace them once the
# factors have been interpreted from the loadings.

fig, ax = plt.subplots(figsize=(16, 12))

# Points are coloured by their Factor 1 score.
scatter = ax.scatter(factors_2d[:, 0], factors_2d[:, 1],
                     c=factors_2d[:, 0], cmap='RdBu_r',
                     alpha=0.6, s=30, edgecolors='black', linewidth=0.5)

# Placeholder axis labels and title
ax.set_xlabel('Factor 1 (Dimension 1)', fontsize=14, fontweight='bold')
ax.set_ylabel('Factor 2 (Dimension 2)', fontsize=14, fontweight='bold')
ax.set_title('Swiss Municipalities in Political Space\nFactor Analysis on 223 Federal Votes (2000-2025)',
             fontsize=16, fontweight='bold', pad=20)

# Zero reference lines on both axes
ax.axhline(0, color='gray', linestyle='--', alpha=0.5, linewidth=1.5)
ax.axvline(0, color='gray', linestyle='--', alpha=0.5, linewidth=1.5)

# Colorbar for the Factor 1 colouring
cbar = fig.colorbar(scatter, ax=ax, label='Factor 1 Score')
cbar.set_label('Factor 1 Score', fontsize=12)

# Light dotted grid
ax.grid(True, alpha=0.3, linestyle=':')

# Quadrant labels in axes-fraction coordinates, one per corner:
# (x, y, text, horizontal alignment, vertical alignment)
quadrant_labels = [
    (0.95, 0.95, 'Quadrant I', 'right', 'top'),
    (0.05, 0.95, 'Quadrant II', 'left', 'top'),
    (0.05, 0.05, 'Quadrant III', 'left', 'bottom'),
    (0.95, 0.05, 'Quadrant IV', 'right', 'bottom'),
]
for x_frac, y_frac, corner_text, h_align, v_align in quadrant_labels:
    ax.text(x_frac, y_frac, corner_text, transform=ax.transAxes,
            fontsize=10, ha=h_align, va=v_align, alpha=0.5)

fig.tight_layout()
fig.savefig('political_space_2d.png', dpi=300, bbox_inches='tight')
plt.show()

print("Plot saved as 'political_space_2d.png'")

## 10. Export Results

In [None]:
# Attach factor scores to the municipality identifiers. The score arrays are
# added by position, and municipality_info_clean was filtered with the same
# mask as the matrix the models were fitted on.
results_2d = municipality_info_clean.assign(
    factor_1=factors_2d[:, 0],
    factor_2=factors_2d[:, 1],
)

results_3d = municipality_info_clean.assign(
    factor_1=factors_3d[:, 0],
    factor_2=factors_3d[:, 1],
    factor_3=factors_3d[:, 2],
)

# Write both tables to CSV without the dataframe index.
results_2d.to_csv('municipality_political_space_2d.csv', index=False)
results_3d.to_csv('municipality_political_space_3d.csv', index=False)

print("Results exported:")
print("  - municipality_political_space_2d.csv")
print("  - municipality_political_space_3d.csv")
print(f"\nTotal municipalities positioned: {len(results_2d)}")

## Summary

This notebook performed factor analysis on Swiss federal voting data to create a spatial representation of the political landscape.

### Key Findings:
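1. **Data**: Started from 223 voting proposals across 2,121 municipalities; municipalities with a missing result for any vote were dropped before fitting, so the number actually positioned may be lower (see the "Municipalities retained" output in Section 1)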
2. **Models**: Compared 2D and 3D factor analysis
3. **Dimensions**: Interpreted based on factor loadings
4. **Output**: Municipality positions in political space

### Next Steps:
- Interpret dimensions based on high-loading votings
- Label axes (e.g., left-right, conservative-liberal)
- Analyze clustering patterns
- Compare with canton or language region groupings