# EDA: Gemeindemerkmale (Features)

Explorative Datenanalyse der kategorischen Gemeindemerkmale aus `municipality_features_2024`.

**Inhalt:**
- Haeufigkeitsverteilungen aller Features
- Balkendiagramme mit Labels
- Export der Haeufigkeitstabellen als CSV

In [None]:
import sqlite3
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

# Input database and output folder used by every cell below
DB_PATH = Path('../data/processed/swiss_votings.db')
OUTPUT_DIR = Path('Features')
OUTPUT_DIR.mkdir(exist_ok=True)  # tolerate re-running the notebook

# Plot defaults: seaborn theme first, matplotlib overrides applied on top
sns.set_theme(style="whitegrid", palette="muted")
plt.rcParams.update({
    'figure.figsize': (10, 6),
    'font.size': 11,
    'axes.titlesize': 12,
    'axes.labelsize': 11,
})

print(f"Database: {DB_PATH}")
print(f"Output directory: {OUTPUT_DIR}")

## 1. Daten laden

In [None]:
# sqlite3.connect() silently creates an empty database file when the path does
# not exist, which would only surface later as a confusing "no such table"
# error. Fail early with a clear message instead.
if not DB_PATH.is_file():
    raise FileNotFoundError(f"Database not found: {DB_PATH}")

conn = sqlite3.connect(DB_PATH)
try:
    # Feature table: identifier columns plus the categorical feature columns
    df = pd.read_sql_query("SELECT * FROM municipality_features_2024", conn)

    # Label table: the cells below read its 'feature_name', 'code' and
    # 'label' columns
    labels_df = pd.read_sql_query("SELECT * FROM feature_labels_2024", conn)
finally:
    # Release the connection even if one of the queries raises
    conn.close()

print(f"Gemeinden: {len(df)}")
print(f"Features: {len(df.columns) - 6} (ohne ID-Spalten)")
print(f"Labels verfuegbar: {labels_df['feature_name'].nunique()}")

In [None]:
# Identifier columns are kept out of the feature analysis
id_cols = ['bfs_nr', 'gemeindename', 'kanton_nr', 'kanton', 'bezirk_nr', 'bezirksname']
feature_cols = [name for name in df.columns if name not in id_cols]

# Columns treated as duplicates of another feature and therefore skipped
duplicate_cols = ('agglomerationen_2020_1', 'urbanisierungsgrad_degurba_eurostat_1')
feature_cols_clean = [name for name in feature_cols if name not in duplicate_cols]

print(f"Feature-Spalten fuer Analyse: {len(feature_cols_clean)}")
for col in feature_cols_clean:
    print(f"  - {col}")

In [None]:
# Create label lookup function
def get_labels(feature_name, labels=None):
    """Return the code -> label mapping for one feature.

    Parameters
    ----------
    feature_name : str
        Value to look up in the 'feature_name' column of the label table.
    labels : pandas.DataFrame, optional
        Label table with the columns 'feature_name', 'code' and 'label'.
        Defaults to the notebook-level ``labels_df``, so existing
        one-argument calls keep working.

    Returns
    -------
    dict
        Maps each 'code' of the feature to its 'label'. Empty when the
        table holds no rows for ``feature_name``.
    """
    if labels is None:
        labels = labels_df
    subset = labels[labels['feature_name'] == feature_name]
    # zip() over an empty selection already yields an empty dict, so no
    # separate "no rows" branch is needed.
    return dict(zip(subset['code'], subset['label']))

# Smoke test: show the mapping returned for one feature
print("Beispiel Labels fuer 'sprachgebiete':")
example_labels = get_labels('sprachgebiete')
print(example_labels)

## 2. Haeufigkeitsverteilungen berechnen

In [None]:
def compute_frequency_table(df, feature_name):
    """Build a labelled frequency table for one feature column.

    Returns a DataFrame with the columns 'code', 'label', 'count' and
    'percent', one row per distinct value, ordered by code.

    Missing values are not counted (``value_counts`` drops them by
    default), but 'percent' is relative to all rows of ``df``, so the
    percentages add up to less than 100 when the column has missing values.
    Codes without an entry in the label mapping are labelled 'Unbekannt'.
    """
    occurrences = df[feature_name].value_counts().sort_index()
    n_rows = len(df)

    table = pd.DataFrame({
        'code': occurrences.index,
        'count': occurrences.values,
    })
    table['percent'] = (occurrences.values / n_rows * 100).round(2)

    # Place the readable label right after the code it belongs to
    code_to_label = get_labels(feature_name)
    table.insert(1, 'label', table['code'].map(code_to_label).fillna('Unbekannt'))

    return table

# One frequency table per analysed feature, keyed by column name
freq_tables = {
    feature: compute_frequency_table(df, feature)
    for feature in feature_cols_clean
}

print(f"Haeufigkeitstabellen erstellt: {len(freq_tables)}")

## 3. Haeufigkeitstabellen anzeigen und exportieren

In [None]:
# Tag every table with its feature name (as the first column) so the tables
# can be stacked into a single long-format file
all_freq = [
    table.assign(feature=name)[['feature', *table.columns]]
    for name, table in freq_tables.items()
]
all_freq_df = pd.concat(all_freq, ignore_index=True)

# Write the stacked table next to the other notebook outputs
csv_path = OUTPUT_DIR / 'feature_frequencies.csv'
all_freq_df.to_csv(csv_path, index=False)
print(f"Alle Haeufigkeiten gespeichert: {csv_path}")
print(f"Total Zeilen: {len(all_freq_df)}")

In [None]:
def _summarize_feature(col):
    """Return one summary row (as a dict) for the feature column ``col``."""
    values = df[col]
    # With several equally frequent values, the first mode pandas returns wins
    top_code = values.mode()[0]
    top_label = get_labels(col).get(top_code, 'Unbekannt')
    top_share = (values == top_code).mean() * 100
    return {
        'feature': col,
        'n_kategorien': values.nunique(),
        'haeufigste_kategorie': top_label[:40],  # keep the table narrow
        'anteil_pct': round(top_share, 1),
    }


# One row per feature: number of categories and the dominant category
summary_data = [_summarize_feature(col) for col in feature_cols_clean]

summary_df = pd.DataFrame(summary_data)
summary_df.to_csv(OUTPUT_DIR / 'feature_summary.csv', index=False)
print("Feature-Zusammenfassung:")
summary_df

## 4. Balkendiagramme erstellen

In [None]:
# Per-feature charts go into their own sub-folder of the output directory
plots_dir = OUTPUT_DIR.joinpath('plots')
plots_dir.mkdir(exist_ok=True)

def create_bar_chart(feature_name, freq_df, output_path):
    """Save a horizontal bar chart of one feature's category shares.

    Parameters
    ----------
    feature_name : str
        Shown as the chart title.
    freq_df : pandas.DataFrame
        Frequency table with the columns 'label', 'count' and 'percent'.
        An optional 'code' column is used to tell apart categories that
        share the same (truncated) label.
    output_path : str or path-like
        File the figure is written to.

    Notes
    -----
    Bars are drawn with ``Axes.barh`` instead of ``sns.barplot``.
    ``barplot`` aggregates all rows sharing a y value into one bar, so
    categories with identical labels (e.g. several 'Unbekannt') collapsed
    into a single bar and the percentage annotations no longer lined up.
    Passing ``palette`` without ``hue`` is also deprecated since
    seaborn 0.13.
    """
    # Limit to max 20 categories for readability
    if len(freq_df) > 20:
        plot_df = freq_df.nlargest(20, 'count').copy()
        title_suffix = ' (Top 20)'
    else:
        plot_df = freq_df.copy()
        title_suffix = ''

    # Largest category first; it is drawn at the top of the chart
    plot_df = plot_df.sort_values('count', ascending=False)

    # Truncate long labels; astype(str) guards against non-string labels
    labels = plot_df['label'].astype(str)
    plot_df['label_short'] = labels.where(
        labels.str.len() <= 40, labels.str[:40] + '...'
    )

    # Keep categories with identical labels distinguishable on the axis
    duplicated = plot_df['label_short'].duplicated(keep=False)
    if 'code' in plot_df.columns and duplicated.any():
        plot_df.loc[duplicated, 'label_short'] = (
            plot_df.loc[duplicated, 'label_short']
            + ' [' + plot_df.loc[duplicated, 'code'].astype(str) + ']'
        )

    n_bars = len(plot_df)
    positions = list(range(n_bars))
    percents = plot_df['percent'].to_numpy()

    # Create figure with dynamic height
    fig_height = max(4, n_bars * 0.45)
    fig, ax = plt.subplots(figsize=(10, fig_height))

    # desat=0.75 mirrors the default saturation sns.barplot applies to bars
    colors = list(sns.color_palette("viridis", n_colors=n_bars, desat=0.75))

    # One bar per table row, positioned by row order rather than by label
    ax.barh(positions, percents, color=colors, edgecolor='white', linewidth=0.5)
    ax.set_yticks(positions)
    ax.set_yticklabels(plot_df['label_short'].tolist())
    ax.set_ylim(n_bars - 0.5, -0.5)  # inverted: first row at the top
    ax.yaxis.grid(False)  # grid lines between categories carry no meaning

    # Percentage labels: inside wide bars, next to narrow ones
    for position, pct in zip(positions, percents):
        if pct > 5:
            ax.text(pct - 0.5, position, f'{pct:.1f}%', va='center', ha='right',
                    color='white', fontweight='bold', fontsize=9)
        else:
            ax.text(pct + 0.5, position, f'{pct:.1f}%', va='center', ha='left',
                    color='#333', fontsize=9)

    ax.set_xlabel('Anteil (%)', fontsize=11)
    ax.set_ylabel('')
    ax.set_title(f'{feature_name}{title_suffix}', fontsize=12, fontweight='bold')
    # Headroom on the right so labels placed next to bars are not cut off
    ax.set_xlim(0, max(plot_df['percent']) * 1.12)

    # Clean up spines
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)

    # Operate on this figure explicitly instead of pyplot's "current" figure
    fig.tight_layout()
    fig.savefig(output_path, dpi=100, bbox_inches='tight', facecolor='white')
    plt.close(fig)

# Plain string: the message has no placeholders to format
print("Erstelle Balkendiagramme...")

In [None]:
# Write one PNG per analysed feature into the plots folder
for feature_name in feature_cols_clean:
    create_bar_chart(
        feature_name,
        freq_tables[feature_name],
        plots_dir / f'bar_{feature_name}.png',
    )

print(f"Gespeichert: {len(feature_cols_clean)} Balkendiagramme in {plots_dir}")

## 5. Wichtigste Features im Detail

In [None]:
# Features whose frequency tables are shown inline in the notebook
key_features = [
    'sprachgebiete',
    'grossregionen_der_schweiz',
    'stadt_land_typologie',
    'gemeindetypologie_9_typen',
    'urbanisierungsgrad_degurba_eurostat',
    'berggebiete',
    'agglomerationsgroessenklasse',
]

separator = '=' * 60
for feature in key_features:
    # Banner with the feature name, followed by its rendered table
    print(f"\n{separator}")
    print(feature.upper())
    print(separator)
    display(freq_tables[feature])

In [None]:
# Overview figure: one panel per key feature, each with its own colour palette.
# Entries are (column name, panel title, seaborn palette name).
plot_features = [
    ('sprachgebiete', 'Sprachgebiete', 'Blues_d'),
    ('stadt_land_typologie', 'Stadt / Land', 'Greens_d'),
    ('grossregionen_der_schweiz', 'Grossregionen', 'Oranges_d'),
    ('urbanisierungsgrad_degurba_eurostat', 'Urbanisierungsgrad', 'Purples_d'),
    ('berggebiete', 'Berggebiete', 'RdYlGn'),
    ('agglomerationsgroessenklasse', 'Agglomerationsgroesse', 'YlOrRd_r')
]

fig, axes = plt.subplots(3, 2, figsize=(14, 11))
axes = axes.flatten()

for ax, (feature, title, palette) in zip(axes, plot_features):
    freq_df = freq_tables[feature].sort_values('count', ascending=False).copy()

    # Truncate long labels; astype(str) guards against non-string labels
    labels = freq_df['label'].astype(str)
    freq_df['label_short'] = labels.where(
        labels.str.len() <= 28, labels.str[:28] + '...'
    )

    n_bars = len(freq_df)
    positions = list(range(n_bars))
    percents = freq_df['percent'].to_numpy()

    # desat=0.75 mirrors the default saturation sns.barplot applies to bars
    colors = list(sns.color_palette(palette, n_colors=n_bars, desat=0.75))

    # Plain matplotlib bars, one per table row. sns.barplot would merge rows
    # that share a label into one aggregated bar, and its use of `palette`
    # without `hue` is deprecated since seaborn 0.13.
    ax.barh(positions, percents, color=colors, edgecolor='white', linewidth=0.5)
    ax.set_yticks(positions)
    ax.set_yticklabels(freq_df['label_short'].tolist())
    ax.set_ylim(n_bars - 0.5, -0.5)  # inverted: largest category at the top
    ax.yaxis.grid(False)

    # Percentage labels: inside the bar only if there is enough space
    for position, pct in zip(positions, percents):
        if pct > 8:
            ax.text(pct - 1, position, f'{pct:.1f}%', va='center', ha='right',
                    color='white', fontweight='bold', fontsize=10)
        else:
            ax.text(pct + 1, position, f'{pct:.1f}%', va='center', ha='left',
                    color='#333', fontsize=10)

    ax.set_xlabel('')
    ax.set_ylabel('')
    ax.set_title(title, fontsize=13, fontweight='bold', pad=10)
    ax.set_xlim(0, max(freq_df['percent']) * 1.15)

    # The values are printed on the bars, so the x axis is stripped down
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    ax.spines['bottom'].set_visible(False)
    ax.tick_params(bottom=False, labelbottom=False)

# Overall title; the municipality count comes from the loaded data instead of
# a hard-coded number that goes stale when the table changes
fig.suptitle(
    f'Gemeindemerkmale der Schweiz\nVerteilung nach Kategorien (n={len(df)} Gemeinden)',
    fontsize=16, fontweight='bold', y=1.02
)

overview_path = OUTPUT_DIR / 'key_features_overview.png'
fig.tight_layout()
fig.savefig(overview_path, dpi=150, bbox_inches='tight',
            facecolor='white', edgecolor='none')
plt.show()
print(f"Gespeichert: {overview_path}")

## 6. Zusammenfassung

In [None]:
# Closing report: data size, features grouped by number of categories,
# and the files written by this notebook
banner = "=" * 60

print(banner)
print("EDA FEATURES - ZUSAMMENFASSUNG")
print(banner)
print(f"Anzahl Gemeinden: {len(df)}")
print(f"Anzahl Features analysiert: {len(feature_cols_clean)}")
print()
print("Verteilung nach Kategorienanzahl:")

# Number of distinct values per analysed feature, computed once
category_counts = [df[col].nunique() for col in feature_cols_clean]

# Each size class pairs its printed label with the test a count must pass
size_classes = [
    ('Binaer (2)', lambda k: k == 2),
    ('3 Kategorien', lambda k: k == 3),
    ('4-10 Kategorien', lambda k: 4 <= k <= 10),
    ('>10 Kategorien', lambda k: k > 10),
]
for label, belongs in size_classes:
    count = sum(1 for k in category_counts if belongs(k))
    print(f"  {label}: {count}")

print()
print("Gespeicherte Dateien:")
print(f"  - {OUTPUT_DIR / 'feature_frequencies.csv'}")
print(f"  - {OUTPUT_DIR / 'feature_summary.csv'}")
print(f"  - {OUTPUT_DIR / 'key_features_overview.png'}")
print(f"  - {OUTPUT_DIR / 'plots/'} ({len(feature_cols_clean)} Dateien)")
print(banner)