# Analyse auf Basis des Wahl-O-Mat-Datensatzes

### Imports

In [24]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go
import sklearn.cluster as cluster
from sklearn.metrics import normalized_mutual_info_score
import warnings
import dash
from dash import dcc, html
import plotly.io as pio

warnings.filterwarnings("ignore")

## 2025

### Daten einlesen

In [25]:
# Path of the semicolon-separated CSV export to analyse.
file = 'data/2025.csv'

df = pd.read_csv(file, sep=';')
# Tag every row with the year taken from the file name. Strip the directory
# part first: splitting the whole path on '.' would yield 'data/2025'.
df['Jahr'] = file.rsplit('/', 1)[-1].split('.')[0]


In [26]:
def pivot_df(df):
    """Reshape the long answer table into a numeric party x thesis matrix.

    Args:
        df: Long-format frame with the columns 'Partei: Kurzbezeichnung',
            'These: Titel' and 'Position: Position'.

    Returns:
        DataFrame indexed by party short name with one float column per
        thesis title: 1.0 for 'stimme zu', 0.5 for 'neutral' and 0.0 for
        'stimme nicht zu'. Party/thesis pairs without an answer are NaN.

    Raises:
        ValueError: If a position holds a label other than the three above.
    """
    position_scores = {'stimme zu': 1, 'neutral': 0.5, 'stimme nicht zu': 0}
    df_fixed = df.pivot_table(
        index='Partei: Kurzbezeichnung',
        columns='These: Titel',
        values='Position: Position',
        aggfunc='first',
    )
    # Cast explicitly: relying on replace() to turn the object columns numeric
    # depends on the pandas version and can leave object dtype behind.
    return df_fixed.replace(position_scores).astype(float)


In [27]:
# Build the party x thesis score matrix that the analyses below operate on.
df_fixed = pivot_df(df)


### PCA

In [59]:
def plot_heatmap(mat, x_labels, y_labels):
    """Build a Plasma-coloured heatmap figure titled 'Covariance Heatmap'.

    `mat` supplies the cell values, `x_labels` and `y_labels` the tick labels
    of the x and y axis. The figure is returned, not displayed.
    """
    heatmap = go.Heatmap(z=mat, x=x_labels, y=y_labels, colorscale='Plasma')
    fig = go.Figure(data=heatmap)
    fig.update_layout(title='Covariance Heatmap')
    return fig
    

In [60]:
def plot_pcs(pcs, lables):
    """Build a heatmap figure titled 'Principal Components'.

    `pcs` is a 2-D array with one column per component; the columns are
    labelled 'PC 1', 'PC 2', ... on the x axis and `lables` supplies the y-axis
    labels. The figure is returned, not displayed.
    """
    component_names = [f'PC {number}' for number in range(1, pcs.shape[1] + 1)]
    heatmap = go.Heatmap(z=pcs, y=lables, x=component_names, colorscale='Plasma')
    fig = go.Figure(data=heatmap)
    fig.update_layout(title='Principal Components')
    return fig

In [61]:
def plot_cumcum(values):
    """Build a line+marker figure of the cumulative share of `values`.

    The running sum of `values` is divided by their total and plotted against
    the 1-based position. The figure is titled 'Cumulative Explained Variance'
    and is returned, not displayed.
    """
    cumulative_share = values.cumsum() / values.sum()
    positions = list(range(1, len(cumulative_share) + 1))
    trace = go.Scatter(x=positions, y=cumulative_share, mode='lines+markers')
    fig = go.Figure(data=trace)
    fig.update_layout(title='Cumulative Explained Variance')
    return fig

In [62]:
def plot_pca1d(points, labels, colors=None):
    """Build a scatter figure of the first column of `points` on a flat line.

    Every point is drawn at y = 0 with its entry from `labels` above the
    marker; `colors` is passed through as the marker colour. The figure is
    titled 'PCA 1D' and is returned, not displayed.
    """
    trace = go.Scatter(
        x=points[:, 0],
        y=[0 for _ in range(len(points))],
        mode='markers+text',
        text=labels,
        textposition='top center',
        marker=dict(color=colors),
    )
    fig = go.Figure(data=trace)
    fig.update_layout(title='PCA 1D')
    return fig

In [63]:
def plot_pca2d(points, labels, colors=None):
    """Build a labelled scatter figure of the first two columns of `points`.

    Column 0 goes on the x axis and column 1 on the y axis; each marker gets
    its entry from `labels` above it and `colors` as marker colour. The figure
    is titled 'PCA 2D' and is returned, not displayed.
    """
    xs, ys = points[:, 0], points[:, 1]
    trace = go.Scatter(
        x=xs,
        y=ys,
        mode='markers+text',
        text=labels,
        textposition='top center',
        marker=dict(color=colors),
    )
    fig = go.Figure(data=[trace])
    fig.update_layout(title='PCA 2D')
    return fig

In [64]:
def plot_pca3d(points, labels,colors=None):
    """Build a labelled 3-D scatter figure of the first three columns of `points`.

    Columns 0, 1 and 2 map to the x, y and z axis; each marker gets its entry
    from `labels` as text and `colors` as marker colour. The figure is titled
    'PCA 3D' and is returned, not displayed.
    """
    xs, ys, zs = points[:, 0], points[:, 1], points[:, 2]
    trace = go.Scatter3d(
        x=xs,
        y=ys,
        z=zs,
        mode='markers+text',
        text=labels,
        textposition='top center',
        marker=dict(color=colors),
    )
    fig = go.Figure(data=[trace])
    fig.update_layout(title='PCA 3D')
    return fig

In [65]:
# Standardise every column of the score matrix to zero mean / unit spread.
mean = df_fixed.mean(axis=0)
std = df_fixed.std(axis=0)
# Columns without any spread would divide by zero; dividing by 1 instead
# leaves them at 0 after centring.
std[std==0] = 1

data_norm = (df_fixed - mean) / std

# np.cov treats rows as variables, hence the transpose.
cov = np.cov(data_norm.T)
# A covariance matrix is symmetric, so use eigh: it returns real eigenvalues
# and eigenvectors directly, whereas the general eig solver may return complex
# values whose imaginary parts would have to be discarded.
eig_vals, eig_vecs = np.linalg.eigh(cov)

# Order the components from largest to smallest eigenvalue.
sort = np.argsort(eig_vals)[::-1]
eig_vals = eig_vals[sort]
eig_vecs = eig_vecs[:,sort]


# Leading one, two and three principal directions (one column each).
pc_vecs_1d = eig_vecs[:,:1]
pc_vecs_2d = eig_vecs[:,:2]
pc_vecs_3d = eig_vecs[:,:3]

# Coordinates of every row of data_norm along those directions.
pcs_1d = data_norm @ pc_vecs_1d
pcs_2d = data_norm @ pc_vecs_2d
pcs_3d = data_norm @ pc_vecs_3d


# Each figure is written as an HTML fragment (plotly.js loaded from the CDN)
# and then displayed.
plt = plot_cumcum(eig_vals)
plt.write_html('plots/cumsum_2025.html', include_plotlyjs='cdn', full_html=False)
plt.show()

plt = plot_heatmap(cov, df_fixed.columns, df_fixed.columns)
plt.write_html('plots/heatmap_2025.html', include_plotlyjs='cdn', full_html=False)
plt.show()

plt = plot_pcs(pc_vecs_3d, df_fixed.columns)
plt.write_html('plots/pcs_2025.html', include_plotlyjs='cdn',full_html=False)
plt.show()

fig = plot_pca1d(pcs_1d.values, df_fixed.index, ['black'] * pcs_1d.shape[0])
fig.write_html('plots/pca1d_2025.html',include_plotlyjs='cdn', full_html=False)
fig.show()

fig = plot_pca2d(pcs_2d.values, df_fixed.index, ['black'] * pcs_2d.shape[0])
fig.write_html('plots/pca2d_2025.html',include_plotlyjs='cdn', full_html=False)
fig.show()

# Size the colour list from the 3-D projection that is actually plotted.
plt = plot_pca3d(pcs_3d.values, df_fixed.index, ['black'] * pcs_3d.shape[0])
plt.write_html('plots/pca3d_2025.html',include_plotlyjs='cdn', full_html=False)
plt.show()


### Max Covariance

In [35]:
# Visit every unordered pair of distinct columns exactly once: the strict
# upper triangle skips the diagonal (a column paired with itself) and the
# mirrored duplicates of the symmetric matrix. This avoids hard-coding how
# many diagonal entries to skip and assuming mirrored pairs sort adjacently.
pair_rows, pair_cols = np.triu_indices(cov.shape[0], k=1)
descending = np.argsort(cov[pair_rows, pair_cols])[::-1]
top_indices = (pair_rows[descending], pair_cols[descending])

# (column a, column b, covariance), largest covariance first; pairs whose
# covariance is exactly zero are dropped.
top_entries = [(df_fixed.columns[i], df_fixed.columns[j], cov[i,j]) for i, j in zip(*top_indices) if cov[i, j]]
top_10 = top_entries[:10]
bottom_10 = top_entries[-10:]

print('top entries: \n', '\n'.join([f'{i[0]} - {i[1]} : {i[2]:.3f}' for i in top_10]), '\n')
# Reverse so the most negative covariance is printed first.
print('bottom entries: \n', '\n'.join([f'{i[0]} - {i[1]} : {i[2]:.3f}' for i in bottom_10[::-1]]))



top entries: 
 Kontrolle von Zulieferern - Anhebung des Spitzensteuersatzes : 0.902
Anhebung des Spitzensteuersatzes - Zweite Staatsbürgerschaft : 0.897
Schwangerschaftsabbruch nach Beratung - Grundgesetz : 0.892
Begrenzung der Mietpreise - Erhöhung des Mindestlohns : 0.884
Krankenkassen - Kontrolle von Zulieferern : 0.869
Krankenkassen - Schiene vor Straße : 0.854
Kontrolle von Zulieferern - Schiene vor Straße : 0.834
Abweisung Asylsuchender - Umlegung der Grundsteuer : 0.824
Ökologische Landwirtschaft - Tempolimit auf Autobahnen : 0.823
Anhebung des Spitzensteuersatzes - Erhöhung des Mindestlohns : 0.820 

bottom entries: 
 Projekte gegen Rechtsextremismus - Verwerfen der Klimaziele : -1.000
Abweisung Asylsuchender - Kontrolle von Zulieferern : -0.879
Schuldenbremse - Anhebung des Spitzensteuersatzes : -0.874
Automatisierte Gesichtserkennung - Arbeitserlaubnis für Asylsuchende : -0.865
Krankenkassen - Nutzung der Kernenergie : -0.835
Fossile Brennstoffe - Kontrolle von Zulieferern : 

### Fixed size clustering (KMeans)

In [52]:
# Number of leading principal components to cluster in, and cluster count.
DIMS = 38
K = 7

# Project onto the first DIMS components. Slice the full eigenvector matrix:
# pc_vecs_2d only has two columns, so slicing it silently capped DIMS at 2.
points = data_norm @ eig_vecs[:,:DIMS]

# Let KMeans perform the 1000 random restarts itself; it keeps the run with
# the lowest inertia (sum of squared distances to the assigned centre).
clusterer = cluster.KMeans(n_clusters=K, n_init=1000)
labels = clusterer.fit_predict(points)
best_c = labels
# Inertia averaged over the clusters.
best_m = clusterer.inertia_ / K


# One random RGB colour per cluster that actually occurs, looked up by label
# so a missing label cannot shift the colour assignment.
unique_labels = np.unique(best_c)
colors = [f'rgb({r},{g},{b})' for r, g, b in np.random.randint(0, 255, size=(len(unique_labels), 3))]
color_of = dict(zip(unique_labels, colors))
point_colors = [color_of[label] for label in best_c]

# Show and save the same figure, coloured by the best clustering. plot_pca2d
# only draws the first two columns of points.
fig = plot_pca2d(points.values, df_fixed.index, point_colors)
fig.show()
fig.write_html('plots/pca2d_clustered_kmeans.html',include_plotlyjs='cdn', full_html=False)