In [None]:
! pip install plotnine


In [None]:
import pandas as pd
from plotnine import *


### Read and preprocess data


In [None]:
# Load the semicolon-separated vote table, rename two group labels to the
# spelling used by the colour mapping later on, and drop the unnamed column
# that holds the exported index.
df = (
    pd.read_csv("./data/20_data.csv", sep=";")
    .replace("BÜ90/GR", "BÜNDNIS`90/DIE GRÜNEN")
    .replace("DIE LINKE", "DIE LINKE.")
    .drop(columns="Unnamed: 0")  # index col
)
df.head()


Remove the city information from the surname and keep only the first given name.


In [None]:
# Drop the parenthesised suffix (city information) from the surname. The
# whitespace that preceded "(" is stripped as well, so the same surname
# compares equal with and without a suffix when rows are grouped later.
# The vectorised .str accessor also passes missing values through instead
# of raising like a plain Python str.split would.
df["Name"] = df["Name"].str.split("(").str[0].str.strip()

# Keep only the first given name; strip first so leading whitespace does not
# yield an empty first token.
df["Vorname"] = df["Vorname"].str.strip().str.split(" ").str[0]
df.shape


If a member switched parliamentary group (Fraktion) and therefore appears in several rows, merge those rows into one.


In [None]:
# Collapse all rows of the same person (same surname + first given name) into
# a single row. GroupBy.first() takes, per column, the first non-null value in
# the group, which is what forward/backward filling followed by keeping the
# first row produces -- but without depending on how groupby.apply() lays out
# the index, which differs between pandas versions.
# sort=False keeps people in order of first appearance, as_index=False keeps
# the key columns as regular columns with a fresh 0..n-1 index, and the
# reindex restores the original column order.
df = (
    df.groupby(["Name", "Vorname"], sort=False, as_index=False)
    .first()
    .reindex(columns=df.columns)
)
df.shape


### Pipeline


Allowed values for votes: "Ja" = _yes_, "Nein" = _no_, "Enthaltung" = _abstention_, "Ungültig" = _invalid_, "Abwesend" = _absent (did not vote)_. Missing values (NaN) are treated as "Abwesend".


Columns containing the votes are named with the following scheme: {Period}-{Session}-{Poll}


In [None]:
# Select the vote columns: they are the ones whose name contains a hyphen.
vote_cols = list(filter(lambda column_name: "-" in column_name, df.columns))


In [None]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.manifold import TSNE


In [None]:
# Encoding applied to every vote column: fill missing votes with the
# constant "Abwesend", then one-hot encode the categorical vote values.
vote_encoding = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="constant", fill_value="Abwesend")),
        ("onehot", OneHotEncoder()),
    ]
)

# Only the vote columns are passed through the transformer;
# sparse_threshold=0 requests a dense result.
preprocess = ColumnTransformer(
    transformers=[("preprocess_vote", vote_encoding, vote_cols)],
    sparse_threshold=0,
)
df_preprocessed = preprocess.fit_transform(df)


Use t-SNE for visualization


In [None]:

# Project the one-hot encoded votes to two dimensions. t-SNE is stochastic,
# so the random state is fixed to make the embedding (and the plot built
# from it) reproducible across runs.
tsne = TSNE(n_components=2, random_state=42)
mapping = tsne.fit_transform(df_preprocessed)


In [None]:
# Combine the 2-D embedding with a display name ("<Vorname> <Name>") and the
# parliamentary group of each member; the columns from df are aligned to the
# embedding rows by index.
df_mapping = pd.DataFrame(mapping, columns=["x", "y"]).assign(
    name=df["Vorname"] + " " + df["Name"],
    party=df["Fraktion/Gruppe"],
)
df_mapping.head(20)


In [None]:
# Hex colour for each parliamentary group label, used as the manual colour
# scale of the scatter plot.
colors = {
    "AfD": "#009ee0",
    "FDP": "#ffff00",
    "CDU/CSU": "#000000",
    "SPD": "#E3000F",
    "BÜNDNIS`90/DIE GRÜNEN": "#46962b",
    "DIE LINKE.": "#BE3075",
    "Fraktionslos": "#333333",
}


In [None]:
# Scatter plot of the embedding: one point per member, coloured by
# parliamentary group using the manual colour mapping.
(
    ggplot(df_mapping, aes(x="x", y="y", label="name"))
    + geom_point(aes(colour="party"))
    + scale_color_manual(values=colors)
)
