In [None]:
! pip install plotnine


In [None]:
import pandas as pd
from plotnine import *


### Read and preprocess data


In [None]:
# Load the semicolon-separated vote table and normalise two party labels
# to the spellings used as keys elsewhere in this notebook.
df = (
    pd.read_csv("./data/19_data.csv", sep=";")
    .replace("BÜ90/GR", "BÜNDNIS`90/DIE GRÜNEN")
    .replace("DIE LINKE", "DIE LINKE.")
    .drop(columns="Unnamed: 0")  # index col
)
df.head()


Remove the city information from the surname and keep only the first given name


In [None]:
# Keep only the part of the surname before the first "(" and strip the
# whitespace that the split leaves behind (e.g. "Müller (Berlin)" -> "Müller").
# The .str accessor propagates missing values instead of raising on them.
df["Name"] = df["Name"].str.split("(").str[0].str.strip()
# Keep only the first given name (everything before the first space).
df["Vorname"] = df["Vorname"].str.split(" ").str[0]
df.shape


If a member switched parliamentary group (Fraktion) and therefore appears in more than one row, merge those rows into one


In [None]:
# Collapse all rows that belong to the same person into a single row.
# GroupBy.first() returns, per column, the first non-null value of the group,
# which is exactly what "ffill().bfill() + keep first duplicate" produced.
# Unlike groupby.apply, it does not depend on the pandas version: from
# pandas 2.0 on, apply adds the group keys to the index, so the former
# reset_index() collided with the existing "Name"/"Vorname" columns.
member_keys = ["Name", "Vorname"]
df = (
    df.groupby(member_keys, sort=False, as_index=False)  # keep order of first appearance
    .first()
    .reindex(columns=df.columns)  # restore the original column order
)
df.shape


### Pipeline


Allowed values for votes: "Ja" = _yes_, "Nein" = _no_, "Enthaltung" = _abstention_, "Ungültig" = _invalid_, "Abwesend" = _did not vote_; a vote may also be missing (NaN)


Columns containing the votes are named with the following scheme: {Period}-{Session}-{Poll}


In [None]:
# Select every column whose name contains a hyphen.
vote_cols = list(filter(lambda col: "-" in col, df.columns))


In [None]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.manifold import TSNE
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.decomposition import PCA

In [None]:
# Per-vote preprocessing: missing votes are filled with "Abwesend", then every
# vote column is one-hot encoded (categories unseen during fit are ignored).
_vote_steps = [
    ("imputer", SimpleImputer(strategy="constant", fill_value="Abwesend")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
]

# Only the vote columns are transformed; sparse_threshold=0 makes the
# transformer always return a dense array.
preprocess = ColumnTransformer(
    transformers=[("preprocess_vote", Pipeline(steps=_vote_steps), vote_cols)],
    sparse_threshold=0,
)


Use t-SNE for visualization


In [None]:

# t-SNE is stochastic: fix the seed so the 2-D embedding (and the plot built
# from it) is the same on every "Restart & Run All".
tsne = TSNE(n_components=2, random_state=42)
# One-hot encode the votes first, then project every member to two dimensions.
votes_encoded = preprocess.fit_transform(df)
mapping = tsne.fit_transform(votes_encoded)


In [None]:
# Attach a display name and the party to each embedded point.
# The new columns are aligned with the embedding rows by index label.
df_mapping = pd.DataFrame(mapping, columns=["x", "y"]).assign(
    name=df["Vorname"] + " " + df["Name"],
    party=df["Fraktion/Gruppe"],
)
df_mapping.head(20)


In [None]:
# Plot colour for each party label (hex codes, plus one named colour).
colors = dict(
    [
        ("AfD", "#009ee0"),
        ("FDP", "#ffff00"),
        ("CDU/CSU", "#000000"),
        ("SPD", "#E3000F"),
        ("BÜNDNIS`90/DIE GRÜNEN", "#46962b"),
        ("DIE LINKE.", "#BE3075"),
        ("Fraktionslos", "orange"),
    ]
)


In [None]:
# Scatter plot of the 2-D embedding, one point per member, coloured by party.
(
    ggplot(df_mapping, aes(x="x", y="y", label="name"))
    + geom_point(aes(colour="party"))
    + scale_color_manual(values=colors)
)


### Set Labels and Data

In [None]:
# Features: the raw (still categorical) vote columns.
X = df.loc[:, vote_cols]
# Labels: the party / parliamentary group of each member.
y = df.loc[:, "Fraktion/Gruppe"]

### Create classifier pipeline with KNN

In [None]:
# Dimensionality reduction followed by a k-nearest-neighbours classifier.
# random_state only matters when PCA selects a randomized SVD solver; fixing
# it keeps the projection (and the scores that depend on it) reproducible.
classifier = Pipeline(
    steps=[
        ('pca', PCA(random_state=42)),
        ('knn', KNeighborsClassifier())
    ]
)

In [None]:
# Full model: encode the raw votes, then run the PCA + KNN classifier.
# The step names are part of the grid-search parameter keys.
_clf_steps = [
    ("preprocess", preprocess),
    ("classifier", classifier),
]
clf = Pipeline(steps=_clf_steps)

In [None]:
# Hold out a test set (default split size). The seed is fixed so the split,
# and every score computed from it, is reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [None]:
# Hyper-parameter grid; keys follow the nested pipeline step names
# (<outer step>__<inner step>__<parameter>).
params = {
    # k = 2, then the odd values 3..15
    "classifier__knn__n_neighbors": [2, *range(3, 16, 2)],
    # number of principal components: 2..6
    "classifier__pca__n_components": list(range(2, 7)),
}


In [None]:
# Exhaustive search over the parameter grid with 10-fold cross-validation,
# fitted on the training split only.
gs = GridSearchCV(
    estimator=clf,
    param_grid=params,
    cv=10,
).fit(X_train, y_train)

In [None]:
# Mean cross-validated score of the best parameter combination, as a percentage.
print(f"{gs.best_score_:.2%}")

In [None]:
# Parameter combination that achieved the best mean cross-validated score.
gs.best_params_