In [75]:
import random
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
import scipy.stats as stats
colormap = {'A': '#A82721', 
            'O': '#EAC73E', 
            'V': '#254264', 
            'Ø': '#E6801A', 
            'I': '#3FB2BE', 
            'Å': '#2B8738', 
            'B': '#733280', 
            'F': '#E07EA8', 
            'C': '#96B226', 
            'D': '#127B7F', 
            'K': '#8B8474', 
            'Q': '#C49414', 
            'M': '#832B93', 
            'Æ': '#2C5877'}


In [59]:
# df = pd.read_csv('../data/kandidater_data.csv').dropna()
# df = df.assign(new_column=df["Gender"].map({"M": 0, "F": 2.0}))
# df.rename(columns = {"new_column":"26"}, inplace = True)
# df['age'] = 2022 - df.Birthdate.str[:4].astype(int)

In [108]:
df = pd.read_csv('../data/kandidater_data.csv').dropna()
df = df.assign(
    new_column=df["Gender"].map(
        lambda x: 0 if x == "male" else random.choices([2, 0], weights=[0.5, 0.5])[0]
    )
)
df.rename(columns = {"new_column":"26"}, inplace = True)
df['age'] = 2022 - df.Birthdate.str[:4].astype(int)

In [109]:
clf = LogisticRegression()

ct = ColumnTransformer([
    ('scaler', MinMaxScaler(), ['age']),
    ('one hot', OneHotEncoder(), ['CurrentPartyCode', 'Profession', 'Education'])
])

pipe = Pipeline([
    ('column transformer', ct), 
    ('logistic reg', clf)
])


In [110]:
y = df.Gender.map({'M': 1, 'F': 0}).to_numpy()

In [111]:
pipe.fit(df, y)
pipe.predict_proba(df)

array([[0.64619073, 0.35380927],
       [0.12049453, 0.87950547],
       [0.31236972, 0.68763028],
       ...,
       [0.56692261, 0.43307739],
       [0.38244522, 0.61755478],
       [0.32679194, 0.67320806]])

In [112]:
propensity = pipe.predict_proba(df)[:, -1]
df['propensity'] = propensity

In [113]:
knn = NearestNeighbors(n_neighbors=1)
male = df[df.Gender=='M']
female = df[df.Gender=='F']

x = ct.transform(male.iloc[:, :-1])
knn.fit(x)

NearestNeighbors(n_neighbors=1)

In [114]:
x = ct.transform(female.iloc[:, :-1])
neigh_dist, neigh_ind = knn.kneighbors(x)

In [115]:
s1 = female
s2 = male.iloc[neigh_ind.reshape(-1)]

In [116]:
pca = PCA(n_components=2)
pca.fit(df[[str(i) for i in range(1, 27)]])

xs1 = pca.transform(s1[[str(i) for i in range(1, 27)]])
xs2 = pca.transform(s2[[str(i) for i in range(1, 27)]])

In [118]:
a, p_values = stats.ttest_rel(s1[[str(i) for i in range(1, 27)]], s2[[str(i) for i in range(1, 27)]])

In [119]:
np.set_printoptions(suppress=True)
np.where(p_values < 0.002)

(array([11, 15], dtype=int64),)