In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, Perceptron
from sklearn.metrics import (
    ConfusionMatrixDisplay,
    RocCurveDisplay,
    auc,
    confusion_matrix,
    roc_auc_score,
    roc_curve,
)
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.tree import DecisionTreeClassifier

In [2]:
# Load the raw data set from the SPSS file into a DataFrame.
data=pd.read_spss("Marie.sav")

In [3]:
len(data)  # number of rows in the raw data set

4169

In [4]:
data.head(3)  # preview the first three rows of the raw data

Unnamed: 0,alter,Quote,Schultyp,fz001,fz002,fz003,fz004,fz005,fz006,fz007,...,ges_wiss_equal,ges_wiss_weight,Zges_cogn_equal,Zges_cogn_weight,Zges_wiss_equal,Zges_wiss_weight,Ztv_rel,ges_equal,ges_weight,sel
0,25.0,Österreich,Gymnasium,0.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.991667,0.991667,2.440918,2.472499,3.627508,3.548649,1.714311,2.842893,2.82714,aufgenommen
1,20.0,EU,Gymnasium,1.0,1.0,1.0,1.0,1.0,1.0,0.0,...,0.866667,0.875,2.432228,2.472499,2.788739,2.777443,1.714311,2.503041,2.518658,aufgenommen
2,19.0,Österreich,Realgymnasium,0.0,1.0,1.0,1.0,1.0,0.0,1.0,...,0.8925,0.875,2.466349,2.472499,2.962085,2.777443,1.283875,2.546396,2.475614,aufgenommen


In [4]:
# Keep only the rows whose "alter" (age) value lies in the closed interval [17, 40].
datanew = data.loc[data["alter"].between(17, 40)]

In [6]:
datanew["alter"]  # inspect the "alter" column after the age filter

0       25.0
1       20.0
2       19.0
3       20.0
4       21.0
        ... 
4164    22.0
4165    19.0
4166    19.0
4167    18.0
4168    27.0
Name: alter, Length: 4147, dtype: float64

In [5]:
# Reduce the frame to the columns used below: "alter", "Quote", "Schultyp",
# the *_score columns and the outcome column "sel".
datanew = datanew.loc[:, [
    "alter", "Quote", "Schultyp",
    "fz_score", "gm_score", "md_score", "bi_score",
    "ch_score", "ph_score", "ma_score", "tv_score",
    "sel",
]]

In [8]:
datanew["sel"].value_counts()  # how many rows are "aufgenommen" (admitted) vs. "nicht aufgenommen"

nicht aufgenommen    3488
aufgenommen           659
Name: sel, dtype: int64

In [9]:
datanew.isna().sum()  # number of missing values per column

alter        0
Quote        0
Schultyp     0
fz_score     0
gm_score     0
md_score     0
bi_score    11
ch_score    11
ph_score    11
ma_score    11
tv_score    11
sel          0
dtype: int64

In [10]:
datanew[datanew.isna().any(axis=1)]  # list the rows that contain at least one missing value

Unnamed: 0,alter,Quote,Schultyp,fz_score,gm_score,md_score,bi_score,ch_score,ph_score,ma_score,tv_score,sel
3786,18.0,Österreich,Gymnasium,18.0,13.0,4.0,,,,,,nicht aufgenommen
3879,19.0,Österreich,H.techn.u.gewerbl. Lehranstalt,16.0,7.0,8.0,,,,,,nicht aufgenommen
4032,24.0,EU,Gymnasium,15.0,6.0,10.0,,,,,,nicht aufgenommen
4135,23.0,Österreich,Gymnasium,9.0,8.0,6.0,,,,,,nicht aufgenommen
4143,19.0,Österreich,Neusprachliches Gymnasium,12.0,11.0,6.0,,,,,,nicht aufgenommen
4158,35.0,,99.0,4.0,8.0,7.0,,,,,,nicht aufgenommen
4162,20.0,EU,Gymnasium,12.0,4.0,4.0,,,,,,nicht aufgenommen
4164,22.0,Österreich,Gymnasium,9.0,4.0,5.0,,,,,,nicht aufgenommen
4165,19.0,EU,Gymnasium,6.0,9.0,4.0,,,,,,nicht aufgenommen
4166,19.0,Österreich,Naturwissensch. Realgymnasium,11.0,5.0,4.0,,,,,,nicht aufgenommen


In [11]:
datanew.dropna().isna().sum()  # check (without modifying datanew) that dropna() leaves no missing values

alter       0
Quote       0
Schultyp    0
fz_score    0
gm_score    0
md_score    0
bi_score    0
ch_score    0
ph_score    0
ma_score    0
tv_score    0
sel         0
dtype: int64

In [6]:
datanew=datanew.dropna()  # drop every row that has a missing value

In [7]:
data_mod=datanew.loc[datanew["Quote"]!=""]  # keep only rows whose "Quote" entry is not an empty string

In [14]:
len(data_mod)  # rows left after dropping missing values and empty "Quote" entries

4133

In [15]:
sel = data_mod["sel"]=="aufgenommen"  # boolean target: True where "sel" equals "aufgenommen" (admitted)

In [8]:
# Predictor columns used for modelling ("Schultyp" and the target "sel" are left out).
x=data_mod[["alter","Quote","fz_score", "gm_score", "md_score", 
         "bi_score", "ch_score", "ph_score", "ma_score", "tv_score"]]

In [22]:
x.head(5)  # preview of the predictor frame

Unnamed: 0,alter,Quote,fz_score,gm_score,md_score,bi_score,ch_score,ph_score,ma_score,tv_score
0,25.0,Österreich,21.0,19.0,11.0,50.0,29.0,20.0,20.0,24.0
1,20.0,EU,19.0,17.0,12.0,45.0,26.0,16.0,18.0,24.0
2,19.0,Österreich,18.0,19.0,12.0,41.0,27.0,18.0,19.0,22.0
3,20.0,Österreich,21.0,19.0,13.0,41.0,22.0,11.0,18.0,23.0
4,21.0,Österreich,21.0,19.0,12.0,44.0,24.0,18.0,19.0,23.0


In [9]:
# Split the predictors by dtype: object columns are treated as categorical,
# int64/float64 columns as numeric.
cat_cols = x.select_dtypes("object").columns
num_cols = x.select_dtypes(["int64", "float64"]).columns

cat_transformed = pd.get_dummies(x[cat_cols])  # one-hot encode the categorical column(s)
# The numeric columns are passed through unscaled. A scaled variant is kept for reference:
# pd.DataFrame(StandardScaler().fit_transform(x[num_cols]), columns=num_cols)
num_transformed = x[num_cols]

### Neu: Explorative Analyse (Korrelationen, Verteilungen, Mittelwerte je Quote)

In [None]:
# Correlation heatmap of the numeric predictors.
# seaborn (sns) is imported in the notebook's import cell.
fig, ax = plt.subplots(figsize=(15,15))
fig.patch.set_facecolor('white')
# Anchor the diverging colormap at 0 and fix its range to [-1, 1] so that the colour
# encodes sign and strength of a correlation independent of the observed min/max.
sns.heatmap(num_transformed.corr(), ax=ax, annot=True, cmap="RdBu",
            vmin=-1, vmax=1, center=0)
ax.set_title("Correlation matrix of the numeric predictors");

In [None]:
# Histogram of every numeric predictor on a 3x3 grid.
fig, axs = plt.subplots(3,3, figsize=(20,20))
fig.patch.set_facecolor('white')
attributes = num_transformed.columns
axes = axs.ravel()

# Pair each axis with one column. No try/except: a plotting error should surface
# instead of being hidden behind a "Done" message.
for ax, attribute in zip(axes, attributes):
    sns.histplot(x=attribute, data=num_transformed, ax=ax)

# Hide grid cells that received no plot (only relevant with fewer than nine columns).
for ax in axes[len(attributes):]:
    ax.set_visible(False)

In [None]:
# Mean of each numeric predictor per "Quote" group, one bar plot per predictor.
fig, axs = plt.subplots(3,3, figsize=(20,20))
fig.patch.set_facecolor('white')
# "Quote" is the grouping variable, so it is excluded from the plotted columns.
attributes = x.columns.drop("Quote")
axes = axs.ravel()

# The grouping column is spelled "Quote" (capital Q); a lower-case name makes
# seaborn raise, so it must match the frame's column label exactly.
for ax, attribute in zip(axes, attributes):
    sns.barplot(x="Quote", y=attribute, data=x, estimator=np.mean, ax=ax)

# Hide grid cells that received no plot (only relevant with fewer than nine columns).
for ax in axes[len(attributes):]:
    ax.set_visible(False)

In [25]:
cat_transformed.head(5)  # preview of the one-hot encoded "Quote" columns

Unnamed: 0,Quote_EU,Quote_nicht EU,Quote_Österreich
0,0,0,1
1,1,0,0
2,0,0,1
3,0,0,1
4,0,0,1


In [10]:
cat_transformed  # one-hot encoded categorical predictors

Unnamed: 0,Quote_EU,Quote_nicht EU,Quote_Österreich
0,0,0,1
1,1,0,0
2,0,0,1
3,0,0,1
4,0,0,1
...,...,...,...
4159,0,1,0
4160,0,0,1
4161,0,1,0
4163,0,0,1


In [11]:
num_transformed  # numeric predictors (unscaled)

Unnamed: 0,alter,fz_score,gm_score,md_score,bi_score,ch_score,ph_score,ma_score,tv_score
0,25.0,21.0,19.0,11.0,50.0,29.0,20.0,20.0,24.0
1,20.0,19.0,17.0,12.0,45.0,26.0,16.0,18.0,24.0
2,19.0,18.0,19.0,12.0,41.0,27.0,18.0,19.0,22.0
3,20.0,21.0,19.0,13.0,41.0,22.0,11.0,18.0,23.0
4,21.0,21.0,19.0,12.0,44.0,24.0,18.0,19.0,23.0
...,...,...,...,...,...,...,...,...,...
4159,28.0,9.0,5.0,2.0,8.0,9.0,6.0,0.0,2.0
4160,35.0,11.0,7.0,3.0,9.0,0.0,0.0,0.0,6.0
4161,28.0,4.0,6.0,1.0,12.0,4.0,1.0,5.0,4.0
4163,19.0,4.0,6.0,3.0,12.0,7.0,3.0,5.0,6.0


In [12]:
# Combine the one-hot encoded and the numeric predictors via an inner join on the row index.
x_transformed = cat_transformed.join(num_transformed, how="inner")

In [13]:
x_transformed  # combined predictors: dummy columns followed by the numeric columns

Unnamed: 0,Quote_EU,Quote_nicht EU,Quote_Österreich,alter,fz_score,gm_score,md_score,bi_score,ch_score,ph_score,ma_score,tv_score
0,0,0,1,25.0,21.0,19.0,11.0,50.0,29.0,20.0,20.0,24.0
1,1,0,0,20.0,19.0,17.0,12.0,45.0,26.0,16.0,18.0,24.0
2,0,0,1,19.0,18.0,19.0,12.0,41.0,27.0,18.0,19.0,22.0
3,0,0,1,20.0,21.0,19.0,13.0,41.0,22.0,11.0,18.0,23.0
4,0,0,1,21.0,21.0,19.0,12.0,44.0,24.0,18.0,19.0,23.0
...,...,...,...,...,...,...,...,...,...,...,...,...
4159,0,1,0,28.0,9.0,5.0,2.0,8.0,9.0,6.0,0.0,2.0
4160,0,0,1,35.0,11.0,7.0,3.0,9.0,0.0,0.0,0.0,6.0
4161,0,1,0,28.0,4.0,6.0,1.0,12.0,4.0,1.0,5.0,4.0
4163,0,0,1,19.0,4.0,6.0,3.0,12.0,7.0,3.0,5.0,6.0


# Training

In [16]:
# Create the training and test sets (80/20, fixed seed). Only the "alter" column
# is used here, reshaped into a single-feature 2-D array.
X_train, X_test, y_train, y_test = train_test_split(
    x_transformed["alter"].to_numpy().reshape(-1, 1),
    sel,
    test_size=0.2,
    random_state=0,
)

In [39]:
# Select the column by position: the frame's column labels are strings, so the
# label lookup x_transformed[4] raises KeyError.
x_transformed.iloc[:, 4].shape

(4133,)

In [41]:
# Same positional selection, reshaped into the (n_samples, 1) layout scikit-learn expects.
# A label lookup with the integer 4 would raise KeyError on the string-labelled frame.
np.array(x_transformed.iloc[:, 4]).reshape(-1,1).shape

(4133, 1)

In [17]:
# Initialise the candidate models. Every estimator that accepts a seed gets
# random_state=0 so that a rerun of the notebook fits identical models.
log_reg = LogisticRegression(solver="liblinear", random_state=0)
rf = RandomForestClassifier(random_state=0)
k_neighbors = KNeighborsClassifier()
decision_tree = DecisionTreeClassifier(random_state=0)

In [25]:
# Hyper-parameter grids, one per model.
log_reg_params = dict(penalty=["l1", "l2"], max_iter=[300])

decision_tree_params = dict(max_depth=[2, 3, 4])

rf_params = dict(
    n_estimators=[50, 100, 150],
    max_depth=range(2, 6),
    min_samples_split=[2, 4, 6],
)

k_neighbors_params = dict(
    n_neighbors=range(3, 30),
    weights=["uniform", "distance"],
)

perceptron_params = dict(penalty=["l1", "l2"])

In [19]:
def evaluate(estimator, params, scoring=None, cv=3):
    """Grid-search ``estimator`` on the current training split and score it on the test split.

    The split is read from the notebook-level variables ``X_train``, ``y_train``,
    ``X_test`` and ``y_test``, so a cell that creates them has to run first.

    Parameters
    ----------
    estimator : scikit-learn classifier
        Unfitted model; it must provide ``predict_proba``.
    params : dict
        Parameter grid, passed to ``GridSearchCV`` as ``param_grid``.
    scoring : str or callable, optional
        Model-selection metric forwarded to ``GridSearchCV`` (e.g. ``"roc_auc"``).
        ``None`` (default) uses the estimator's own ``score`` method.
    cv : int, default 3
        Cross-validation setting forwarded to ``GridSearchCV``.

    Returns
    -------
    search : GridSearchCV
        The fitted grid search.
    fpr, tpr : ndarray
        False and true positive rates of the ROC curve on the test split.
    """
    search = GridSearchCV(estimator=estimator,
                          cv=cv,
                          param_grid=params,
                          scoring=scoring)
    search.fit(X_train, y_train)

    print(search.score(X_test, y_test))

    # Column 1 of predict_proba belongs to the second entry of classes_; with the
    # boolean target used in this notebook that is the True (positive) class.
    prob_one = search.predict_proba(X_test)[:, 1]

    # False positive and true positive rates; the thresholds are not needed.
    fpr, tpr, _ = roc_curve(y_test, prob_one)

    return search, fpr, tpr

In [None]:
# Repeat the logistic-regression grid search on ten stratified 60/40 splits and
# print the fitted coefficients of each run (presumably to judge their stability).
# Each split is seeded with the loop index so that a rerun reproduces the same splits.
# NOTE: the split variables are notebook-level; later cells see the split of the last iteration.
for i in range(10):
    X_train, X_test, y_train, y_test = train_test_split(
        x_transformed, sel, test_size=0.4, stratify=sel, random_state=i
    )
    cv,_,_ = evaluate(log_reg, log_reg_params)
    print(cv.best_estimator_.coef_)

In [None]:
# Confusion matrix of the most recently fitted grid search (`cv`) on the current test split.
# ConfusionMatrixDisplay is imported in the notebook's import cell.
fig, axs = plt.subplots(figsize=(10,10))
fig.patch.set_facecolor("grey")

# The target is boolean (True = "aufgenommen"), so `labels` has to contain booleans;
# string labels match no value in y_test and make scikit-learn raise a ValueError.
# The readable names go into `display_labels`, and `ax` draws into the figure
# created above instead of a second, default-sized one.
ConfusionMatrixDisplay.from_estimator(
    cv.best_estimator_,
    X_test,
    y_test,
    labels=[True, False],
    display_labels=["aufgenommen", "nicht aufgenommen"],
    ax=axs,
);

In [35]:
x_transformed.head(2)  # reminder of the column layout of the combined predictors

Unnamed: 0,Quote_EU,Quote_nicht EU,Quote_Österreich,alter,fz_score,gm_score,md_score,bi_score,ch_score,ph_score,ma_score,tv_score
0,0,0,1,25.0,21.0,19.0,11.0,50.0,29.0,20.0,20.0,24.0
1,1,0,0,20.0,19.0,17.0,12.0,45.0,26.0,16.0,18.0,24.0


In [37]:
data_mod["Quote"].value_counts()  # number of rows per "Quote" group

Österreich    2744
EU            1285
nicht EU       104
Name: Quote, dtype: int64

In [68]:
sum(sel)/len(sel)  # share of True ("aufgenommen") entries in the target

0.15944834260827487

In [None]:
pd.DataFrame(cv.cv_results_)  # cross-validation results of the grid search currently stored in `cv`

In [None]:
# Grid-search every candidate model on the current train/test split and draw its
# test-set ROC curve in a figure of its own.
eval_params = [(rf, rf_params),
               (k_neighbors, k_neighbors_params),
               (decision_tree, decision_tree_params),
               (log_reg, log_reg_params)]

for est, params in eval_params:
    cv, fpr, tpr = evaluate(est, params)
    roc_auc = auc(fpr, tpr)

    # One figure per model. No shared 1x4 grid is created up front, because it
    # would never be drawn into and would only render as an empty figure.
    fig, ax = plt.subplots(figsize=(10,10))
    fig.patch.set_facecolor('white')

    ax.plot(fpr, tpr)
    ax.set_xlabel("False Positive Rate")
    ax.set_ylabel("True Positive Rate")
    ax.title.set_text("Model:" + str(est).split("(")[0] + " AUC: " + str(round(roc_auc, 3)))

    print("Best Parameters", cv.best_estimator_)
    print("Roc-Auc Score", roc_auc)