# Version 1 mit metrischen und kategorialen Variablen

Import der benutzten Bibliotheken

In [None]:
# Arbeitsbibliotheken
import numpy as np
import pandas as pd
import time
# Visualisierungsbibliotheken
import matplotlib.pyplot as plt
import seaborn as sns
# Preprocessing/Evaluation
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingGridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.metrics import plot_roc_curve
# Machine Learning Classifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.linear_model import LogisticRegressionCV
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neural_network import MLPClassifier
# Ensemble Learning Classifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import BaggingClassifier
# Feature Importance
from sklearn.inspection import permutation_importance

In [None]:
# Load the train and test sets that were saved during data preparation;
# the first CSV column holds the row index.
censusdatatrain, censusdatatest = (
    pd.read_csv(csv_path, index_col=0) for csv_path in ("adult.data", "adult.test")
)

Preprocessing

In [None]:
# Train/test split
# The data set already ships with a predefined split, so train_test_split is not used.
# Training data: features and binary target (1 for " >50K", 0 for " <=50K")
y_train = censusdatatrain["target"].replace({" >50K": 1, " <=50K": 0})
X_train = censusdatatrain.drop("target", axis=1)
# Test data: the labels in the test file carry a trailing "."
y_test = censusdatatest["target"].replace({" >50K.": 1, " <=50K.": 0})
X_test = censusdatatest.drop("target", axis=1)

In [None]:
# Preprocessing of the training and test features.
# Both frames receive exactly the same recoding so that they end up with the
# same feature columns in the same order.
for df in [X_train, X_test]:
    # workclass: merge the detailed levels into three broader groups
    df["workclass"] = df["workclass"].replace(to_replace=[" Self-emp-not-inc", " Self-emp-inc"], value="Selfemp")
    df["workclass"] = df["workclass"].replace(to_replace=[" Local-gov", " State-gov", " Federal-gov"], value="Goverm")
    df["workclass"] = df["workclass"].replace(to_replace=[" ?", " Without-pay", " Never-worked"], value="Residualwc")
    # native-country: binary flag, 1 for the United States (incl. outlying areas)
    df["native-country"] = [1 if x == " United-States" or x == " Outlying-US(Guam-USVI-etc)" else 0 for x in df["native-country"]]
    # marital-status: married vs. formerly married (never-married stays as is)
    df["marital-status"] = df["marital-status"].replace(to_replace=[" Married-civ-spouse", " Married-spouse-absent", " Married-AF-spouse"], value="married")
    # The data spells the level " Separated"; the previously used " Seperated"
    # never matched, so that level was not merged. The old spelling is kept in
    # the list only as a harmless fallback.
    df["marital-status"] = df["marital-status"].replace(to_replace=[" Divorced", " Separated", " Seperated", " Widowed"], value="seperated")
    # race: pool the small groups
    df["race"] = df["race"].replace(to_replace=[" Other", " Amer-Indian-Eskimo", " Asian-Pac-Islander"], value="other_race")
    # occupation: pool security-related and residual service jobs
    df["occupation"] = df["occupation"].replace(to_replace=[" Armed-Forces", " Protective-serv"], value="Security")
    df["occupation"] = df["occupation"].replace(to_replace=[" Other-service", " Priv-house-serv", " ?"], value="Other_Services")
    # sex: 0 for male, 1 otherwise
    df["sex"] = [0 if x == " Male" else 1 for x in df["sex"]]
    # Drop columns that are not used as features
    df.drop(["fnlwgt", "education", "relationship"], axis=1, inplace=True)

# Dummy variables: encode the categorical columns in ONE shared order for both
# frames. Encoding them in different orders (as before) yields differently
# ordered columns, which misaligns the features for the scaler and the models.
for column in ["workclass", "occupation", "marital-status", "race"]:
    X_train = pd.concat([X_train.drop(column, axis=1), pd.get_dummies(X_train[column])], axis=1)
    X_test = pd.concat([X_test.drop(column, axis=1), pd.get_dummies(X_test[column])], axis=1)

# Drop one reference level per encoded variable to avoid the dummy-variable trap
for df in [X_train, X_test]:
    df.drop(["Residualwc", "Other_Services", " Never-married", " White"], axis=1, inplace=True)

# Guarantee identical columns and column order: a level missing from the test
# data becomes an all-zero column, a level unseen in training is discarded.
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

In [None]:
# Correlation tables and heatmaps used to check for multicollinearity
pd.set_option('display.max_columns', None)  # show every column of the tables
correlation_tables = [frame.corr() for frame in (X_train, X_test)]
# First the numeric tables for train and test ...
for table in correlation_tables:
    display(table)
# ... then one heatmap figure per data set
for table in correlation_tables:
    plt.figure(figsize=(10, 10))
    sns.heatmap(table)

In [None]:
# Preprocessing: min-max normalisation of the features
# Keep the column labels, because the scaler returns plain arrays.
features = X_train.columns
# The scaling range is learned from the training data only and then applied
# unchanged to both data sets.
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

Machine-Learning-Algorithmen

In [None]:
# Model building: classical machine-learning classifiers.
# Every model is tuned (where a grid is given) with a successive-halving grid
# search on F1, then evaluated with cross-validated accuracy on the training
# data and a classification report on the test data.


def _report(model, started, cv=10):
    """Print the evaluation summary of a fitted classifier.

    Reads the module-level X_train, y_train, X_test and y_test.

    Args:
        model: Fitted classifier; cross_val_score clones and refits it per fold.
        started: time.time() value taken when work on this model began.
        cv: Number of cross-validation folds for the accuracy estimate.

    Returns:
        The model's predictions for X_test.
    """
    accuracy = cross_val_score(model, X_train, y_train, scoring="accuracy", cv=cv, n_jobs=-1).mean()
    print(f'{"Akkuranz: "}{accuracy:.4f}')
    print("Modellparameter:", model.get_params())
    predictions = model.predict(X_test)
    print(classification_report(y_test, predictions))
    print(f'{"Berechnungszeit: "}{time.time()-started:.2f}{" Sekunden"}')
    return predictions


# Decision tree
print("Decision Tree")
start_time = time.time()
dtc = DecisionTreeClassifier()
tree_para = {'criterion': ['gini', 'entropy'], 'max_depth': list(range(2, 23)), 'min_samples_split': list(range(2, 23))}
grd_clf = HalvingGridSearchCV(dtc, tree_para, scoring="f1", cv=5, n_jobs=-1)
grd_clf.fit(X_train, y_train)
# best_estimator_ is already refit on the full training set (refit=True is the
# default), so no second fit is needed.
model_with_best_tree_parameters = grd_clf.best_estimator_
btmodel = model_with_best_tree_parameters
y_predict = _report(btmodel, start_time)

# Gaussian naive Bayes (no tuning; 5-fold accuracy as before)
print("Gaussian Naive Bayes")
start_time = time.time()
gnb = GaussianNB()
gnbmodel = gnb.fit(X_train, y_train)
y_predict = _report(gnbmodel, start_time, cv=5)

# Support vector machine
print("Support Vector Mashine")
start_time = time.time()
svm = SVC()
svm_para = {'kernel': ['linear', 'poly', 'rbf', 'sigmoid']}
grd_clf = HalvingGridSearchCV(svm, svm_para, scoring="f1", cv=5, n_jobs=-1)
grd_clf.fit(X_train, y_train)
model_with_best_svm_parameters = grd_clf.best_estimator_
bsvmmodel = model_with_best_svm_parameters
y_predict = _report(bsvmmodel, start_time)

# K-nearest neighbours
print("KNeighbor Classifier")
start_time = time.time()
knc = KNeighborsClassifier()
# Draw the random candidates from a seeded generator so the grid is the same on
# every run, and remove duplicates so no candidate is evaluated twice.
rng = np.random.RandomState(42)
k_para = {
    "weights": ['uniform', 'distance'],
    "leaf_size": np.unique(rng.randint(2, 80, 15)).tolist(),
    "n_neighbors": np.unique(rng.randint(2, 40, 5)).tolist(),
}
k_grd_clf = HalvingGridSearchCV(knc, k_para, scoring="f1", cv=5, n_jobs=-1)
k_grd_clf.fit(X_train, y_train)
model_with_best_k_parameters = k_grd_clf.best_estimator_
bkmodel = model_with_best_k_parameters
y_predict = _report(bkmodel, start_time)

# Ridge classifier
print("Ridge Classifier")
start_time = time.time()
rc = RidgeClassifier()
rcmodel = rc.fit(X_train, y_train)
y_predict = _report(rcmodel, start_time)

# Logistic regression (regularisation strength chosen by internal 5-fold CV)
print("Logistic Regression Classifier")
start_time = time.time()
lr = LogisticRegressionCV(cv=5)
lrmodel = lr.fit(X_train, y_train)
y_predict = _report(lrmodel, start_time)

# Linear discriminant analysis
print("Linear Discriminant Classifier")
start_time = time.time()
lda = LinearDiscriminantAnalysis()
ldamodel = lda.fit(X_train, y_train)
y_predict = _report(ldamodel, start_time)

# Multi-layer perceptron
print("MLP Classifier")
start_time = time.time()
mlp = MLPClassifier()
mlp_para = {
    "max_iter": [300, 400],
    "learning_rate_init": [0.01, 0.001, 0.0001],
    "hidden_layer_sizes": [(100,), (50,), (200,), (50, 25), (100, 50, 25), (50, 100, 50, 25)],
}
mlp_grd_clf = HalvingGridSearchCV(mlp, mlp_para, scoring="f1", cv=5, n_jobs=-1)
mlp_grd_clf.fit(X_train, y_train)
model_with_best_mlp_parameters = mlp_grd_clf.best_estimator_
bmlpmodel = model_with_best_mlp_parameters
y_predict = _report(bmlpmodel, start_time)

In [None]:
# ROC comparison: the test-set curves of all classical models share one axes
ax = plt.gca()
for fitted_model in (btmodel, gnbmodel, bsvmmodel, bkmodel,
                     rcmodel, lrmodel, ldamodel, bmlpmodel):
    plot_roc_curve(fitted_model, X_test, y_test, ax=ax)
# Save before showing so the written file contains the finished figure
plt.savefig("roc_model.png")
plt.show(block=False)

In [None]:
# Permutation feature importance of the classical models on the training data
model_list = [btmodel, gnbmodel, bsvmmodel, bkmodel, rcmodel, lrmodel, ldamodel, bmlpmodel]
file_list = ["bt.png", "gnb.png", "bsv.png", "bk.png", "rc.png", "lr.png", "lda.png", "bml.png"]
for model, file in zip(model_list, file_list):
    start_time = time.time()
    result = permutation_importance(model, X_train, y_train, n_jobs=-1)
    model_importances = pd.Series(result.importances_mean, index=features)
    # Draw every model on its own figure. Plotting on the implicit current axes
    # stacks the bars of successive models whenever the previous figure is
    # still open (plt.show(block=False) does not close it on every backend).
    fig, ax = plt.subplots()
    model_importances.plot.bar(yerr=result.importances_std, ax=ax)
    print(model)
    print(f'{"Berechnungszeit: "}{time.time()-start_time:.2f}{" Sekunden"}')
    fig.savefig(file)
    plt.show(block=False)

In [None]:
# Permutation feature importance of the classical models on the test data
model_list = [btmodel, gnbmodel, bsvmmodel, bkmodel, rcmodel, lrmodel, ldamodel, bmlpmodel]
file_list = ["btb.png", "gnbb.png", "bsvb.png", "bkb.png", "rcb.png", "lrb.png", "ldab.png", "bmlb.png"]
for model, file in zip(model_list, file_list):
    start_time = time.time()
    result = permutation_importance(model, X_test, y_test, n_jobs=-1)
    model_importances = pd.Series(result.importances_mean, index=features)
    # Draw every model on its own figure. Plotting on the implicit current axes
    # stacks the bars of successive models whenever the previous figure is
    # still open (plt.show(block=False) does not close it on every backend).
    fig, ax = plt.subplots()
    model_importances.plot.bar(yerr=result.importances_std, ax=ax)
    print(model)
    print(f'{"Berechnungszeit: "}{time.time()-start_time:.2f}{" Sekunden"}')
    fig.savefig(file)
    plt.show(block=False)

In [None]:
# Ensemble learning: each model is tuned (where a grid is given) with a
# successive-halving grid search on F1, then evaluated with 10-fold
# cross-validated accuracy on the training data and a test-set report.
# Random forest
start_time = time.time()
print("Random Forest")
rfc = RandomForestClassifier()
tree_para = {'criterion': ['gini', 'entropy'], 'n_estimators': [100, 200, 500, 1000]}
# Select on F1 like every other search in this notebook; without scoring= the
# search would fall back to the estimator's default score (accuracy).
grd_clf = HalvingGridSearchCV(rfc, tree_para, scoring="f1", cv=5, n_jobs=-1)
grd_clf.fit(X_train, y_train)
# best_estimator_ is already refit on the full training set (refit=True is the
# default), so no second fit is needed.
model_with_best_rfc_parameters = grd_clf.best_estimator_
brfcmodel = model_with_best_rfc_parameters
print(f'{"Akkuranz: " }{cross_val_score(brfcmodel,X_train,y_train,scoring="accuracy",cv=10, n_jobs=-1).mean():.4f}')
print(brfcmodel.get_params())
y_predict = brfcmodel.predict(X_test)
print(classification_report(y_test, y_predict))
print(f'{"Berechnungszeit: "}{time.time()-start_time:.2f}{" Sekunden"}')
# Gradient boosting
start_time = time.time()
print("Gradient Boosting")
gbc = GradientBoostingClassifier()
gbc_para = {'n_estimators': [50, 100, 500, 1000], 'learning_rate': [0.01, 0.1, 0.2]}
grd_clf = HalvingGridSearchCV(gbc, gbc_para, scoring="f1", cv=5, n_jobs=-1)
grd_clf.fit(X_train, y_train)
model_with_best_gbc_parameters = grd_clf.best_estimator_
bgbcmodel = model_with_best_gbc_parameters
print(f'{"Akkuranz: " }{cross_val_score(bgbcmodel,X_train,y_train,scoring="accuracy",cv=10,n_jobs=-1).mean():.4f}')
print(bgbcmodel.get_params())
y_predict = bgbcmodel.predict(X_test)
print(classification_report(y_test, y_predict))
print(f'{"Berechnungszeit: "}{time.time()-start_time:.2f}{" Sekunden"}')
# Bagging (default settings, no tuning)
start_time = time.time()
print("Bagging Classifier")
bc = BaggingClassifier()
bcmodel = bc.fit(X_train, y_train)
y_predict = bcmodel.predict(X_test)
print(f'{"Akkuranz: " }{cross_val_score(bcmodel,X_train,y_train,scoring="accuracy",cv=10,n_jobs=-1).mean():.4f}')
print(bcmodel.get_params())
print(classification_report(y_test, y_predict))
print(f'{"Berechnungszeit: "}{time.time()-start_time:.2f}{" Sekunden"}')

In [None]:
# ROC comparison: the test-set curves of the ensemble models share one axes
ax = plt.gca()
for fitted_model in (brfcmodel, bgbcmodel, bcmodel):
    plot_roc_curve(fitted_model, X_test, y_test, ax=ax)
# Save before showing so the written file contains the finished figure
plt.savefig("roc_emodel.png")
plt.show(block=False)

In [None]:
# Permutation feature importance of the ensemble models on the training data
emodel_list = [brfcmodel, bgbcmodel, bcmodel]
file_list = ["brf.png", "bgb.png", "bc.png"]
for model, file in zip(emodel_list, file_list):
    start_time = time.time()
    result = permutation_importance(model, X_train, y_train, n_jobs=-1)
    model_importances = pd.Series(result.importances_mean, index=features)
    # Draw every model on its own figure. Plotting on the implicit current axes
    # stacks the bars of successive models whenever the previous figure is
    # still open (plt.show(block=False) does not close it on every backend).
    fig, ax = plt.subplots()
    model_importances.plot.bar(yerr=result.importances_std, ax=ax)
    print(model)
    print(f'{"Berechnungszeit: "}{time.time()-start_time:.2f}{" Sekunden"}')
    fig.savefig(file)
    plt.show(block=False)

In [None]:
# Permutation feature importance of the ensemble models on the test data
emodel_list = [brfcmodel, bgbcmodel, bcmodel]
file_list = ["brfb.png", "bgbb.png", "bcb.png"]
for model, file in zip(emodel_list, file_list):
    start_time = time.time()
    result = permutation_importance(model, X_test, y_test, n_jobs=-1)
    model_importances = pd.Series(result.importances_mean, index=features)
    # Draw every model on its own figure. Plotting on the implicit current axes
    # stacks the bars of successive models whenever the previous figure is
    # still open (plt.show(block=False) does not close it on every backend).
    fig, ax = plt.subplots()
    model_importances.plot.bar(yerr=result.importances_std, ax=ax)
    print(model)
    print(f'{"Berechnungszeit: "}{time.time()-start_time:.2f}{" Sekunden"}')
    fig.savefig(file)
    plt.show(block=False)