# Wer verdient 50k?

In [None]:
# Arbeitsbibliotheken
import numpy as np
import pandas as pd
import time
# Visualisierungsbibliotheken
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Column names copied by hand from the dataset documentation; they are passed
# to read_csv via `names=` so both frames share the same columns.
heads = [
    "age", "workclass", "fnlwgt", "education", "education-num",
    "marital-status", "occupation", "relationship", "race", "sex",
    "capital-gain", "capital-loss", "hours-per-week", "native-country",
    "target",
]
# Download the data (training and test split).
censusdatatrain = pd.read_csv(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data",
    names=heads,
)
# header=0 makes pandas consume the first line of the test file as a header
# and replace it with `heads` (presumably that line is not a data record).
censusdatatest = pd.read_csv(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test",
    names=heads,
    header=0,
)
# Store both splits locally for the machine-learning algorithms.
censusdatatrain.to_csv("adult.data")
censusdatatest.to_csv("adult.test")
# Merge both splits for the descriptive analysis.
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0;
# pd.concat with ignore_index=True yields the same combined frame.
censusdata = pd.concat([censusdatatrain, censusdatatest], ignore_index=True)

In [None]:
# Recode the variables into coarser groups.
# The category labels matched below all start with a space (e.g. " Masters"),
# so the lookup strings must keep it.
# Workclass
censusdata["workclass"] = censusdata["workclass"].replace(
    to_replace=[" Self-emp-not-inc", " Self-emp-inc"], value="Selfemp")
censusdata["workclass"] = censusdata["workclass"].replace(
    to_replace=[" Local-gov", " State-gov", " Federal-gov"], value="Goverm")
censusdata["workclass"] = censusdata["workclass"].replace(
    to_replace=[" ?", " Without-pay", " Never-worked"], value="Residualwc")
# Education
censusdata["education"] = censusdata["education"].replace(
    to_replace=[" Masters", " Doctorate"], value="higher_academic")
censusdata["education"] = censusdata["education"].replace(
    to_replace=[" Assoc-voc", " Assoc-acdm", " Prof-school"], value="Other")
censusdata["education"] = censusdata["education"].replace(
    to_replace=[" Preschool", " 1st-4th", " 5th-6th", " 7th-8th", " 9th",
                " 10th", " 11th", " 12th"],
    value="Dropout")
# Native country: collapse to a binary US / not-US indicator.
us_labels = [" United-States", " Outlying-US(Guam-USVI-etc)"]
censusdata["native-country"] = np.where(
    censusdata["native-country"].isin(us_labels), "US", "not_US")
# Marital status
censusdata["marital-status"] = censusdata["marital-status"].replace(
    to_replace=[" Married-civ-spouse", " Married-spouse-absent", " Married-AF-spouse"],
    value="married")
censusdata["marital-status"] = censusdata["marital-status"].replace(
    to_replace=[" Divorced", " Separated", " Widowed"], value="separated")
# Race
censusdata["race"] = censusdata["race"].replace(
    to_replace=[" Other", " Amer-Indian-Eskimo", " Asian-Pac-Islander"],
    value="other_race")
# Occupation
censusdata["occupation"] = censusdata["occupation"].replace(
    to_replace=[" Armed-Forces", " Protective-serv"], value="Security")
censusdata["occupation"] = censusdata["occupation"].replace(
    to_replace=[" Other-service", " Priv-house-serv", " ?"], value="Other_Services")

# Binary "at or above median" indicators (1 = value >= threshold, else 0).
# Each threshold is computed once up front; recomputing the median inside a
# per-row comprehension would make these steps quadratic in the row count.
# Capital gain: threshold is the median of the strictly positive values.
capital_gain = censusdata["capital-gain"]
gain_threshold = capital_gain[capital_gain > 0].median()
censusdata["oacgain"] = (capital_gain >= gain_threshold).astype(int)
# Capital loss: threshold is the median of the strictly positive values.
capital_loss = censusdata["capital-loss"]
loss_threshold = capital_loss[capital_loss > 0].median()
censusdata["oacloss"] = (capital_loss >= loss_threshold).astype(int)
# Age: threshold is the median over all rows.
age = censusdata["age"]
censusdata["oaage"] = (age >= age.median()).astype(int)
# Hours per week: threshold is the median of the strictly positive values.
hours_per_week = censusdata["hours-per-week"]
hours_threshold = hours_per_week[hours_per_week > 0].median()
censusdata["oahpw"] = (hours_per_week >= hours_threshold).astype(int)

# Target: map the labels without a trailing dot onto the dotted spelling so
# each income class has a single label.
censusdata["target"] = censusdata["target"].replace(" >50K", " >50K.").replace(" <=50K", " <=50K.")

In [None]:
# Baseline over all data: share (in percent) of the most frequent target class.
target_labels = censusdata["target"].replace({" >50K": " >50K.", " <=50K": " <=50K."})
majority_count = target_labels.value_counts().max()
print("Baseline aller Daten:", majority_count / len(censusdata) * 100)

In [None]:
# Count plots of each variable, split by income class.
# Each plot is saved to the file name paired with its variable, then shown.
headskk = ["workclass", "education", "occupation"]
headskb = ["race", "sex", "native-country", "marital-status", "oacgain", "oacloss", "oahpw"]
figure_list1 = ["wc.png", "edu.png", "occ.png"]
figure_list2 = ["race.png", "sex.png", "nc.png", "ms.png", "cgain.png", "closs.png", "hpw.png"]
# (variables, output files, figure size): the first group is drawn on a wider
# canvas than the second.
plot_groups = (
    (headskk, figure_list1, (20, 5)),
    (headskb, figure_list2, (10, 5)),
)
for columns, filenames, size in plot_groups:
    for column, filename in zip(columns, filenames):
        plt.figure(figsize=size)
        sns.countplot(x=censusdata[column], hue=censusdata["target"])
        # Save before show().
        plt.savefig(filename)
        plt.show()

In [None]:
# Cross tabulations of each variable against income, shown as row-wise shares
# (each row sums to 1).
for column in headskk + headskb:
    counts = pd.crosstab(index=censusdata[column], columns=censusdata["target"], margins=False)
    row_totals = counts.sum(axis=1)
    display(counts.div(row_totals, axis=0))