In [55]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report
from imblearn.over_sampling import RandomOverSampler
from sklearn.metrics import accuracy_score

In [56]:
cols = ["age","workclass","fnwlgt","education","educationnum","maritalstatus","occupation","raltionship","race","sex","capital-gain","capital-loss","hours-per-week","native-country","income","class"]

df = pd.read_csv("income.csv",names=cols)
if df["income"].equals(">50K"):
    df["class"] = 0
else:
    df["class"] = 1
df.head()

Unnamed: 0,age,workclass,fnwlgt,education,educationnum,maritalstatus,occupation,raltionship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income,class
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K,1
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K,1
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K,1
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K,1
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K,1


### Draw Graph

In [57]:
def drawGraph():
    for i in cols[:-1]:
        plt.hist(df[df["class"] == 1][i],color='blue',label="<=50K",alpha=0.7,density=True)
        plt.title(i)
        plt.xlabel(i)
        plt.ylabel("Distribution")
        plt.legend()
        plt.show()

### Train, Validation, test Datasets

In [58]:
from sklearn.discriminant_analysis import StandardScaler


def scale_dataset(dataframe, oversample=False):
  X = dataframe[dataframe.columns[:-1]].values
  y = dataframe[dataframe.columns[-1]].values

  scaler = StandardScaler()
  X = scaler.fit_transform(X)

  if oversample:
    ros = RandomOverSampler()
    X, y = ros.fit_resample(X, y)

  data = np.hstack((X, np.reshape(y, (-1, 1))))

  return data, X, y

### Definition of methods to get all different options

In [59]:

def getAllWorkclasses():
    names = []
    workclasses = df["workclass"]

    for i in workclasses:
        if not i in names:
            names.append(i)
    return names

def getAllEducation():
    names = []
    education = df["education"]
    for i in education:
        if not i in names:
            names.append(i)
    return names
def getAllOccupation():
    names = []
    column = df["occupation"]
    for i in column:
        if not i in names:
            names.append(i)
    return names
def getAllMaritalstatus():
    names = []
    column = df["maritalstatus"]
    for i in column:
        if not i in names:
            names.append(i)
    return names
def getAllRelationship():
    names = []
    column = df["raltionship"]
    for i in column:
        if not i in names:
            names.append(i)
    return names
def getAllRace():
    names = []
    column = df["race"]
    for i in column:
        if not i in names:
            names.append(i)
    return names
def getAllNations():
    names = []
    column = df["native-country"]
    for i in column:
        if not i in names:
            names.append(i)
    return names

In [60]:
workclasses_names = getAllWorkclasses()
print (len(workclasses_names) , "workclasses")
print (len(getAllEducation()), "education")
print (len(getAllMaritalstatus()),"maritalStatus")
print (len(getAllOccupation()),"occupation")
print (len(getAllRace()),"races")

9 workclasses
16 education
7 maritalStatus
15 occupation
5 races


In [61]:
education_names = getAllEducation()
maritalstatus_names = getAllMaritalstatus()
occupation_status = getAllOccupation()
races_names = getAllRace()
relationship_names = getAllRelationship()
nations_names = getAllNations()

### Encode of the strings to int values

In [62]:
newDf = pd.DataFrame()
newDf["age"] = df["age"]
def encodeWorkclass():
    workclasses_encoded = []
    for i in range(len(df)):
        for j in range(len(workclasses_names)):
            if df["workclass"][i] == ((workclasses_names[j])):
                workclasses_encoded.append(j)
    newDf["workclass"] = pd.Series(workclasses_encoded).values
def encodeEducation():
    education_encoded = []
    for i in range(len(df)):
        for j in range(len(education_names)):
            if df["education"][i] == ((education_names[j])):
                education_encoded.append(j)
    newDf["education"] = pd.Series(education_encoded).values
def encodeMaritalStatus():
    maritalstatus_encoded = []
    for i in range(len(df)):
        for j in range(len(maritalstatus_names)):
            if df["maritalstatus"][i] == ((maritalstatus_names[j])):
                maritalstatus_encoded.append(j)
    newDf["maritalstatus"] = pd.Series(maritalstatus_encoded).values
def encodeOccupation():
    occupation_encoded = []
    for i in range(len(df)):
        for j in range(len(occupation_status)):
            if df["occupation"][i] == ((occupation_status[j])):
                occupation_encoded.append(j)
    newDf["occupation"] = pd.Series(occupation_encoded).values
    return occupation_encoded
def encodeRelationShip():
    relationship_encoded = []
    for i in range(len(df)):
        for j in range(len(relationship_names)):
            if df["raltionship"][i] == ((relationship_names[j])):
                relationship_encoded.append(j)
    newDf["raltionship"] = pd.Series(relationship_encoded).values
    return relationship_encoded
def encodeNations():
    nations_encoded = []
    for i in range(len(df)):
        for j in range(len(nations_names)):
            if df["native-country"][i] == ((nations_names[j])):
                nations_encoded.append(j)
    newDf["native-country"] = pd.Series(nations_encoded).values
    return nations_encoded
def encodeRace():
    race_encoded = []
    for i in range(len(df)):
        for j in range(len(races_names)):
            if df["race"][i] == ((races_names[j])):
                race_encoded.append(j)
    newDf["race"] = pd.Series(race_encoded).values
    return race_encoded
def encodeSex():
    sex_encoded = []
    for i in range(len(df)):
        if df["sex"][i] == 'Male':
            sex_encoded.append(1)
        else:
            sex_encoded.append(0)
    newDf["sex"] = pd.Series(sex_encoded).values
    return sex_encoded
encodeWorkclass()
encodeEducation()
encodeMaritalStatus()
encodeOccupation()
encodeRelationShip()
encodeRace()
encodeSex()
newDf["capital-gain"] = df["capital-gain"]
newDf["capital-loss"] = df["capital-loss"]
newDf["hours-per-week"] = df["hours-per-week"]
encodeNations()
newDf["income"] = df["income"]
newDf.to_csv("income_encoded.csv",index=True)
        

## New Table for decoding

In [63]:
df.head()

Unnamed: 0,age,workclass,fnwlgt,education,educationnum,maritalstatus,occupation,raltionship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income,class
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K,1
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K,1
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K,1
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K,1
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K,1


In [76]:
encodeDf = pd.DataFrame()
encodeDf["age"] = ["already int"]
pd.concat([encodeDf,workclasses_names],ignore_index=True,axis=1)
encodeDf.to_csv("uebersicht.csv")

TypeError: cannot concatenate object of type '<class 'list'>'; only Series and DataFrame objs are valid

## Splitting the dataset

In [None]:
train,valid,test =np.split(newDf.sample(frac=1),[int(0.6*len(df)),int(0.8*len(df))])

In [None]:
train, X_train, y_train = scale_dataset(train, oversample=True)
valid, X_valid, y_valid = scale_dataset(valid, oversample=False)
test, X_test, y_test = scale_dataset(test, oversample=False)

# The KNN-Model

## Training the Model

In [None]:
knn_model = KNeighborsClassifier(n_neighbors=12)
knn_model.fit(X_train, y_train)

## Validation

In [None]:
knn_model.predict([[50,6,4,4,1,1,0,1,1,1,40,40]])

array(['>50K'], dtype=object)

In [None]:
predictions = knn_model.predict(X_valid)
score = accuracy_score(y_valid,predictions)
score

0.7776412776412777

In [None]:
knn_model.fit(X_test,y_test)
test_predictions = knn_model.predict(X_valid)
test_score = accuracy_score(y_valid,test_predictions)
test_score

0.816031941031941

# The DecisionTreeClassifier Model

In [None]:
from sklearn.tree import DecisionTreeClassifier

## Training the model

In [None]:
dtc_model = DecisionTreeClassifier()
dtc_model.fit(X_train,y_train)

## Validate

In [None]:
dtc_predictions = dtc_model.predict(X_valid)
dtc_score = accuracy_score(y_valid,dtc_predictions)
dtc_score

0.7989864864864865

In [None]:
dtc_model.fit(X_test,y_test)
dtc_test_predictions = dtc_model.predict(X_valid)
dtc_test_score = accuracy_score(y_valid,dtc_test_predictions)
dtc_test_score

0.8086609336609336