## Imports

In [1]:
import pandas as pd
from pandas import DataFrame
import numpy as np
import matplotlib.pyplot as plt
import random
import statistics
from statistics import mean
import time

from sklearn import preprocessing
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import RandomOverSampler
from sklearn import metrics
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.dummy import DummyClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [2]:
# read data
df = pd.read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data",
                names = ["age", "workclass", "fnlwgt", "education", "education-num", "marital-status", "occupation", "relationship", "race", "sex", "capital-gain", "capital-loss", "hours-per-week", "native-country", "label"],
                index_col=False)

In [3]:
df.head(5)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,label
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


## Preprocessing

In [4]:
# prepare features; note: fnlwgt and education are omitted
features = df[["age", "education-num", "capital-gain", "capital-loss", "hours-per-week", "workclass", "marital-status", "occupation", "relationship", "race", "sex", "native-country"]]
target = df["label"]

In [5]:
def encode_and_impute_data(features, target):   
    
    # impute missing values
    imp = SimpleImputer(strategy="most_frequent")
    features_arr = imp.fit_transform(features)
    features = pd.DataFrame(features_arr, columns=features.columns)
    
    # ordinalEncoder for enumFeatures
    enumFeatures = features[["workclass", "marital-status", "occupation", "relationship", "race", "sex", "native-country"]]
    continuousFeatures = features[["age", "education-num", "capital-gain", "capital-loss", "hours-per-week"]]
    
    ordinalEncoder = preprocessing.OrdinalEncoder()
    ordinalEncoder.fit(enumFeatures)
    enumFeatures = pd.DataFrame(ordinalEncoder.transform(enumFeatures), columns=["workclass", "marital-status", "occupation", "relationship", "race", "sex", "native-country"])
    features = pd.concat([continuousFeatures, enumFeatures], axis=1)
    
    # labelEncoder for target
    labelEncoder = preprocessing.LabelEncoder()
    labelEncoder.fit([" <=50K", " >50K"])
    target = labelEncoder.transform(target)
    
    return features, target

In [6]:
def oversample(features, target):
    ros = RandomOverSampler(random_state=0)
    features, target = ros.fit_resample(features, target)
    
    return features, target

## Construct classifiers

In [7]:
dummy = DummyClassifier(strategy='constant', constant=0) # due to oversampling, most-frequent is not beneficial
decisionTree = DecisionTreeClassifier()
randomForest = RandomForestClassifier()

In [8]:
# define lists to store measurements
dummy_accuracies = []
dummy_precisions = []
dummy_f1Scores = []
dummy_runtimesOfFitting = []
dummy_runtimesOfPredicting = []

dc_accuracies = []
dc_precisions = []
dc_f1Scores = []
dc_runtimesOfFitting = []
dc_runtimesOfPredicting = []

rf_accuracies = []
rf_precisions = []
rf_f1Scores = []
rf_runtimesOfFitting = []
rf_runtimesOfPredicting = []

## Train and predict classifiers

In [9]:
# split the data
skf = StratifiedKFold(n_splits=8, shuffle=True)
for train, test in skf.split(features, target):

    X_train = np.take(features, train, 0)
    y_train = np.take(target, train, 0)
    X_test = np.take(features, test, 0)
    y_test = np.take(target, test, 0)
    
    X_train, y_train = encode_and_impute_data(X_train, y_train)
    X_train, y_train = oversample(X_train, y_train)
    X_test, y_test = encode_and_impute_data(X_test, y_test)
    
    
    # dummy
    startTime = time.time()
    dummy.fit(X_train, y_train) # fit
    endTime = time.time()
    elapsedTime = endTime - startTime
    dummy_runtimesOfFitting.append(elapsedTime)
    
    startTime = time.time()
    dummy_y_pred = dummy.predict(X_test) # predict
    endTime = time.time()
    elapsedTime = endTime - startTime
    dummy_runtimesOfPredicting.append(elapsedTime)
    
    dummy_accuracies.append(metrics.accuracy_score(y_test, dummy_y_pred))
    dummy_precisions.append(metrics.precision_score(y_test, dummy_y_pred, zero_division=0))
    dummy_f1Scores.append(metrics.f1_score(y_test, dummy_y_pred))
    
    
    # decision tree
    startTime = time.time()
    decisionTree.fit(X_train, y_train) # fit
    endTime = time.time()
    elapsedTime = endTime - startTime
    dc_runtimesOfFitting.append(elapsedTime)
    
    startTime = time.time()
    dc_y_pred = decisionTree.predict(X_test) # predict
    endTime = time.time()
    elapsedTime = endTime - startTime
    dc_runtimesOfPredicting.append(elapsedTime)
    
    dc_accuracies.append(metrics.accuracy_score(y_test, dc_y_pred))
    dc_precisions.append(metrics.precision_score(y_test, dc_y_pred, zero_division=0))
    dc_f1Scores.append(metrics.f1_score(y_test, dc_y_pred))
    
    
    # random forest
    startTime = time.time()
    randomForest.fit(X_train, y_train) # fit
    endTime = time.time()
    elapsedTime = endTime - startTime
    rf_runtimesOfFitting.append(elapsedTime)
    
    startTime = time.time()
    rf_y_pred = randomForest.predict(X_test) # predict
    endTime = time.time()
    elapsedTime = endTime - startTime
    rf_runtimesOfPredicting.append(elapsedTime)
    
    rf_accuracies.append(metrics.accuracy_score(y_test, rf_y_pred))
    rf_precisions.append(metrics.precision_score(y_test, rf_y_pred, zero_division=0))
    rf_f1Scores.append(metrics.f1_score(y_test, rf_y_pred))

## Evaluation

In [10]:
# print results
print("-------------------------------------------------------------")
print("Dummy accuracy: " + str(mean(dummy_accuracies)))
print("Dummy standard deviation of accuracy: " + str(statistics.pstdev(dummy_accuracies)))
print("Dummy precision: " + str(mean(dummy_precisions)))
print("Dummy F1-score: " + str(mean(dummy_f1Scores)))
print("Dummy fitting time (avg in seconds): " + str(mean(dummy_runtimesOfFitting)))
print("Dummy predicting time (avg in seconds): " + str(mean(dummy_runtimesOfPredicting)))
print("------------------------------------------------------------")
print("Decision Tree accuracy: " + str(mean(dc_accuracies)))
print("Decision Tree standard deviation of accuracy: " + str(statistics.pstdev(dc_accuracies)))
print("Decision Tree precision: " + str(mean(dc_precisions)))
print("Decision Tree F1-score: " + str(mean(dc_f1Scores)))
print("Decision Tree fitting time (avg in seconds): " + str(mean(dc_runtimesOfFitting)))
print("Decision Tree predicting time (avg in seconds): " + str(mean(dc_runtimesOfPredicting)))
print("-------------------------------------------------------------")
print("Random Forest accuracy: " + str(mean(rf_accuracies)))
print("Random Forest standard deviation of accuracy: " + str(statistics.pstdev(rf_accuracies)))
print("Random Forest precision: " + str(mean(rf_precisions)))
print("Random Forest F1-score: " + str(mean(rf_f1Scores)))
print("Random Forest fitting time (avg in seconds): " + str(mean(rf_runtimesOfFitting)))
print("Random Forest predicting time (avg in seconds): " + str(mean(rf_runtimesOfPredicting)))
print("-------------------------------------------------------------")

-------------------------------------------------------------
Dummy accuracy: 0.7591904475655397
Dummy standard deviation of accuracy: 6.167682383990249e-05
Dummy precision: 0.0
Dummy F1-score: 0.0
Dummy fitting time (avg in seconds): 0.0012544691562652588
Dummy predicting time (avg in seconds): 0.00012513995170593262
------------------------------------------------------------
Decision Tree accuracy: 0.8039376391531882
Decision Tree standard deviation of accuracy: 0.010015483889362808
Decision Tree precision: 0.5924946624674278
Decision Tree F1-score: 0.5942315387917761
Decision Tree fitting time (avg in seconds): 0.15137013792991638
Decision Tree predicting time (avg in seconds): 0.00536954402923584
-------------------------------------------------------------
Random Forest accuracy: 0.8413133707164657
Random Forest standard deviation of accuracy: 0.008631325045152106
Random Forest precision: 0.6749440561408799
Random Forest F1-score: 0.6658340856822171
Random Forest fitting time (av