In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score
from sklearn.metrics import recall_score, f1_score, roc_auc_score
import keras.models as models
import keras.layers as layers
import keras.metrics as metrics

In [60]:
def get_clf_eval(y_test, pred):
    pred = (pred > 0.6)
    acc = accuracy_score(y_test, pred)
    pre = precision_score(y_test, pred, pos_label=1)
    re = recall_score(y_test, pred, pos_label=1)
    f1 = f1_score(y_test, pred, pos_label=1)
    auc = roc_auc_score(y_test, pred)

    return acc, pre, re, f1, auc

def print_clf_eval(y_test, pred):
    pred = (pred > 0.6)
    confusion = confusion_matrix(y_test, pred)
    acc, pre, re, f1, auc = get_clf_eval(y_test, pred)

    print("=Confusion matrix=")
    print(confusion)
    print("==================")

    print(f"Acc : {acc:.4f}, Pre : {pre:.4f}")
    print(f"Re : {re:.4f}, F1 : {f1:.4f}, AUC : {auc:.4f}")
    
def get_result(model, X_train, Y_train, X_test, Y_test):
    model.fit(X_train, Y_train)
    pred = (model.predict(X_test) > 0.6)
    
    return get_clf_eval(Y_test, pred)

In [3]:
data = pd.read_csv('Census_Income.csv')

In [4]:
data.dropna(axis=0, inplace=True)
data.reset_index(drop=True, inplace=True)
lb = LabelEncoder()
data['income'] = lb.fit_transform(data['income'])

In [5]:
scaler = StandardScaler()

data['age'] = scaler.fit_transform(np.array(data['age']).reshape(-1, 1))
data['fnlwgt'] = scaler.fit_transform(np.array(data['fnlwgt']).reshape(-1, 1))
data['education-num'] = scaler.fit_transform(np.array(data['education-num']).reshape(-1, 1))
data['capital-gain'] = scaler.fit_transform(np.array(data['capital-gain']).reshape(-1, 1))
data['capital-loss'] = scaler.fit_transform(np.array(data['capital-loss']).reshape(-1, 1))
data['hours-per-week'] = scaler.fit_transform(np.array(data['hours-per-week']).reshape(-1, 1))
data

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,0.037656,State-gov,-1.063678,Bachelors,1.138803,Never-married,Adm-clerical,Not-in-family,White,Male,0.154812,-0.231072,-0.094472,United-States,0
1,0.877496,Self-emp-not-inc,-1.009266,Bachelors,1.138803,Married-civ-spouse,Exec-managerial,Husband,White,Male,-0.146227,-0.231072,-2.421087,United-States,0
2,-0.038693,Private,0.233290,HS-grad,-0.445800,Divorced,Handlers-cleaners,Not-in-family,White,Male,-0.146227,-0.231072,-0.094472,United-States,0
3,1.106543,Private,0.412394,11th,-1.238102,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,-0.146227,-0.231072,-0.094472,United-States,0
4,-0.802184,Private,1.385969,Bachelors,1.138803,Married-civ-spouse,Prof-specialty,Wife,Black,Female,-0.146227,-0.231072,-0.094472,Cuba,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4926,0.343052,Private,-0.976976,Assoc-voc,0.346501,Married-civ-spouse,Craft-repair,Husband,White,Male,-0.146227,-0.231072,4.989614,United-States,0
4927,1.870034,Private,1.604011,Masters,1.534954,Separated,Prof-specialty,Not-in-family,White,Female,-0.146227,-0.231072,-0.094472,United-States,1
4928,0.572099,Private,-0.258878,7th-8th,-2.426555,Married-civ-spouse,Transport-moving,Husband,White,Male,-0.146227,-0.231072,-0.094472,United-States,0
4929,1.564637,Private,-0.105220,HS-grad,-0.445800,Married-civ-spouse,Sales,Husband,White,Male,-0.146227,-0.231072,-0.094472,United-States,0


In [6]:
X = data.iloc[:, 0:-1]
Y = data.iloc[:, -1]

In [7]:
X_log = X.copy()
Y_log = Y.copy()

for column in X_log.columns:
    if X_log[column].dtype == object:
        before = X_log[[column]]
        encoder = OneHotEncoder()
        temp = encoder.fit_transform(before)
        X_log.drop(column, axis=1, inplace=True)
        X_log = pd.concat([pd.DataFrame(temp.toarray(), columns=encoder.categories_), X_log], axis=1)
        del encoder
        
X_train_log, X_test_log, Y_train_log, Y_test_log = train_test_split(X_log, Y_log, test_size=0.3, random_state=13, stratify=Y)
X_train_log.reset_index(drop=True, inplace=True)
X_test_log.reset_index(drop=True, inplace=True)
Y_train_log.reset_index(drop=True, inplace=True)
Y_test_log.reset_index(drop=True, inplace=True)

In [16]:
X_rf = X.copy()
Y_rf = Y.copy()

for column in X_rf.columns:
    if X_rf[column].dtype == object:
        before = X_rf[[column]]
        encoder = LabelEncoder()
        temp = encoder.fit_transform(before)
        X_rf[column] = temp
        del encoder
        
X_train_rf, X_test_rf, Y_train_rf, Y_test_rf = train_test_split(X_rf, Y_rf, test_size=0.3, random_state=13, stratify=Y)
X_train_rf.reset_index(drop=True, inplace=True)
X_test_rf.reset_index(drop=True, inplace=True)
Y_train_rf.reset_index(drop=True, inplace=True)
Y_test_rf.reset_index(drop=True, inplace=True)

  return f(*args, **kwargs)


In [8]:
class_weight = {0:0.25, 1:0.75}

In [54]:
model_log = models.Sequential(name='Logistic')
model_log.add(layers.Dense(1, activation='sigmoid', input_shape=(X_train_log.shape[1],)))
#model.add(layers.Dense(1, activation='sigmoid'))
model_log.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [55]:
model_log.summary()

Model: "Logistic"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_11 (Dense)             (None, 1)                 103       
Total params: 103
Trainable params: 103
Non-trainable params: 0
_________________________________________________________________


In [56]:
model_log.fit(X_train_log, Y_train_log, epochs=200, use_multiprocessing=True, workers=-1)

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78

Epoch 80/200
Epoch 81/200
Epoch 82/200
Epoch 83/200
Epoch 84/200
Epoch 85/200
Epoch 86/200
Epoch 87/200
Epoch 88/200
Epoch 89/200
Epoch 90/200
Epoch 91/200
Epoch 92/200
Epoch 93/200
Epoch 94/200
Epoch 95/200
Epoch 96/200
Epoch 97/200
Epoch 98/200
Epoch 99/200
Epoch 100/200
Epoch 101/200
Epoch 102/200
Epoch 103/200
Epoch 104/200
Epoch 105/200
Epoch 106/200
Epoch 107/200
Epoch 108/200
Epoch 109/200
Epoch 110/200
Epoch 111/200
Epoch 112/200
Epoch 113/200
Epoch 114/200
Epoch 115/200
Epoch 116/200
Epoch 117/200
Epoch 118/200
Epoch 119/200
Epoch 120/200
Epoch 121/200
Epoch 122/200
Epoch 123/200
Epoch 124/200
Epoch 125/200
Epoch 126/200
Epoch 127/200
Epoch 128/200
Epoch 129/200
Epoch 130/200
Epoch 131/200
Epoch 132/200
Epoch 133/200
Epoch 134/200
Epoch 135/200
Epoch 136/200
Epoch 137/200
Epoch 138/200
Epoch 139/200
Epoch 140/200
Epoch 141/200
Epoch 142/200
Epoch 143/200
Epoch 144/200
Epoch 145/200
Epoch 146/200
Epoch 147/200
Epoch 148/200
Epoch 149/200
Epoch 150/200
Epoch 151/200
Epoch 152/20

Epoch 158/200
Epoch 159/200
Epoch 160/200
Epoch 161/200
Epoch 162/200
Epoch 163/200
Epoch 164/200
Epoch 165/200
Epoch 166/200
Epoch 167/200
Epoch 168/200
Epoch 169/200
Epoch 170/200
Epoch 171/200
Epoch 172/200
Epoch 173/200
Epoch 174/200
Epoch 175/200
Epoch 176/200
Epoch 177/200
Epoch 178/200
Epoch 179/200
Epoch 180/200
Epoch 181/200
Epoch 182/200
Epoch 183/200
Epoch 184/200
Epoch 185/200
Epoch 186/200
Epoch 187/200
Epoch 188/200
Epoch 189/200
Epoch 190/200
Epoch 191/200
Epoch 192/200
Epoch 193/200
Epoch 194/200
Epoch 195/200
Epoch 196/200
Epoch 197/200
Epoch 198/200
Epoch 199/200
Epoch 200/200


<tensorflow.python.keras.callbacks.History at 0x16dfab820>

In [57]:
model_log.evaluate(X_test_log, Y_test_log)



[0.3209892213344574, 0.8594594597816467]

In [62]:
temp = []
temp.append(get_result(model_log, X_train_log, Y_train_log, X_test_log, Y_test_log))

columns = ['Accuracy', 'Precision', 'Recall', 'F1-score', 'ROC-AUC']

pd.DataFrame(temp, index=["Log"], columns=columns)



Unnamed: 0,Accuracy,Precision,Recall,F1-score,ROC-AUC
Log,0.85,0.840183,0.495957,0.623729,0.732198


In [63]:
print_clf_eval(Y_test_log, model_log.predict(X_test_log))

=Confusion matrix=
[[1074   35]
 [ 187  184]]
Acc : 0.8500, Pre : 0.8402
Re : 0.4960, F1 : 0.6237, AUC : 0.7322


In [64]:
model_dnn = models.Sequential(name = 'DNN')
model_dnn.add(layers.Dense(1, activation='sigmoid', input_shape=(X_train_rf.shape[1],)))
model_dnn.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [65]:
model_dnn.summary()

Model: "DNN"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_12 (Dense)             (None, 1)                 15        
Total params: 15
Trainable params: 15
Non-trainable params: 0
_________________________________________________________________


In [66]:
model_dnn.fit(X_train_rf, Y_train_rf, epochs=200, use_multiprocessing=True, workers=-1)

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78

Epoch 80/200
Epoch 81/200
Epoch 82/200
Epoch 83/200
Epoch 84/200
Epoch 85/200
Epoch 86/200
Epoch 87/200
Epoch 88/200
Epoch 89/200
Epoch 90/200
Epoch 91/200
Epoch 92/200
Epoch 93/200
Epoch 94/200
Epoch 95/200
Epoch 96/200
Epoch 97/200
Epoch 98/200
Epoch 99/200
Epoch 100/200
Epoch 101/200
Epoch 102/200
Epoch 103/200
Epoch 104/200
Epoch 105/200
Epoch 106/200
Epoch 107/200
Epoch 108/200
Epoch 109/200
Epoch 110/200
Epoch 111/200
Epoch 112/200
Epoch 113/200
Epoch 114/200
Epoch 115/200
Epoch 116/200
Epoch 117/200
Epoch 118/200
Epoch 119/200
Epoch 120/200
Epoch 121/200
Epoch 122/200
Epoch 123/200
Epoch 124/200
Epoch 125/200
Epoch 126/200
Epoch 127/200
Epoch 128/200
Epoch 129/200
Epoch 130/200
Epoch 131/200
Epoch 132/200
Epoch 133/200
Epoch 134/200
Epoch 135/200
Epoch 136/200
Epoch 137/200
Epoch 138/200
Epoch 139/200
Epoch 140/200
Epoch 141/200
Epoch 142/200
Epoch 143/200
Epoch 144/200
Epoch 145/200
Epoch 146/200
Epoch 147/200
Epoch 148/200
Epoch 149/200
Epoch 150/200
Epoch 151/200
Epoch 152/20

Epoch 158/200
Epoch 159/200
Epoch 160/200
Epoch 161/200
Epoch 162/200
Epoch 163/200
Epoch 164/200
Epoch 165/200
Epoch 166/200
Epoch 167/200
Epoch 168/200
Epoch 169/200
Epoch 170/200
Epoch 171/200
Epoch 172/200
Epoch 173/200
Epoch 174/200
Epoch 175/200
Epoch 176/200
Epoch 177/200
Epoch 178/200
Epoch 179/200
Epoch 180/200
Epoch 181/200
Epoch 182/200
Epoch 183/200
Epoch 184/200
Epoch 185/200
Epoch 186/200
Epoch 187/200
Epoch 188/200
Epoch 189/200
Epoch 190/200
Epoch 191/200
Epoch 192/200
Epoch 193/200
Epoch 194/200
Epoch 195/200
Epoch 196/200
Epoch 197/200
Epoch 198/200
Epoch 199/200
Epoch 200/200


<tensorflow.python.keras.callbacks.History at 0x1750476d0>

In [67]:
model_dnn.evaluate(X_test_rf, Y_test_rf)



[0.40201324224472046, 0.8195946216583252]

In [68]:
temp = []
temp.append(get_result(model_dnn, X_train_rf, Y_train_rf, X_test_rf, Y_test_rf))

columns = ['Accuracy', 'Precision', 'Recall', 'F1-score', 'ROC-AUC']

pd.DataFrame(temp, index=["DNN"], columns=columns)



Unnamed: 0,Accuracy,Precision,Recall,F1-score,ROC-AUC
DNN,0.810135,0.826087,0.307278,0.447937,0.642818


In [69]:
print_clf_eval(Y_test_rf, model_dnn.predict(X_test_rf))

=Confusion matrix=
[[1085   24]
 [ 257  114]]
Acc : 0.8101, Pre : 0.8261
Re : 0.3073, F1 : 0.4479, AUC : 0.6428
