In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn import tree
import pandas as pd
import numpy as np
import os

In [2]:
file = 'static/data/income_evaluation.csv'
data = pd.read_csv(file)
data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [3]:
data_with_labels = data

In [4]:
working_class = data['workclass'].unique()
working_class
EmploymentLabels = {}
for workclass in working_class:
    result = np.where((working_class==workclass))
    EmploymentLabels[workclass] = result[0][0]
data_with_labels['workclassLabels'] = data_with_labels['workclass']
data_with_labels['workclassLabels'] = data['workclassLabels'].map(EmploymentLabels) 

In [5]:
marital_statuses = data['marital-status'].unique()
MaritalLabels = {}
for status in marital_statuses:
    result = np.where((marital_statuses==status))
    MaritalLabels[status] = result[0][0]
data_with_labels['maritalLabels'] = data_with_labels['marital-status']
data_with_labels['maritalLabels'] = data['maritalLabels'].map(MaritalLabels) 

In [6]:
occupations = data['occupation'].unique()
OccupationLabels = {}
for occupation in occupations:
    result = np.where((occupations==occupation))
    OccupationLabels[occupation] = result[0][0]
data_with_labels['occupationLabels'] = data_with_labels['occupation']
data_with_labels['occupationLabels'] = data['occupationLabels'].map(OccupationLabels) 

In [7]:
relationships = data['relationship'].unique()
RelationshipLabels = {}
for relationship in relationships:
    result = np.where((relationships==relationship))
    RelationshipLabels[relationship] = result[0][0]
data_with_labels['relationshipLabels'] = data_with_labels['relationship']
data_with_labels['relationshipLabels'] = data['relationshipLabels'].map(RelationshipLabels)

In [8]:
races = data['race'].unique()
RaceLabels = {}
for race in races:
    result = np.where((races==race))
    RaceLabels[race] = result[0][0]
data_with_labels['raceLabels'] = data_with_labels['race']
data_with_labels['raceLabels'] = data['raceLabels'].map(RaceLabels) 

In [9]:
sexes = data['sex'].unique()
SexLabels = {}
for sex in sexes:
    result = np.where((sexes==sex))
    SexLabels[sex] = result[0][0]
data_with_labels['sexLabels'] = data_with_labels['sex']
data_with_labels['sexLabels'] = data['sexLabels'].map(SexLabels) 

In [10]:
countries = data['native-country'].unique()
CountryLabels = {}
for country in countries:
    result = np.where((countries==country))
    CountryLabels[country] = result[0][0]
data_with_labels['countryLabels'] = data_with_labels['native-country']
data_with_labels['countryLabels'] = data['countryLabels'].map(CountryLabels) 


In [11]:
incomes = data['income'].unique()
IncomeLabels = {}
for income in incomes:
    result = np.where((incomes==income))
    IncomeLabels[income] = result[0][0]
data_with_labels['incomeLabels'] = data_with_labels['income']
data_with_labels['incomeLabels'] = data['incomeLabels'].map(IncomeLabels) 

In [12]:
labeled_data = data_with_labels[['age','education-num','workclassLabels','maritalLabels','occupationLabels','relationshipLabels','raceLabels','sexLabels','countryLabels','hours-per-week','incomeLabels']]
labeled_data

Unnamed: 0,age,education-num,workclassLabels,maritalLabels,occupationLabels,relationshipLabels,raceLabels,sexLabels,countryLabels,hours-per-week,incomeLabels
0,39,13,0,0,0,0,0,0,0,40,0
1,50,13,1,1,1,1,0,0,0,13,0
2,38,9,2,2,2,0,0,0,0,40,0
3,53,7,2,1,2,1,1,0,0,40,0
4,28,13,2,1,3,2,1,1,1,40,0
...,...,...,...,...,...,...,...,...,...,...,...
32556,27,12,2,1,10,2,0,1,0,38,0
32557,40,9,2,1,9,1,0,0,0,40,1
32558,58,9,2,6,0,4,0,1,0,40,0
32559,22,9,2,0,0,3,0,0,0,20,0


In [13]:
# Set the seed value for the notebook so the results are reproducible
from numpy.random import seed
seed(42) 

In [13]:
X = data_with_labels[['age','education-num','workclassLabels','maritalLabels','occupationLabels','raceLabels','sexLabels','hours-per-week','countryLabels']]
y = data_with_labels[['incomeLabels']]

In [14]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

## Data Processing


In [15]:
from sklearn.preprocessing import StandardScaler

X_scaler = StandardScaler().fit(X_train) 

In [16]:
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [17]:
from tensorflow.keras.utils import to_categorical

In [18]:
# One-hot encoding
y_train_categorical = to_categorical(y_train)
y_test_categorical = to_categorical(y_test)
y_train_categorical

array([[1., 0.],
       [1., 0.],
       [1., 0.],
       ...,
       [1., 0.],
       [1., 0.],
       [0., 1.]], dtype=float32)

In [19]:
from tensorflow.keras.models import Sequential

model = Sequential()

In [20]:
from tensorflow.keras.layers import Dense
number_inputs = 9
number_hidden_nodes = 10
model.add(Dense(units=number_hidden_nodes,
                activation='relu', input_dim=number_inputs))

In [21]:
number_classes = 2
model.add(Dense(units=number_classes, activation='softmax'))

In [22]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 10)                100       
_________________________________________________________________
dense_1 (Dense)              (None, 2)                 22        
Total params: 122
Trainable params: 122
Non-trainable params: 0
_________________________________________________________________


In [23]:
# Use categorical crossentropy for categorical data and mean squared error for regression
# Hint: your output layer in this example is using software for logistic regression (categorical)
# If your output layer activation was `linear` then you may want to use `mse` for loss
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

In [24]:
# Fit (train) the model
model.fit(
    X_train_scaled,
    y_train_categorical,
    epochs=100,
    shuffle=True,
    verbose=2
)

Train on 24420 samples
Epoch 1/100
24420/24420 - 1s - loss: 0.4520 - accuracy: 0.7754
Epoch 2/100
24420/24420 - 1s - loss: 0.3962 - accuracy: 0.8125
Epoch 3/100
24420/24420 - 1s - loss: 0.3860 - accuracy: 0.8174
Epoch 4/100
24420/24420 - 1s - loss: 0.3810 - accuracy: 0.8194
Epoch 5/100
24420/24420 - 1s - loss: 0.3784 - accuracy: 0.8187
Epoch 6/100
24420/24420 - 1s - loss: 0.3765 - accuracy: 0.8199
Epoch 7/100
24420/24420 - 1s - loss: 0.3755 - accuracy: 0.8209
Epoch 8/100
24420/24420 - 1s - loss: 0.3740 - accuracy: 0.8226
Epoch 9/100
24420/24420 - 1s - loss: 0.3729 - accuracy: 0.8228
Epoch 10/100
24420/24420 - 1s - loss: 0.3718 - accuracy: 0.8237
Epoch 11/100
24420/24420 - 1s - loss: 0.3708 - accuracy: 0.8244
Epoch 12/100
24420/24420 - 1s - loss: 0.3700 - accuracy: 0.8254
Epoch 13/100
24420/24420 - 1s - loss: 0.3689 - accuracy: 0.8247
Epoch 14/100
24420/24420 - 1s - loss: 0.3681 - accuracy: 0.8256
Epoch 15/100
24420/24420 - 1s - loss: 0.3668 - accuracy: 0.8260
Epoch 16/100
24420/24420 -

<tensorflow.python.keras.callbacks.History at 0x1cbf92a5ba8>

In [25]:
# Evaluate the model using the testing data
model_loss, model_accuracy = model.evaluate(
    X_test_scaled, y_test_categorical, verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

8141/8141 - 0s - loss: 0.3454 - accuracy: 0.8375
Loss: 0.34537645272417306, Accuracy: 0.8374892473220825


## Making Predictions with new data

In [27]:
import numpy as np
new_data = np.array([[52,9,6,1,1,0,1,40,0]])
print(f"Predicted class: {model.predict(new_data)}")

Predicted class: [[0.8886916  0.11130838]]


In [28]:
model.save('neural_network_trained.h5')