In [1]:
import pandas as pd 
import numpy as np 

In [2]:
# Path to the raw income CSV, relative to the notebook's working directory.
file = 'static/data/income_evaluation.csv'

# Parse the CSV into a DataFrame and preview its first five rows.
data = pd.read_csv(filepath_or_buffer=file)
data.head(n=5)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [3]:
# Plain assignment binds a second name to the same DataFrame object (no copy is made),
# so columns added through `data_with_labels` are also visible through `data`.
data_with_labels = data

In [4]:
# Encode `workclass` as integer codes, numbered in order of first appearance.
working_class = data['workclass'].unique()

# Category value -> integer code (the value's position in `working_class`).
EmploymentLabels = {label: code for code, label in enumerate(working_class)}

# Map straight from the source column: no placeholder copy of the strings is needed,
# and the lookup does not rely on `data` and `data_with_labels` being the same object.
data_with_labels['workclassLabels'] = data_with_labels['workclass'].map(EmploymentLabels)

In [5]:
# Encode `marital-status` as integer codes, numbered in order of first appearance.
marital_statuses = data['marital-status'].unique()

# Category value -> integer code (the value's position in `marital_statuses`).
MaritalLabels = {label: code for code, label in enumerate(marital_statuses)}

# Map straight from the source column: no placeholder copy of the strings is needed,
# and the lookup does not rely on `data` and `data_with_labels` being the same object.
data_with_labels['maritalLabels'] = data_with_labels['marital-status'].map(MaritalLabels)

In [6]:
# Encode `occupation` as integer codes, numbered in order of first appearance.
occupations = data['occupation'].unique()

# Category value -> integer code (the value's position in `occupations`).
OccupationLabels = {label: code for code, label in enumerate(occupations)}

# Map straight from the source column: no placeholder copy of the strings is needed,
# and the lookup does not rely on `data` and `data_with_labels` being the same object.
data_with_labels['occupationLabels'] = data_with_labels['occupation'].map(OccupationLabels)

In [7]:
# Encode `relationship` as integer codes, numbered in order of first appearance.
relationships = data['relationship'].unique()

# Category value -> integer code (the value's position in `relationships`).
RelationshipLabels = {label: code for code, label in enumerate(relationships)}

# Map straight from the source column: no placeholder copy of the strings is needed,
# and the lookup does not rely on `data` and `data_with_labels` being the same object.
data_with_labels['relationshipLabels'] = data_with_labels['relationship'].map(RelationshipLabels)

In [8]:
# Encode `race` as integer codes, numbered in order of first appearance.
races = data['race'].unique()

# Category value -> integer code (the value's position in `races`).
RaceLabels = {label: code for code, label in enumerate(races)}

# Map straight from the source column: no placeholder copy of the strings is needed,
# and the lookup does not rely on `data` and `data_with_labels` being the same object.
data_with_labels['raceLabels'] = data_with_labels['race'].map(RaceLabels)

In [9]:
# Encode `sex` as integer codes, numbered in order of first appearance.
sexes = data['sex'].unique()

# Category value -> integer code (the value's position in `sexes`).
SexLabels = {label: code for code, label in enumerate(sexes)}

# Map straight from the source column: no placeholder copy of the strings is needed,
# and the lookup does not rely on `data` and `data_with_labels` being the same object.
data_with_labels['sexLabels'] = data_with_labels['sex'].map(SexLabels)

In [10]:
# Encode `native-country` as integer codes, numbered in order of first appearance.
countries = data['native-country'].unique()

# Category value -> integer code (the value's position in `countries`).
CountryLabels = {label: code for code, label in enumerate(countries)}

# Map straight from the source column: no placeholder copy of the strings is needed,
# and the lookup does not rely on `data` and `data_with_labels` being the same object.
data_with_labels['countryLabels'] = data_with_labels['native-country'].map(CountryLabels)

In [12]:
# Columns kept for modelling: the two numeric columns, the integer-coded
# categorical columns, and the `income` target.
model_columns = [
    'age',
    'education-num',
    'workclassLabels',
    'maritalLabels',
    'occupationLabels',
    'relationshipLabels',
    'raceLabels',
    'sexLabels',
    'countryLabels',
    'income',
]
labeled_data = data_with_labels[model_columns]
labeled_data

Unnamed: 0,age,education-num,workclassLabels,maritalLabels,occupationLabels,relationshipLabels,raceLabels,sexLabels,countryLabels,income
0,39,13,0,0,0,0,0,0,0,<=50K
1,50,13,1,1,1,1,0,0,0,<=50K
2,38,9,2,2,2,0,0,0,0,<=50K
3,53,7,2,1,2,1,1,0,0,<=50K
4,28,13,2,1,3,2,1,1,1,<=50K
...,...,...,...,...,...,...,...,...,...,...
32556,27,12,2,1,10,2,0,1,0,<=50K
32557,40,9,2,1,9,1,0,0,0,>50K
32558,58,9,2,6,0,4,0,1,0,<=50K
32559,22,9,2,0,0,3,0,0,0,<=50K


## Data Pre-Processing

In [14]:
# Features are every column except the target; the target is the income label.
target_column = "income"
X = labeled_data.drop(columns=[target_column])
y = labeled_data[target_column]
print(X.shape, y.shape)

(32561, 9) (32561,)


In [15]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from tensorflow.keras.utils import to_categorical

In [16]:
# Hold out a test partition; the fixed seed makes the shuffle repeatable.
# test_size=0.25 is the value scikit-learn applies when neither size is given.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=1)

In [17]:
# Learn each feature's min/max from the training rows only, then rescale both
# splits with those same statistics.
X_scaler = MinMaxScaler()
X_scaler.fit(X_train)
X_train_scaled, X_test_scaled = (X_scaler.transform(split) for split in (X_train, X_test))

In [18]:
# Step 1: fit the encoder on the training labels, then convert both label sets
# to integer class ids
label_encoder = LabelEncoder().fit(y_train)
encoded_y_train = label_encoder.transform(y_train)
encoded_y_test = label_encoder.transform(y_test)

In [19]:
# Step 2: Convert encoded labels to one-hot-encoding.
# Pin the width to the number of classes the encoder learned so the train and test
# matrices always have the same number of columns, even if a split happens to lack
# the highest-numbered class (to_categorical otherwise infers the width from max + 1).
num_classes = len(label_encoder.classes_)
y_train_categorical = to_categorical(encoded_y_train, num_classes=num_classes)
y_test_categorical = to_categorical(encoded_y_test, num_classes=num_classes)

## Create a Deep Learning Model

In [20]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

In [33]:
# Create model and add layers.
# Size the input and output layers from the prepared arrays rather than hard-coding
# them, so the network stays consistent if the feature list or class count changes.
n_features = X_train_scaled.shape[1]
n_classes = y_train_categorical.shape[1]

model = Sequential()
model.add(Dense(units=15, activation='relu', input_dim=n_features))
model.add(Dense(units=15, activation='relu'))
# Softmax output: one probability per one-hot target column.
model.add(Dense(units=n_classes, activation='softmax'))

In [34]:
# Compile the model: Adam optimizer, categorical cross-entropy loss
# (the targets are one-hot encoded), and accuracy tracked as a metric.
# Training happens in a later cell.
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

In [35]:
# Print the layer-by-layer architecture and parameter counts.
model.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_9 (Dense)              (None, 15)                150       
_________________________________________________________________
dense_10 (Dense)             (None, 15)                240       
_________________________________________________________________
dense_11 (Dense)             (None, 2)                 32        
Total params: 422
Trainable params: 422
Non-trainable params: 0
_________________________________________________________________


In [36]:
# Train on the scaled features and one-hot targets for 60 epochs, shuffling the
# training data before each epoch and logging one line per epoch.
model.fit(x=X_train_scaled, y=y_train_categorical, epochs=60, shuffle=True, verbose=2)

Epoch 1/60
764/764 - 1s - loss: 0.4749 - accuracy: 0.7760
Epoch 2/60
764/764 - 0s - loss: 0.3955 - accuracy: 0.8121
Epoch 3/60
764/764 - 0s - loss: 0.3804 - accuracy: 0.8201
Epoch 4/60
764/764 - 0s - loss: 0.3768 - accuracy: 0.8229
Epoch 5/60
764/764 - 0s - loss: 0.3750 - accuracy: 0.8222
Epoch 6/60
764/764 - 0s - loss: 0.3731 - accuracy: 0.8236
Epoch 7/60
764/764 - 0s - loss: 0.3724 - accuracy: 0.8238
Epoch 8/60
764/764 - 0s - loss: 0.3717 - accuracy: 0.8254
Epoch 9/60
764/764 - 0s - loss: 0.3705 - accuracy: 0.8235
Epoch 10/60
764/764 - 0s - loss: 0.3694 - accuracy: 0.8251
Epoch 11/60
764/764 - 0s - loss: 0.3690 - accuracy: 0.8252
Epoch 12/60
764/764 - 0s - loss: 0.3683 - accuracy: 0.8255
Epoch 13/60
764/764 - 0s - loss: 0.3675 - accuracy: 0.8256
Epoch 14/60
764/764 - 0s - loss: 0.3680 - accuracy: 0.8260
Epoch 15/60
764/764 - 0s - loss: 0.3666 - accuracy: 0.8275
Epoch 16/60
764/764 - 0s - loss: 0.3670 - accuracy: 0.8267
Epoch 17/60
764/764 - 0s - loss: 0.3665 - accuracy: 0.8260
Epoch 

<tensorflow.python.keras.callbacks.History at 0x7ffd98aa2670>

## Quantify the Trained Model

In [37]:
# Score the trained network on the held-out split, then report loss and accuracy.
model_loss, model_accuracy = model.evaluate(x=X_test_scaled, y=y_test_categorical, verbose=2)
print(f"Normal Neural Network - Loss: {model_loss}, Accuracy: {model_accuracy}")

255/255 - 0s - loss: 0.3498 - accuracy: 0.8336
Normal Neural Network - Loss: 0.3497696816921234, Accuracy: 0.8335585594177246
