In [None]:
from __future__ import print_function
import tensorflow as tf
import keras
from keras.datasets import mnist
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, BatchNormalization
from keras import regularizers
from keras import backend as K
import numpy as np
import pandas as pd
import math
import matplotlib.pyplot as plt
import random
import requests
import zipfile
import io

# Fix random seeds for reproducibility. NumPy alone is not enough: Keras draws
# weight initialisations and dropout masks from TensorFlow's generator, so that
# one (and Python's own `random`) must be seeded as well.
np.random.seed(1337)
random.seed(1337)
tf.random.set_seed(1337)

## Read in and Clean Data

In [None]:
# Download the resource bundle and extract it into the working directory.
# The timeout keeps a stalled connection from hanging the notebook forever.
r = requests.get("http://web.stanford.edu/class/cs21si/resources/unit3_resources.zip",
                 timeout = 60)
# Fail loudly on an HTTP error instead of handing an error page to ZipFile,
# which would otherwise surface as a confusing BadZipFile exception.
r.raise_for_status()
# The context manager closes the archive once extraction is done.
with zipfile.ZipFile(io.BytesIO(r.content)) as z:
    z.extractall()

data = pd.read_csv("unit3_resources/compas-scores.csv", header = 0)

# Columns of the raw CSV that the rest of the notebook works with.
fields_of_interest = ['name', 'sex', 'age', 'race', 'priors_count', 'c_charge_desc', 
                      'v_decile_score', 'decile_score', 'is_violent_recid', 'is_recid']

# Keep only those columns and give the less obvious ones readable names.
data = data[fields_of_interest].rename(columns = {
    'priors_count': 'num_priors',
    'c_charge_desc': 'charge',
    'v_decile_score': 'violence_score',
    'decile_score': 'recidivism_score',
    'is_violent_recid': 'violence_true',
    'is_recid': 'recidivism_true',
})

# Drop every record where any score or outcome carries the -1 "missing" marker.
data = data[(data[['violence_score', 'recidivism_score',
                   'violence_true', 'recidivism_true']] != -1).all(axis = 1)]

# Convert strings to numerical values.
sex_classes = {'Male': 0, 'Female' : 1}

processed_data = data.copy()
# Indexing the dict (rather than .map) raises KeyError on an unexpected label
# instead of silently producing NaN.
processed_data['sex'] = data['sex'].apply(lambda label: sex_classes[label])

# One-hot encode race. The dtype is pinned so the indicator columns are 0/1
# integers on every pandas version (the default became bool in pandas 2.0).
processed_data = pd.get_dummies(processed_data, columns = ['race'], dtype = 'uint8')

# get_dummies appends the race_* columns at the end; move them right after
# name/sex/age so all model inputs sit next to each other. Selecting them by
# prefix avoids hard-coding how many non-race columns precede them.
columns = processed_data.columns.tolist()
columns = (columns[0:3]
           + [c for c in columns if c.startswith('race_')]
           + [c for c in columns[3:] if not c.startswith('race_')])
processed_data = processed_data.reindex(columns = columns)

processed_data.head()

Unnamed: 0,name,sex,age,race_African-American,race_Asian,race_Caucasian,race_Hispanic,race_Native American,race_Other,num_priors,charge,violence_score,recidivism_score,violence_true,recidivism_true
0,miguel hernandez,0,69,0,0,0,0,0,1,0,Aggravated Assault w/Firearm,1,1,0,0
2,kevon dixon,0,34,1,0,0,0,0,0,0,Felony Battery w/Prior Convict,1,3,1,1
3,ed philo,0,24,1,0,0,0,0,0,4,Possession of Cocaine,3,4,0,1
4,marcu brown,0,23,1,0,0,0,0,0,1,Possession of Cannabis,6,8,0,0
5,bouthy pierrelouis,0,43,0,0,0,0,0,1,2,arrest case no charge,1,1,0,0


In [None]:
# Convert pandas dataframe to numpy array for easier processing.
# np.asarray yields the same array as `.values` for a DataFrame, but is a no-op
# when this cell is re-run and `processed_data` is already an ndarray (where
# `.values` would raise AttributeError).
processed_data = np.asarray(processed_data)

## Partition into Train and Test Sets


In [None]:
# Model inputs are columns 1-9 (sex, age, one-hot race, num_priors); the target
# is column 14 (recidivism_true).
X = processed_data[:, 1:10].astype('float32')
y = processed_data[:, 14].astype('float32')

# 80/20 split in row order: the first num_train rows train, the rest test.
num_train = int(math.ceil(len(X) * 0.8))
num_test = int(math.floor(len(X) * 0.2))

X_train, X_test = X[:num_train], X[num_train:]
y_train, y_test = y[:num_train], y[num_train:]

# One-hot encode the binary labels to match the two-unit softmax output.
num_classes = 2
y_train = tf.keras.utils.to_categorical(y_train, num_classes)
y_test = tf.keras.utils.to_categorical(y_test, num_classes)

print(len(X_train), 'records in train set')
print(len(X_test), 'records in test set')
print(len(X), 'records in total')

8822 records in train set
2205 records in test set
11027 records in total


In [None]:
# Sanity check: (number of training rows, number of feature columns).
X_train.shape

(8822, 9)

## Set up our Evaluation Pipeline


In [None]:
def eval(model, verb = 2):
    """Train `model` on the training split and score it on the test split.

    Reads the notebook-level X_train, y_train, X_test, y_test and batch_size.
    Note that this name shadows Python's builtin `eval` inside the notebook.

    Args:
        model: object exposing Keras-style `fit` and `evaluate` methods.
        verb: verbosity level forwarded to `model.fit`.

    Returns:
        Whatever `model.evaluate(X_test, y_test)` returns; callers in this
        notebook unpack it as (loss, accuracy).
    """
    # The last 10% of the training rows are held out for validation; rows are
    # fed in their stored order on every epoch.
    fit_options = dict(epochs = 30,
                       batch_size = batch_size,
                       validation_split = 0.1,
                       verbose = verb,
                       shuffle = False)
    model.fit(X_train, y_train, **fit_options)

    return model.evaluate(X_test, y_test)

In [None]:
# Mini-batch size used when fitting, and number of output classes.
batch_size, num_classes = 64, 2

# Starting learning rate and regularization strength; the evaluation loop
# further down this cell rebinds both names.
learning_rate, reg_strength = 0.002, 0.0001

def nn_classifier(learning_rate, reg_strength, dropout_strength=0.5):
    """Build and compile a fully connected softmax classifier.

    Layer stack: input dropout -> Dense(50, relu) -> BatchNormalization ->
    Dense(100, relu) -> Dense(50, relu) -> Dense(num_classes, softmax).
    The input width is taken from the notebook-level `X` and the output width
    from the notebook-level `num_classes`.

    Args:
        learning_rate: step size for the SGD optimizer.
        reg_strength: L2 penalty coefficient applied to every Dense kernel.
        dropout_strength: fraction of input features dropped during training.

    Returns:
        The compiled Sequential model (categorical cross-entropy loss,
        accuracy metric).
    """
    def dense(units, activation):
        # Every Dense layer carries the L2 penalty so that `reg_strength`
        # actually influences training.
        return Dense(units, activation = activation,
                     kernel_regularizer = regularizers.l2(reg_strength))

    model = Sequential()
    model.add(Dropout(dropout_strength, input_shape = (X.shape[1],)))
    model.add(dense(50, 'relu'))
    model.add(BatchNormalization())
    model.add(dense(100, 'relu'))
    model.add(dense(50, 'relu'))
    model.add(dense(num_classes, 'softmax'))

    # `learning_rate` is the supported keyword; the old `lr` alias triggers a
    # deprecation warning and is rejected by newer Keras releases.
    sgd = tf.keras.optimizers.SGD(learning_rate = learning_rate)
    model.compile(loss = keras.losses.categorical_crossentropy, 
                  optimizer = sgd, metrics=['accuracy'])
    
    return model

# Train and score one model for each (learning rate, regularization) pair.
for learning_rate, reg_strength in [(1e-2, 1e-4)]:
    print("Using learning rate %f and regularization strength %f..." % (learning_rate, reg_strength))
    model = nn_classifier(learning_rate, reg_strength)
    loss, acc = eval(model, verb = 2)
    print('\n\nTest loss:', loss)
    print('Test accuracy:', acc)

Using learning rate 0.010000 and regularization strength 0.000100...
Epoch 1/30


  super(SGD, self).__init__(name, **kwargs)


125/125 - 1s - loss: 0.6361 - accuracy: 0.6555 - val_loss: 0.6309 - val_accuracy: 0.6954 - 1s/epoch - 10ms/step
Epoch 2/30
125/125 - 0s - loss: 0.6188 - accuracy: 0.6706 - val_loss: 0.6379 - val_accuracy: 0.6840 - 270ms/epoch - 2ms/step
Epoch 3/30
125/125 - 0s - loss: 0.6199 - accuracy: 0.6702 - val_loss: 0.6384 - val_accuracy: 0.6818 - 278ms/epoch - 2ms/step
Epoch 4/30
125/125 - 0s - loss: 0.6199 - accuracy: 0.6738 - val_loss: 0.6452 - val_accuracy: 0.6942 - 262ms/epoch - 2ms/step
Epoch 5/30
125/125 - 0s - loss: 0.6206 - accuracy: 0.6690 - val_loss: 0.6435 - val_accuracy: 0.6942 - 253ms/epoch - 2ms/step
Epoch 6/30
125/125 - 0s - loss: 0.6199 - accuracy: 0.6720 - val_loss: 0.6433 - val_accuracy: 0.6931 - 243ms/epoch - 2ms/step
Epoch 7/30
125/125 - 0s - loss: 0.6198 - accuracy: 0.6741 - val_loss: 0.6419 - val_accuracy: 0.6874 - 277ms/epoch - 2ms/step
Epoch 8/30
125/125 - 0s - loss: 0.6186 - accuracy: 0.6717 - val_loss: 0.6584 - val_accuracy: 0.6908 - 272ms/epoch - 2ms/step
Epoch 9/30
12

## Part 2: Hyperparameter Tuning


In [None]:
def _last_val_accuracy(model):
    """Return the final-epoch validation accuracy recorded on `model`, or None.

    Looks at `model.history.history`, the per-epoch metric dict Keras attaches
    to a model after `fit`. Both the 'val_accuracy' and the older 'val_acc'
    key spellings are accepted.
    """
    history = getattr(getattr(model, 'history', None), 'history', None) or {}
    for key in ('val_accuracy', 'val_acc'):
        values = history.get(key)
        if values:
            return values[-1]
    return None


def tune_hyperparams(learning_rates=None, reg_strengths=None):
    """Grid-search learning rate and regularization strength.

    One model is built with `nn_classifier` and trained with `eval` for every
    combination. Models are ranked by validation accuracy, so the test set
    stays untouched by model selection and the printed `val_acc` is accurate.
    If a model exposes no validation history, the accuracy returned by `eval`
    is used instead.

    Args:
        learning_rates: iterable of learning rates to try; defaults to
            [1e-1, 2e-1, 3e-1].
        reg_strengths: iterable of regularization strengths to try; defaults
            to [3e-4, 35e-5, 4e-4].

    Returns:
        (model, accuracy, {"lr": ..., "reg": ...}) for the best combination,
        or (None, None, None) if no model scored above zero.
    """
    # Play with these!
    if learning_rates is None:
        learning_rates = [1e-1, 2e-1, 3e-1]
    if reg_strengths is None:
        reg_strengths = [3e-4, 35e-5, 4e-4]

    best_model = (None, None, None)
    running_best_accuracy = 0

    for lr in learning_rates:
        for reg in reg_strengths:
            model = nn_classifier(lr, reg)
            _, test_acc = eval(model, verb = 0)

            # Rank on the held-out validation split, not on the test score.
            val_acc = _last_val_accuracy(model)
            if val_acc is None:
                val_acc = test_acc

            print('\n val_acc: {:f}, lr: {:f}, reg: {:f}\n'.format(
                    val_acc, lr, reg))

            # Strict comparison: on a tie the earlier combination is kept.
            if val_acc > running_best_accuracy:
                best_model = (model, val_acc, {"lr": lr, "reg": reg})
                running_best_accuracy = val_acc

    return best_model
        
# Run the grid search. The result is a (model, accuracy, hyperparameter dict)
# tuple, so index 1 is the winning accuracy and index 2 its lr/reg settings.
best_model = tune_hyperparams()
print("\n\nBest Model Performance: ", best_model[1])
print("Hyperparameters of Best Model: ", best_model[2])

  super(SGD, self).__init__(name, **kwargs)



 val_acc: 0.680272, lr: 0.100000, reg: 0.000300


 val_acc: 0.679365, lr: 0.100000, reg: 0.000350


 val_acc: 0.683900, lr: 0.100000, reg: 0.000400


 val_acc: 0.559637, lr: 0.200000, reg: 0.000300


 val_acc: 0.676644, lr: 0.200000, reg: 0.000350


 val_acc: 0.597732, lr: 0.200000, reg: 0.000400


 val_acc: 0.682993, lr: 0.300000, reg: 0.000300


 val_acc: 0.678005, lr: 0.300000, reg: 0.000350


 val_acc: 0.683447, lr: 0.300000, reg: 0.000400



Best Model Performance:  0.6839002370834351
Hyperparameters of Best Model:  {'lr': 0.1, 'reg': 0.0004}
