# Recidivism Prediction: Machine Learning Optimization

## Importing Libraries and Dataset

In [1]:
# Import Libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier
import tensorflow as tf

In [2]:
# Load the cleaned dataset; the first CSV column is used as the row index
df = pd.read_csv(
    "Resources/Recidivism_Cleaned.csv",
    index_col=0,
)
# Preview the first rows
df.head()

Unnamed: 0,Race_Asian,Race_Black,Race_Hispanic,Race_Native,Race_White,Age_25-34,Age_35-44,Age_45-54,Age_55 and Older,Age_Under 25,...,Offense Type_Property,Offense Type_Public Order,Offense Type_Violent,Release Type_Discharged End of Sentence,Release Type_Other,Release Type_Parole,Release Type_Paroled to Detainer,Release Type_Special Sentence,Year Released,Recidivism
0,0,0,0,0,1,0,0,0,0,1,...,0,0,1,0,0,1,0,0,2010,1
1,0,0,0,0,1,0,0,0,1,0,...,0,1,0,0,0,1,0,0,2010,1
2,0,0,0,0,1,1,0,0,0,0,...,1,0,0,0,0,1,0,0,2010,1
3,0,0,0,0,1,0,0,0,1,0,...,0,0,0,0,0,1,0,0,2010,1
4,0,1,0,0,0,1,0,0,0,0,...,0,0,0,0,0,1,0,0,2010,1


In [3]:
# List every column label of the loaded DataFrame
df.columns

Index(['Race_Asian', 'Race_Black', 'Race_Hispanic', 'Race_Native',
       'Race_White', 'Age_25-34', 'Age_35-44', 'Age_45-54', 'Age_55 and Older',
       'Age_Under 25', 'Offense Classification_A Felony',
       'Offense Classification_Aggravated Misdemeanor',
       'Offense Classification_B Felony', 'Offense Classification_C Felony',
       'Offense Classification_D Felony',
       'Offense Classification_Felony - Enhanced',
       'Offense Classification_Felony - Enhancement to Original Penalty',
       'Offense Classification_Felony - Mandatory Minimum',
       'Offense Classification_Other Felony',
       'Offense Classification_Other Felony (Old Code)',
       'Offense Classification_Other Misdemeanor',
       'Offense Classification_Serious Misdemeanor',
       'Offense Classification_Sexual Predator Community Supervision',
       'Offense Classification_Simple Misdemeanor',
       'Offense Classification_Special Sentence 2005', 'Offense Type_Drug',
       'Offense Type_Other', 

In [4]:
# Feature importances checked in the Random Forest Classifier showed that Year Released
# contributes a lot, while the offence classification types generally contributed the least.
#
# To help optimize the model, Year Released is dropped: it could be overfitting the data
# and may not generalize to points outside of the years present in the dataset.
#
# The result is reassigned (instead of dropping in place) and errors='ignore' is used so
# that this cell is idempotent: re-running it after the column is already gone does not
# raise a KeyError.
df = df.drop(columns=['Year Released'], errors='ignore')

## Machine Learning: Neural Network

### Splitting Data and Scaling

In [5]:
# Separate the feature matrix from the target vector
X = df.drop(columns=["Recidivism"]).values
y = df["Recidivism"].values

# Hold out 20% of the rows as a test set, stratified on the target
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=21,
)

In [6]:
# Create a StandardScaler instance and fit it on the training features only
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

# Apply the fitted scaler to both the training and the testing features
X_train_scaled, X_test_scaled = (
    X_scaler.transform(split) for split in (X_train, X_test)
)

In [7]:
# Seed the Python, NumPy and TensorFlow random generators so that weight initialization
# and training are repeatable between runs (same value as the train/test split's
# random_state). Some low-level ops, e.g. on GPU, may still introduce small differences.
tf.keras.utils.set_random_seed(21)

# Define the model - deep neural net, i.e., the number of input features and hidden nodes for each layer.
# The input width is the number of columns of the training feature matrix.
number_input_features = X_train.shape[1]
hidden_nodes_layer1 = 80
hidden_nodes_layer2 = 40
hidden_nodes_layer3 = 20

nn_1 = tf.keras.models.Sequential()

# First hidden layer (also declares the input dimension)
nn_1.add(tf.keras.layers.Dense(units=hidden_nodes_layer1, input_dim=number_input_features, activation="sigmoid"))

# Second hidden layer
nn_1.add(tf.keras.layers.Dense(units=hidden_nodes_layer2, activation="sigmoid"))

# Third hidden layer
nn_1.add(tf.keras.layers.Dense(units=hidden_nodes_layer3, activation="sigmoid"))

# Output layer: a single sigmoid unit for the binary Recidivism target
nn_1.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# Check the structure of the model
nn_1.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 80)                2880      
                                                                 
 dense_1 (Dense)             (None, 40)                3240      
                                                                 
 dense_2 (Dense)             (None, 20)                820       
                                                                 
 dense_3 (Dense)             (None, 1)                 21        
                                                                 
Total params: 6,961
Trainable params: 6,961
Non-trainable params: 0
_________________________________________________________________


In [8]:
# Configure training: binary cross-entropy loss, Adam optimizer, accuracy as the reported metric
nn_1.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

In [9]:
# Fit the network on the scaled training features for 120 epochs and keep the returned history
fit_model = nn_1.fit(
    x=X_train_scaled,
    y=y_train,
    epochs=120,
)

Epoch 1/120
Epoch 2/120
Epoch 3/120
Epoch 4/120
Epoch 5/120
Epoch 6/120
Epoch 7/120
Epoch 8/120
Epoch 9/120
Epoch 10/120
Epoch 11/120
Epoch 12/120
Epoch 13/120
Epoch 14/120
Epoch 15/120
Epoch 16/120
Epoch 17/120
Epoch 18/120
Epoch 19/120
Epoch 20/120
Epoch 21/120
Epoch 22/120
Epoch 23/120
Epoch 24/120
Epoch 25/120
Epoch 26/120
Epoch 27/120
Epoch 28/120
Epoch 29/120
Epoch 30/120
Epoch 31/120
Epoch 32/120
Epoch 33/120
Epoch 34/120
Epoch 35/120
Epoch 36/120
Epoch 37/120
Epoch 38/120
Epoch 39/120
Epoch 40/120
Epoch 41/120
Epoch 42/120
Epoch 43/120
Epoch 44/120
Epoch 45/120
Epoch 46/120
Epoch 47/120
Epoch 48/120
Epoch 49/120
Epoch 50/120
Epoch 51/120
Epoch 52/120
Epoch 53/120
Epoch 54/120
Epoch 55/120
Epoch 56/120
Epoch 57/120
Epoch 58/120
Epoch 59/120
Epoch 60/120
Epoch 61/120
Epoch 62/120
Epoch 63/120
Epoch 64/120
Epoch 65/120
Epoch 66/120
Epoch 67/120
Epoch 68/120
Epoch 69/120
Epoch 70/120
Epoch 71/120
Epoch 72/120
Epoch 73/120
Epoch 74/120
Epoch 75/120
Epoch 76/120
Epoch 77/120
Epoch 78

Epoch 82/120
Epoch 83/120
Epoch 84/120
Epoch 85/120
Epoch 86/120
Epoch 87/120
Epoch 88/120
Epoch 89/120
Epoch 90/120
Epoch 91/120
Epoch 92/120
Epoch 93/120
Epoch 94/120
Epoch 95/120
Epoch 96/120
Epoch 97/120
Epoch 98/120
Epoch 99/120
Epoch 100/120
Epoch 101/120
Epoch 102/120
Epoch 103/120
Epoch 104/120
Epoch 105/120
Epoch 106/120
Epoch 107/120
Epoch 108/120
Epoch 109/120
Epoch 110/120
Epoch 111/120
Epoch 112/120
Epoch 113/120
Epoch 114/120
Epoch 115/120
Epoch 116/120
Epoch 117/120
Epoch 118/120
Epoch 119/120
Epoch 120/120


In [10]:
# Score the trained network on the scaled test split and report loss and accuracy
model_loss, model_accuracy = nn_1.evaluate(
    X_test_scaled,
    y_test,
    verbose=2,
)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

163/163 - 0s - loss: 0.6361 - accuracy: 0.6663 - 256ms/epoch - 2ms/step
Loss: 0.6361142992973328, Accuracy: 0.6663459539413452
