# Recidivision Prediction: Machine Learning Optimization

## Importing Libraries and Dataset

In [1]:
# Import Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier
import tensorflow as tf

In [2]:
# Reading the clean data csv file
df = pd.read_csv('Resources/Recidivism_Cleaned.csv', index_col=0)
df.head()

Unnamed: 0,Race_Asian,Race_Black,Race_Hispanic,Race_Native,Race_White,Age_25-34,Age_35-44,Age_45-54,Age_55 and Older,Age_Under 25,...,Offense Type_Property,Offense Type_Public Order,Offense Type_Violent,Release Type_Discharged End of Sentence,Release Type_Other,Release Type_Parole,Release Type_Paroled to Detainer,Release Type_Special Sentence,Year Released,Recidivism
0,0,0,0,0,1,0,0,0,0,1,...,0,0,1,0,0,1,0,0,2010,1
1,0,0,0,0,1,0,0,0,1,0,...,0,1,0,0,0,1,0,0,2010,1
2,0,0,0,0,1,1,0,0,0,0,...,1,0,0,0,0,1,0,0,2010,1
3,0,0,0,0,1,0,0,0,1,0,...,0,0,0,0,0,1,0,0,2010,1
4,0,1,0,0,0,1,0,0,0,0,...,0,0,0,0,0,1,0,0,2010,1


## Machine Learning: Random Forest

### Splitting Data and Scaling

In [4]:
# Split our preprocessed data into our features and target arrays, also dropped Year Released to make the model more general
y = df.Recidivism
X = df.drop(columns=["Recidivism"])

# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=21, stratify=y, test_size=0.2)

In [5]:
# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

### Model

In [6]:
# Create a random forest classifier
rf_model = RandomForestClassifier(n_estimators=500, random_state=78)

In [7]:
# Fitting the model
rf_model = rf_model.fit(X_train_scaled, y_train)

In [8]:
# Making predictions using the testing data
predictions = rf_model.predict(X_test_scaled)

In [9]:
# Calculating the confusion matrix.
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,2942,519
Actual 1,1328,408


In [10]:
# Calculating the accuracy score.
acc_score = accuracy_score(y_test, predictions)

In [11]:
# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,2942,519
Actual 1,1328,408


Accuracy Score : 0.6446026553781028
Classification Report
              precision    recall  f1-score   support

           0       0.69      0.85      0.76      3461
           1       0.44      0.24      0.31      1736

    accuracy                           0.64      5197
   macro avg       0.56      0.54      0.53      5197
weighted avg       0.61      0.64      0.61      5197



In [12]:
# Calculate feature importance in the Random Forest model.
importances = rf_model.feature_importances_
importances

array([9.14098472e-03, 3.74122409e-02, 2.57830948e-02, 1.99667664e-02,
       3.95307398e-02, 3.47762090e-02, 3.18477427e-02, 2.82668340e-02,
       2.52225920e-02, 3.36013058e-02, 2.06055823e-04, 3.21966958e-02,
       1.75658924e-02, 3.00930207e-02, 3.35878114e-02, 6.43390390e-03,
       1.49705236e-02, 1.17258617e-03, 6.95077283e-04, 1.55480661e-05,
       1.70648793e-05, 6.88104941e-03, 1.00760368e-04, 1.48603869e-05,
       5.51059385e-03, 2.78367659e-02, 1.77859444e-02, 2.82166985e-02,
       2.22474747e-02, 3.33974027e-02, 3.01605959e-02, 1.90943516e-02,
       4.16146508e-02, 8.10411306e-03, 2.14676034e-02, 3.15064445e-01])

In [13]:
# We can sort the features by their importance.
sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)

[(0.31506444493442526, 'Year Released'),
 (0.041614650800273593, 'Release Type_Parole'),
 (0.03953073978985592, 'Race_White'),
 (0.03741224094652659, 'Race_Black'),
 (0.03477620896912591, 'Age_25-34'),
 (0.033601305754229205, 'Age_Under 25'),
 (0.03358781141985302, 'Offense Classification_D Felony'),
 (0.0333974027153254, 'Offense Type_Violent'),
 (0.03219669581626255, 'Offense Classification_Aggravated Misdemeanor'),
 (0.031847742667953254, 'Age_35-44'),
 (0.03016059591665257, 'Release Type_Discharged End of Sentence'),
 (0.030093020651878688, 'Offense Classification_C Felony'),
 (0.028266833973299033, 'Age_45-54'),
 (0.028216698486691085, 'Offense Type_Property'),
 (0.02783676591343954, 'Offense Type_Drug'),
 (0.02578309479717107, 'Race_Hispanic'),
 (0.025222592035642175, 'Age_55 and Older'),
 (0.022247474687413244, 'Offense Type_Public Order'),
 (0.0214676034431688, 'Release Type_Special Sentence'),
 (0.019966766392880992, 'Race_Native'),
 (0.019094351607100293, 'Release Type_Other'