# Recidivism Prediction: Machine Learning Optimization

## Importing Libraries and Dataset

In [14]:
# Import Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier
import tensorflow as tf

In [3]:
# Load the cleaned recidivism data; the first CSV column becomes the index.
cleaned_csv_path = 'Resources/Recidivism_Cleaned.csv'
df = pd.read_csv(cleaned_csv_path, index_col=0)

# Preview the first five rows.
df.head()

Unnamed: 0,Race_Asian,Race_Black,Race_Hispanic,Race_Native,Race_White,Age_25-34,Age_35-44,Age_45-54,Age_55 and Older,Age_Under 25,...,Offense Type_Property,Offense Type_Public Order,Offense Type_Violent,Release Type_Discharged End of Sentence,Release Type_Other,Release Type_Parole,Release Type_Paroled to Detainer,Release Type_Special Sentence,Year Released,Recidivism
0,0,0,0,0,1,0,0,0,0,1,...,0,0,1,0,0,1,0,0,2010,1
1,0,0,0,0,1,0,0,0,1,0,...,0,1,0,0,0,1,0,0,2010,1
2,0,0,0,0,1,1,0,0,0,0,...,1,0,0,0,0,1,0,0,2010,1
3,0,0,0,0,1,0,0,0,1,0,...,0,0,0,0,0,1,0,0,2010,1
4,0,1,0,0,0,1,0,0,0,0,...,0,0,0,0,0,1,0,0,2010,1


In [43]:
# Show the column labels of the loaded DataFrame (displayed as the cell's
# last expression) so the one-hot encoded feature names can be inspected.
df.columns

Index(['Race_Asian', 'Race_Black', 'Race_Hispanic', 'Race_Native',
       'Race_White', 'Age_25-34', 'Age_35-44', 'Age_45-54', 'Age_55 and Older',
       'Age_Under 25', 'Offense Classification_A Felony',
       'Offense Classification_Aggravated Misdemeanor',
       'Offense Classification_B Felony', 'Offense Classification_C Felony',
       'Offense Classification_D Felony',
       'Offense Classification_Felony - Enhanced',
       'Offense Classification_Felony - Enhancement to Original Penalty',
       'Offense Classification_Felony - Mandatory Minimum',
       'Offense Classification_Other Felony',
       'Offense Classification_Other Felony (Old Code)',
       'Offense Classification_Other Misdemeanor',
       'Offense Classification_Serious Misdemeanor',
       'Offense Classification_Sexual Predator Community Supervision',
       'Offense Classification_Simple Misdemeanor',
       'Offense Classification_Special Sentence 2005', 'Offense Type_Drug',
       'Offense Type_Other', 

In [44]:
# Drop the Offense Classification columns and the Year Released column.
# Year Released: removing it lets the model generalise instead of being tied to a year.
# Offense Classification: this is a classification assigned by the court, and on a
# per-prisoner basis it might not contribute much to a convict's likelihood of
# recommitting a crime.
columns_to_drop = [
    'Year Released',
    'Offense Classification_A Felony',
    'Offense Classification_Aggravated Misdemeanor',
    'Offense Classification_B Felony',
    'Offense Classification_C Felony',
    'Offense Classification_D Felony',
    'Offense Classification_Felony - Enhanced',
    'Offense Classification_Felony - Enhancement to Original Penalty',
    'Offense Classification_Felony - Mandatory Minimum',
    'Offense Classification_Other Felony',
    'Offense Classification_Other Felony (Old Code)',
    'Offense Classification_Other Misdemeanor',
    'Offense Classification_Serious Misdemeanor',
    'Offense Classification_Sexual Predator Community Supervision',
    'Offense Classification_Simple Misdemeanor',
    'Offense Classification_Special Sentence 2005',
]

# Reassign rather than mutate in place. errors="ignore" skips columns that are
# already gone, so re-running this cell does not raise a KeyError.
df = df.drop(columns=columns_to_drop, errors="ignore")

## Machine Learning: Random Forest

### Splitting Data and Scaling

In [46]:
# Separate the preprocessed data into the feature matrix (X) and the target
# vector (y). "Recidivism" is the label being predicted.
target_column = "Recidivism"
X = df.drop(columns=[target_column])
y = df[target_column]

# Hold out 20% of the rows for testing, stratified on the target, with a
# fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=21,
)

In [47]:
# Create a StandardScaler instance.
scaler = StandardScaler()

# Fit the scaler on the training features only; fit() returns the fitted scaler.
X_scaler = scaler.fit(X_train)

# Apply the fitted scaler to both the training and the testing features.
X_train_scaled, X_test_scaled = (
    X_scaler.transform(features) for features in (X_train, X_test)
)

### Model

In [48]:
# Build a random forest classifier with 500 trees and a fixed random seed.
rf_model = RandomForestClassifier(
    n_estimators=500,
    random_state=78,
)

In [49]:
# Train the forest on the scaled training features and their labels.
rf_model = rf_model.fit(X=X_train_scaled, y=y_train)

In [50]:
# Predict labels for the scaled held-out test features.
predictions = rf_model.predict(X=X_test_scaled)

In [51]:
# Compute the confusion matrix of true test labels against predicted labels.
cm = confusion_matrix(y_true=y_test, y_pred=predictions)

# Wrap the matrix in a labelled DataFrame: rows are actual classes, columns
# are predicted classes.
actual_labels = ["Actual 0", "Actual 1"]
predicted_labels = ["Predicted 0", "Predicted 1"]
cm_df = pd.DataFrame(cm, index=actual_labels, columns=predicted_labels)

cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,3340,121
Actual 1,1635,101


In [52]:
# Fraction of test samples whose predicted label matches the true label.
acc_score = accuracy_score(y_true=y_test, y_pred=predictions)

In [53]:
# Show the evaluation summary: confusion matrix, accuracy, then the
# per-class precision / recall / f1 report.
report_text = classification_report(y_test, predictions)

print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(report_text)

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,3340,121
Actual 1,1635,101


Accuracy Score : 0.6621127573600154
Classification Report
              precision    recall  f1-score   support

           0       0.67      0.97      0.79      3461
           1       0.45      0.06      0.10      1736

    accuracy                           0.66      5197
   macro avg       0.56      0.51      0.45      5197
weighted avg       0.60      0.66      0.56      5197



In [54]:
# Read the per-feature importance scores from the fitted random forest.
# The array has one entry per feature column, in the same order as X.columns.
importances = rf_model.feature_importances_
# Display the raw array as the cell output.
importances

array([0.01646297, 0.04142832, 0.05478925, 0.03559421, 0.04527872,
       0.04444153, 0.03505779, 0.04461293, 0.06043962, 0.05530215,
       0.03692933, 0.03897954, 0.03825366, 0.0388803 , 0.07043425,
       0.08481628, 0.06094647, 0.11459565, 0.01992377, 0.06283325])

In [55]:
# Pair each importance score with its feature name, then order the pairs
# from most to least important (ties fall back to the feature name).
ranked_features = list(zip(rf_model.feature_importances_, X.columns))
ranked_features.sort(reverse=True)
ranked_features

[(0.11459565473352971, 'Release Type_Parole'),
 (0.08481627772250461, 'Release Type_Discharged End of Sentence'),
 (0.0704342468431495, 'Offense Type_Violent'),
 (0.06283325374084335, 'Release Type_Special Sentence'),
 (0.06094647375728804, 'Release Type_Other'),
 (0.060439620594062674, 'Age_55 and Older'),
 (0.05530214870232926, 'Age_Under 25'),
 (0.05478925453270331, 'Race_Hispanic'),
 (0.04527872375673427, 'Race_White'),
 (0.04461293113608317, 'Age_45-54'),
 (0.04444153379324165, 'Age_25-34'),
 (0.04142831780211931, 'Race_Black'),
 (0.03897953789857107, 'Offense Type_Other'),
 (0.038880296326915045, 'Offense Type_Public Order'),
 (0.038253658972202106, 'Offense Type_Property'),
 (0.03692933212284219, 'Offense Type_Drug'),
 (0.0355942106526397, 'Race_Native'),
 (0.03505778801596355, 'Age_35-44'),
 (0.019923773694763974, 'Release Type_Paroled to Detainer'),
 (0.016462965201513594, 'Race_Asian')]