# Recidivism Prediction

## Importing Libraries and Dataset

In [25]:
# Import Libraries
import os
from pathlib import Path

import pandas as pd
import psycopg2 as pg
import pymysql
from sklearn import tree
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sqlalchemy import create_engine

## Database Connection

In [2]:
# Connecting to Amazon RDS Database
# Connection settings are read from environment variables so that no secret is
# stored in the notebook. RDS_PASSWORD is required (a KeyError is raised when it
# is unset); the other settings fall back to the values this notebook used before.
# NOTE(review): the password that was previously committed here should be rotated.
# `engine` is a psycopg2 connection object (not a SQLAlchemy engine); it is passed
# as `con=` to pd.read_sql_query in the cells that follow.
engine = pg.connect(
    host=os.environ.get(
        "RDS_HOST", "database-1.clf82frcjuur.ca-central-1.rds.amazonaws.com"
    ),
    dbname=os.environ.get("RDS_DBNAME", "postgres"),
    user=os.environ.get("RDS_USER", "postgres"),
    password=os.environ["RDS_PASSWORD"],
    port=os.environ.get("RDS_PORT", "5432"),
)

In [3]:
# Load every row and column of the PRISONERS table into a DataFrame
prisoner_df = pd.read_sql_query(
    sql='SELECT * FROM PRISONERS',
    con=engine,
)

In [4]:
# Preview the first five rows of prisoner_df
prisoner_df.head(n=5)

Unnamed: 0,ID,Year Released,Release Type,Age,Race,Sex
0,1,2010,Parole,25-34,Black,Male
1,2,2010,Discharged – End of Sentence,25-34,White,Male
2,3,2010,Parole,35-44,White,Male
3,4,2010,Parole,25-34,White,Male
4,5,2010,Discharged – End of Sentence,35-44,Black,Male


In [5]:
# Load every row and column of the OFFENSE table into a DataFrame
offense_df = pd.read_sql_query(
    sql='SELECT * FROM OFFENSE',
    con=engine,
)

In [6]:
# Preview the first five rows of offense_df
offense_df.head(n=5)

Unnamed: 0,ID,Offense Classification,Offense Type
0,1,C Felony,Violent
1,2,D Felony,Property
2,3,B Felony,Drug
3,4,B Felony,Other
4,5,D Felony,Violent


In [7]:
# Load every row and column of the RECIDIVISM table into a DataFrame
recidivism_df = pd.read_sql_query(
    sql='SELECT * FROM RECIDIVISM',
    con=engine,
)

In [8]:
# Preview the first five rows of recidivism_df
recidivism_df.head(n=5)

Unnamed: 0,ID,Days to Return,Recidivism Type,New Offense Classification,New Offense Type,Return to Prison
0,1,433.0,New,C Felony,Drug,Yes
1,2,453.0,Tech,,,Yes
2,3,832.0,Tech,,,Yes
3,4,,No Recidivism,,,No
4,5,116.0,Tech,,,Yes


### Combining Tables to One Dataframe for ML

Columns describing the new offense committed (everything in the RECIDIVISM table except Return to Prison) were removed. Offense Subtype was also not included, as it would have created far too many features for the machine learning model. Prisoner ID was also dropped because it is a unique identifier column.

In [9]:
# Join PRISONERS, OFFENSE and RECIDIVISM on "ID" and keep only the columns used
# for modelling. The adjacent string literals below concatenate into a single
# SQL statement.
df = pd.read_sql_query(
    sql=(
        'SELECT p."Year Released", p."Race", p."Age", '
        'o."Offense Classification", o."Offense Type", '
        'p."Release Type", r."Return to Prison" '
        'from PRISONERS p '
        'INNER JOIN OFFENSE o on p."ID" = o."ID" '
        'INNER JOIN RECIDIVISM r on p."ID" = r."ID" ;'
    ),
    con=engine,
)

In [10]:
# Preview the first five rows of the joined DataFrame
df.head(n=5)

Unnamed: 0,Year Released,Race,Age,Offense Classification,Offense Type,Release Type,Return to Prison
0,2010,Black,25-34,C Felony,Violent,Parole,Yes
1,2010,White,25-34,D Felony,Property,Discharged – End of Sentence,Yes
2,2010,White,35-44,B Felony,Drug,Parole,Yes
3,2010,White,25-34,B Felony,Other,Parole,No
4,2010,Black,35-44,D Felony,Violent,Discharged – End of Sentence,Yes


## Exploring Dataset

In [11]:
# Inspect the dtype pandas assigned to each column of the joined DataFrame
df.dtypes

Year Released              int64
Race                      object
Age                       object
Offense Classification    object
Offense Type              object
Release Type              object
Return to Prison          object
dtype: object

In [12]:
# Count how many rows fall into each "Return to Prison" value (class balance of the target)
df.loc[:, 'Return to Prison'].value_counts()

No     17304
Yes     8681
Name: Return to Prison, dtype: int64

## Data Preprocessing

From Segment_2_Michael/Recidivism_Cleaning.ipynb

In [13]:
# Replacing categorical variables with dummies
# "Year Released" (numeric) and "Return to Prison" (the target) are left out of
# the encoding here and are re-attached in later cells.
# dtype=int pins the indicator columns to 0/1 integers: pandas >= 2.0 returns
# booleans by default, which would not match the 0/1 output recorded below.
encoded_df = pd.get_dummies(
    df.drop(columns=["Year Released", "Return to Prison"]),
    dtype=int,
)
encoded_df.head()

Unnamed: 0,Race_Asian,Race_Black,Race_Hispanic,Race_Native,Race_White,Age_25-34,Age_35-44,Age_45-54,Age_55 and Older,Age_Under 25,...,Offense Type_Violent,Release Type_Discharged - Expiration of Sentence,Release Type_Discharged – End of Sentence,Release Type_Other,Release Type_Parole,Release Type_Parole Granted,Release Type_Paroled to Detainer,Release Type_Paroled w/Immediate Discharge,Release Type_Released to Special Sentence,Release Type_Special Sentence
0,0,1,0,0,0,1,0,0,0,0,...,1,0,0,0,1,0,0,0,0,0
1,0,0,0,0,1,1,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
2,0,0,0,0,1,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,0
3,0,0,0,0,1,1,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
4,0,1,0,0,0,0,1,0,0,0,...,1,0,1,0,0,0,0,0,0,0


In [14]:
# Re-attach the numeric "Year Released" column, which was left out of the dummy encoding
encoded_df = encoded_df.assign(**{'Year Released': df['Year Released']})
encoded_df.head(n=5)

Unnamed: 0,Race_Asian,Race_Black,Race_Hispanic,Race_Native,Race_White,Age_25-34,Age_35-44,Age_45-54,Age_55 and Older,Age_Under 25,...,Release Type_Discharged - Expiration of Sentence,Release Type_Discharged – End of Sentence,Release Type_Other,Release Type_Parole,Release Type_Parole Granted,Release Type_Paroled to Detainer,Release Type_Paroled w/Immediate Discharge,Release Type_Released to Special Sentence,Release Type_Special Sentence,Year Released
0,0,1,0,0,0,1,0,0,0,0,...,0,0,0,1,0,0,0,0,0,2010
1,0,0,0,0,1,1,0,0,0,0,...,0,1,0,0,0,0,0,0,0,2010
2,0,0,0,0,1,0,1,0,0,0,...,0,0,0,1,0,0,0,0,0,2010
3,0,0,0,0,1,1,0,0,0,0,...,0,0,0,1,0,0,0,0,0,2010
4,0,1,0,0,0,0,1,0,0,0,...,0,1,0,0,0,0,0,0,0,2010


In [15]:
# Re-attach the target column, encoding "Yes" as 1 and "No" as 0
encoded_df['Return to Prison'] = df['Return to Prison'].map(dict(Yes=1, No=0))
encoded_df.head(n=5)

Unnamed: 0,Race_Asian,Race_Black,Race_Hispanic,Race_Native,Race_White,Age_25-34,Age_35-44,Age_45-54,Age_55 and Older,Age_Under 25,...,Release Type_Discharged – End of Sentence,Release Type_Other,Release Type_Parole,Release Type_Parole Granted,Release Type_Paroled to Detainer,Release Type_Paroled w/Immediate Discharge,Release Type_Released to Special Sentence,Release Type_Special Sentence,Year Released,Return to Prison
0,0,1,0,0,0,1,0,0,0,0,...,0,0,1,0,0,0,0,0,2010,1
1,0,0,0,0,1,1,0,0,0,0,...,1,0,0,0,0,0,0,0,2010,1
2,0,0,0,0,1,0,1,0,0,0,...,0,0,1,0,0,0,0,0,2010,1
3,0,0,0,0,1,1,0,0,0,0,...,0,0,1,0,0,0,0,0,2010,0
4,0,1,0,0,0,0,1,0,0,0,...,1,0,0,0,0,0,0,0,2010,1


In [16]:
# Sanity check: the 0/1 counts should match the earlier "No"/"Yes" counts
encoded_df.loc[:, 'Return to Prison'].value_counts()

0    17304
1     8681
Name: Return to Prison, dtype: int64

## Machine Learning

### Setting Target and Feature Variables

In [17]:
# Target variable: the "Return to Prison" column as an (n_rows, 1) NumPy array
y = encoded_df[["Return to Prison"]].to_numpy()

In [18]:
# Features: every column of encoded_df except the target.
# drop() returns a new DataFrame, so encoded_df itself is left unchanged.
X = encoded_df.drop(columns="Return to Prison")
X.head(n=5)

Unnamed: 0,Race_Asian,Race_Black,Race_Hispanic,Race_Native,Race_White,Age_25-34,Age_35-44,Age_45-54,Age_55 and Older,Age_Under 25,...,Release Type_Discharged - Expiration of Sentence,Release Type_Discharged – End of Sentence,Release Type_Other,Release Type_Parole,Release Type_Parole Granted,Release Type_Paroled to Detainer,Release Type_Paroled w/Immediate Discharge,Release Type_Released to Special Sentence,Release Type_Special Sentence,Year Released
0,0,1,0,0,0,1,0,0,0,0,...,0,0,0,1,0,0,0,0,0,2010
1,0,0,0,0,1,1,0,0,0,0,...,0,1,0,0,0,0,0,0,0,2010
2,0,0,0,0,1,0,1,0,0,0,...,0,0,0,1,0,0,0,0,0,2010
3,0,0,0,0,1,1,0,0,0,0,...,0,0,0,1,0,0,0,0,0,2010
4,0,1,0,0,0,0,1,0,0,0,...,0,1,0,0,0,0,0,0,0,2010


### Splitting Training and Testing Data

In [19]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=21,
)

In [20]:
# Print the shape of each split, one per line: X_train, X_test, y_train, y_test
print(
    *(part.shape for part in (X_train, X_test, y_train, y_test)),
    sep="\n",
)

(20788, 40)
(5197, 40)
(20788, 1)
(5197, 1)


In [21]:
# Instantiate the scaler with its default settings spelled out explicitly
scaler = StandardScaler(copy=True, with_mean=True, with_std=True)

In [22]:
# Fit the StandardScaler on the training features only
X_scaler = scaler.fit(X=X_train)

In [23]:
# Apply the scaler fitted on the training data to both the train and test features
X_train_scaled, X_test_scaled = (
    X_scaler.transform(part) for part in (X_train, X_test)
)

### Fitting the Decision Tree Model

In [26]:
# Creating the decision tree classifier instance
# random_state is fixed so that re-running the notebook builds the same tree and
# reproduces the same metrics; scikit-learn's tree construction is randomized
# when no seed is given. The seed matches the one used for train_test_split.
model = tree.DecisionTreeClassifier(random_state=21)

In [27]:
# Train the classifier on the scaled training features; fit() returns the fitted estimator
model = model.fit(X=X_train_scaled, y=y_train)

### Making Predictions Using the Model

In [28]:
# Predict the class label for every row of the scaled test set
predictions = model.predict(X=X_test_scaled)

### Model Evaluation

In [29]:
# Confusion matrix of the test-set predictions
cm = confusion_matrix(y_true=y_test, y_pred=predictions)

# Wrap it in a labelled DataFrame: rows are the actual class, columns the predicted class
cm_df = pd.DataFrame(
    data=cm,
    index=["Actual 0", "Actual 1"],
    columns=["Predicted 0", "Predicted 1"],
)

# Fraction of test rows that were classified correctly
acc_score = accuracy_score(y_true=y_test, y_pred=predictions)

In [30]:
# Show the confusion matrix (rich display), then the accuracy and the per-class report
print("Confusion Matrix")
display(cm_df)
print(
    f"Accuracy Score : {acc_score}",
    "Classification Report",
    classification_report(y_test, predictions),
    sep="\n",
)

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,2990,503
Actual 1,1318,386


Accuracy Score : 0.6496055416586493
Classification Report
              precision    recall  f1-score   support

           0       0.69      0.86      0.77      3493
           1       0.43      0.23      0.30      1704

    accuracy                           0.65      5197
   macro avg       0.56      0.54      0.53      5197
weighted avg       0.61      0.65      0.61      5197

