# Recidivism Prediction

## Importing Libraries and Dataset

In [1]:
# Import Libraries
from pathlib import Path
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sqlalchemy import create_engine
import pymysql
import psycopg2 as pg

## Database Connection

In [2]:
# Connecting to Amazon RDS Database
# Credentials are read from the environment so that no secret is stored in the
# notebook. PGPASSWORD must be set before this cell runs (a KeyError is raised
# otherwise); the other settings fall back to the values this notebook was
# originally written against.
# NOTE(review): the password previously committed in this cell should be
# treated as exposed and rotated.
# NOTE: despite its name, `engine` is a raw psycopg2 connection, not a
# SQLAlchemy engine; the name is kept because later cells pass it as `con`.
import os

engine = pg.connect(
    host=os.environ.get("PGHOST", "database-1.clf82frcjuur.ca-central-1.rds.amazonaws.com"),
    dbname=os.environ.get("PGDATABASE", "postgres"),
    user=os.environ.get("PGUSER", "postgres"),
    password=os.environ["PGPASSWORD"],
    port=os.environ.get("PGPORT", "5432"),
)

In [3]:
# Load every row and column of the PRISONERS table into a DataFrame
prisoners_sql = 'SELECT * FROM PRISONERS'
prisoner_df = pd.read_sql_query(prisoners_sql, con=engine)

In [4]:
# Viewing prisoner_df
# Preview the first rows to check the columns read from PRISONERS
prisoner_df.head()

Unnamed: 0,ID,Year Released,Release Type,Age,Race,Sex
0,12471,2013,Paroled to Detainer,45-54,White,Male
1,12480,2013,Paroled to Detainer,Under 25,White,Male
2,12486,2013,Paroled to Detainer,25-34,White,Male
3,12659,2013,Paroled to Detainer,25-34,White,Male
4,12683,2013,Paroled to Detainer,35-44,White,Male


In [5]:
# Load every row and column of the OFFENSE table into a DataFrame
offense_sql = 'SELECT * FROM OFFENSE'
offense_df = pd.read_sql_query(offense_sql, con=engine)

In [6]:
# Viewing offense_df
# Preview the first rows to check the columns read from OFFENSE
offense_df.head()

Unnamed: 0,ID,Offense Classification,Offense Type,Offense Subtype
0,1,C Felony,Violent,Robbery
1,2,D Felony,Property,Theft
2,3,B Felony,Drug,Trafficking
3,4,B Felony,Other,Other Criminal
4,5,D Felony,Violent,Assault


In [7]:
# Load every row and column of the RECIDIVISM table into a DataFrame
recidivism_sql = 'SELECT * FROM RECIDIVISM'
recidivism_df = pd.read_sql_query(recidivism_sql, con=engine)

In [8]:
# Viewing recidivism_df
# Preview the first rows to check the columns read from RECIDIVISM
recidivism_df.head()

Unnamed: 0,ID,Days to Return,Recidivism Type,New Offense Classification,New Offense Type,New Offense Sub Type,Target Population,Return to Prison
0,1,433.0,New,C Felony,Drug,Trafficking,Yes,Yes
1,2,453.0,Tech,,,,No,Yes
2,3,832.0,Tech,,,,Yes,Yes
3,4,,No Recidivism,,,,Yes,No
4,5,116.0,Tech,,,,No,Yes


### Combining Tables to One Dataframe for ML

Columns describing the new offense committed were removed (everything in the RECIDIVISM table except Return to Prison). Offense Subtype was also excluded, as it would have created far too many features for the machine learning model. Prisoner ID was dropped as well because it is unique to each row.

In [19]:
# Build the modelling DataFrame in the database: join PRISONERS, OFFENSE and
# RECIDIVISM on "ID" and keep only the feature columns plus the target.
# The adjacent string literals concatenate into one single-line SQL statement.
join_sql = (
    'SELECT p."Year Released", p."Race", p."Age", '
    'o."Offense Classification", o."Offense Type", '
    'p."Release Type", r."Return to Prison" '
    'from PRISONERS p '
    'INNER JOIN OFFENSE o on p."ID" = o."ID" '
    'INNER JOIN RECIDIVISM r on p."ID" = r."ID" ;'
)
df = pd.read_sql_query(join_sql, con=engine)

In [21]:
# Preview the first rows of the joined DataFrame
df.head()

Unnamed: 0,Year Released,Race,Age,Offense Classification,Offense Type,Release Type,Return to Prison
0,2013,White,45-54,Felony - Enhancement to Original Penalty,Other,Paroled to Detainer,Yes
1,2013,White,Under 25,D Felony,Drug,Paroled to Detainer,No
2,2013,White,25-34,C Felony,Violent,Paroled to Detainer,No
3,2013,White,25-34,D Felony,Property,Paroled to Detainer,No
4,2013,White,35-44,Aggravated Misdemeanor,Public Order,Paroled to Detainer,Yes


## Exploring Dataset

In [22]:
# Data Types of All Variables in DataFrame
# Shows which columns are numeric and which are object (string) columns
df.dtypes

Year Released              int64
Race                      object
Age                       object
Offense Classification    object
Offense Type              object
Release Type              object
Return to Prison          object
dtype: object

In [23]:
# Count of each target label, to check the class balance
df['Return to Prison'].value_counts()

No     17339
Yes     8681
Name: Return to Prison, dtype: int64

## Data Preprocessing

From Segment_2_Michael/Recidivism_Cleaning.ipynb

In [33]:
# One-hot encode the categorical columns. The numeric year and the target are
# set aside first; later cells add them back to encoded_df.
categorical_df = df.drop(columns=["Year Released", "Return to Prison"])
encoded_df = pd.get_dummies(categorical_df)
encoded_df.head()

Unnamed: 0,Race_Asian,Race_Black,Race_Hispanic,Race_Native,Race_White,Age_25-34,Age_35-44,Age_45-54,Age_55 and Older,Age_Under 25,...,Offense Classification_Sexual Predator Community Supervision,Offense Classification_Simple Misdemeanor,Offense Classification_Special Sentence 2005,Offense Type_Drug,Offense Type_Other,Offense Type_Property,Offense Type_Public Order,Offense Type_Violent,Release Type_Others,Release Type_Paroled to Detainer
0,0,0,0,0,1,0,0,1,0,0,...,0,0,0,0,1,0,0,0,0,1
1,0,0,0,0,1,0,0,0,0,1,...,0,0,0,1,0,0,0,0,0,1
2,0,0,0,0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,1
3,0,0,0,0,1,1,0,0,0,0,...,0,0,0,0,0,1,0,0,0,1
4,0,0,0,0,1,0,1,0,0,0,...,0,0,0,0,0,0,1,0,0,1


In [34]:
# Add back the "Year Released" column, which was excluded before one-hot encoding
# Note: this assigns into the existing encoded_df rather than creating a new frame
encoded_df['Year Released'] = df['Year Released']
encoded_df.head()

Unnamed: 0,Race_Asian,Race_Black,Race_Hispanic,Race_Native,Race_White,Age_25-34,Age_35-44,Age_45-54,Age_55 and Older,Age_Under 25,...,Offense Classification_Simple Misdemeanor,Offense Classification_Special Sentence 2005,Offense Type_Drug,Offense Type_Other,Offense Type_Property,Offense Type_Public Order,Offense Type_Violent,Release Type_Others,Release Type_Paroled to Detainer,Year Released
0,0,0,0,0,1,0,0,1,0,0,...,0,0,0,1,0,0,0,0,1,2013
1,0,0,0,0,1,0,0,0,0,1,...,0,0,1,0,0,0,0,0,1,2013
2,0,0,0,0,1,1,0,0,0,0,...,0,0,0,0,0,0,1,0,1,2013
3,0,0,0,0,1,1,0,0,0,0,...,0,0,0,0,1,0,0,0,1,2013
4,0,0,0,0,1,0,1,0,0,0,...,0,0,0,0,0,1,0,0,1,2013


In [35]:
# Add back "Return to Prison" column but with 1 as Yes and 0 as No
target_codes = {'Yes': 1, 'No': 0}
encoded_target = df['Return to Prison'].map(target_codes)

# Series.map turns any label that is missing from the mapping (or null) into
# NaN. Fail loudly here instead of letting NaN leak into the target.
unmapped_mask = encoded_target.isna()
if unmapped_mask.any():
    unexpected = df.loc[unmapped_mask, 'Return to Prison'].unique().tolist()
    raise ValueError(
        f'Unexpected "Return to Prison" values (expected Yes/No): {unexpected}'
    )

encoded_df['Return to Prison'] = encoded_target
encoded_df.head()

Unnamed: 0,Race_Asian,Race_Black,Race_Hispanic,Race_Native,Race_White,Age_25-34,Age_35-44,Age_45-54,Age_55 and Older,Age_Under 25,...,Offense Classification_Special Sentence 2005,Offense Type_Drug,Offense Type_Other,Offense Type_Property,Offense Type_Public Order,Offense Type_Violent,Release Type_Others,Release Type_Paroled to Detainer,Year Released,Return to Prison
0,0,0,0,0,1,0,0,1,0,0,...,0,0,1,0,0,0,0,1,2013,1
1,0,0,0,0,1,0,0,0,0,1,...,0,1,0,0,0,0,0,1,2013,0
2,0,0,0,0,1,1,0,0,0,0,...,0,0,0,0,0,1,0,1,2013,0
3,0,0,0,0,1,1,0,0,0,0,...,0,0,0,1,0,0,0,1,2013,0
4,0,0,0,0,1,0,1,0,0,0,...,0,0,0,0,1,0,0,1,2013,1


In [36]:
# Check of Value_counts for "Return to Prison"
# The 0/1 counts should match the No/Yes counts seen before encoding
encoded_df['Return to Prison'].value_counts()

0    17339
1     8681
Name: Return to Prison, dtype: int64

## Machine Learning

### Setting Target and Feature Variables

In [37]:
# Target variable: the binary "Return to Prison" column, reshaped into a
# single-column 2-D array (one row per sample).
# NOTE(review): scikit-learn estimators generally expect a 1-D target; confirm
# whether the model-fitting cells need this flattened.
y = encoded_df["Return to Prison"].to_numpy().reshape(-1, 1)

In [39]:
# Features: every column of encoded_df except the target.
# drop() returns a new DataFrame, so encoded_df itself is left untouched.
X = encoded_df.drop(columns="Return to Prison")
X.head()

Unnamed: 0,Race_Asian,Race_Black,Race_Hispanic,Race_Native,Race_White,Age_25-34,Age_35-44,Age_45-54,Age_55 and Older,Age_Under 25,...,Offense Classification_Simple Misdemeanor,Offense Classification_Special Sentence 2005,Offense Type_Drug,Offense Type_Other,Offense Type_Property,Offense Type_Public Order,Offense Type_Violent,Release Type_Others,Release Type_Paroled to Detainer,Year Released
0,0,0,0,0,1,0,0,1,0,0,...,0,0,0,1,0,0,0,0,1,2013
1,0,0,0,0,1,0,0,0,0,1,...,0,0,1,0,0,0,0,0,1,2013
2,0,0,0,0,1,1,0,0,0,0,...,0,0,0,0,0,0,1,0,1,2013
3,0,0,0,0,1,1,0,0,0,0,...,0,0,0,0,1,0,0,0,1,2013
4,0,0,0,0,1,0,1,0,0,0,...,0,0,0,0,0,1,0,0,1,2013


### Splitting Training and Testing Data

In [40]:
# Hold out 20% of the rows as the test set; the fixed random_state keeps the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=21,
)

In [41]:
# Print the shape of each split, in the order X_train, X_test, y_train, y_test
for split in (X_train, X_test, y_train, y_test):
    print(split.shape)

(20816, 33)
(5204, 33)
(20816, 1)
(5204, 1)


In [42]:
# Creating StandardScaler instance (not yet fitted to any data)
scaler = StandardScaler()

In [43]:
# Fitting StandardScaler on the training features only, so the test set does
# not influence the scaling parameters
X_scaler = scaler.fit(X_train)

In [44]:
# Scaling data
# Apply the scaling learned from X_train to both the training and test features
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)