# Predicting if someone arrested will be released

In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
import statsmodels.api as sm
import statsmodels.formula.api as smf

from matplotlib import pyplot as plt
from itertools import cycle
from sklearn import metrics
from sklearn.preprocessing import StandardScaler, MinMaxScaler, label_binarize
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.cluster         import KMeans, AgglomerativeClustering
from sklearn.linear_model import LogisticRegression
from scipy.cluster.hierarchy import cophenet, dendrogram, linkage
from sklearn.multiclass import OneVsRestClassifier

In [3]:
# Load the Arrests data from the Rdatasets mirror and drop the first CSV
# column (presumably the exported R row names -- the remaining columns run
# from `released` to `checks`).
ARRESTS_URL = "https://vincentarelbundock.github.io/Rdatasets/csv/carData/Arrests.csv"
arrests = pd.read_csv(ARRESTS_URL).iloc[:, 1:]

arrests

Unnamed: 0,released,colour,year,age,sex,employed,citizen,checks
0,Yes,White,2002,21,Male,Yes,Yes,3
1,No,Black,1999,17,Male,Yes,Yes,3
2,Yes,White,2000,24,Male,Yes,Yes,3
3,No,Black,2000,46,Male,Yes,Yes,1
4,Yes,Black,1999,27,Female,Yes,Yes,1
...,...,...,...,...,...,...,...,...
5221,Yes,White,2000,17,Male,Yes,Yes,0
5222,Yes,White,2000,21,Female,Yes,Yes,0
5223,Yes,Black,1999,21,Female,Yes,Yes,1
5224,No,Black,1998,24,Male,Yes,Yes,4


# Data Pre-Processing

In [4]:
# Number of missing values in each column (`isna` is the canonical name of `isnull`).
arrests.isna().sum()

released    0
colour      0
year        0
age         0
sex         0
employed    0
citizen     0
checks      0
dtype: int64

In [5]:
# Encode the target as 1 (released) / 0 (not released).
# Matching against both "Yes" and 1 keeps this cell idempotent: on the first
# run "Yes" -> 1 and everything else -> 0; on a re-run the already-encoded 1s
# stay 1. Comparing only against "Yes" would silently turn every row into 0
# the second time the cell is executed.
arrests["released"] = arrests["released"].isin(["Yes", 1]).astype(int)
arrests

Unnamed: 0,released,colour,year,age,sex,employed,citizen,checks
0,1,White,2002,21,Male,Yes,Yes,3
1,0,Black,1999,17,Male,Yes,Yes,3
2,1,White,2000,24,Male,Yes,Yes,3
3,0,Black,2000,46,Male,Yes,Yes,1
4,1,Black,1999,27,Female,Yes,Yes,1
...,...,...,...,...,...,...,...,...
5221,1,White,2000,17,Male,Yes,Yes,0
5222,1,White,2000,21,Female,Yes,Yes,0
5223,1,Black,1999,21,Female,Yes,Yes,1
5224,0,Black,1998,24,Male,Yes,Yes,4


# Model Building and Evaluation

In [6]:
# Two candidate specifications for the binary `released` outcome:
# `formula` uses every predictor, `formula2` only colour, year, age and checks.
formula = (
    "released ~ C(colour) + year + age + C(sex) "
    "+ C(employed) + C(citizen) + checks"
)
formula2 = "released ~ C(colour) + year + age + checks"

# Only the reduced specification (`formula2`) is fitted in this cell;
# `formula` is defined but not used here.
model = smf.logit(formula=formula2, data=arrests)
lr = model.fit()
print(lr.summary())

Optimization terminated successfully.
         Current function value: 0.422013
         Iterations 7
                           Logit Regression Results                           
Dep. Variable:               released   No. Observations:                 5226
Model:                          Logit   Df Residuals:                     5221
Method:                           MLE   Df Model:                            4
Date:                Thu, 19 Aug 2021   Pseudo R-squ.:                 0.07650
Time:                        21:52:24   Log-Likelihood:                -2205.4
converged:                       True   LL-Null:                       -2388.1
Covariance Type:            nonrobust   LLR p-value:                 8.370e-78
                         coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------------
Intercept            -90.4253     54.413     -1.662      0.097    -197.073      16.222
C(col

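#### The pseudo R-squared of the model fitted above is 0.0765, i.e. the model improves only modestly on the intercept-only model (loosely, less than 10% of the variation in release is explained).  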
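#### Colour, employed, citizen and checks are all significant with p < 0.05. Note that this statement refers to the full specification (`formula`): the model fitted above (`formula2`) includes only colour, year, age and checks, so employed and citizen do not appear in its summary.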

In [7]:
# Log-odds coefficients of the fitted model.
coef = lr.params

# Table of the coefficients and their exponentials (the multiplicative effect
# on the odds). Building it from a dict takes each column straight from the
# Series, aligned on the term names, so the result does not depend on the
# Series' own `name` attribute. The previously discarded mid-cell
# `np.exp(coef)` expression (which never rendered) is folded into the table.
results = pd.DataFrame({"coef": coef, "exp_coef": np.exp(coef)})
results

Unnamed: 0,coef,exp_coef
Intercept,-90.425278,5.355507e-40
C(colour)[T.White],0.544535,1.723807
year,0.046256,1.047343
age,-0.003298,0.9967076
checks,-0.400655,0.6698813


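##### The coefficients of the significant predictors of the logistic regression model can be interpreted as follows (note: the values quoted below differ from the coefficient table above, which reports the reduced `formula2` fit; they appear to come from the full `formula` specification that also includes sex, employed and citizen):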
##### The colour (White) coefficient is 0.389109, meaning that, relative to the reference level (Black) and with every other factor held at a fixed value (e.g. the reference levels female, unemployed, non-citizen) and the continuous variables (year, age, and checks) held constant, being White increases the log odds of release by 0.389109; equivalently, the odds of being released are multiplied by exp(0.389109) = 1.475665 (about 48% higher).
##### The employed (Yes) coefficient is 0.757302, meaning that, relative to the reference level (unemployed) and with every other factor held at a fixed value (e.g. the reference levels Black, female, non-citizen) and the continuous variables (year, age, and checks) held constant, being employed increases the log odds of release by 0.757302; equivalently, the odds of being released are multiplied by exp(0.757302) = 2.132515 (about 113% higher).
##### The citizen (Yes) coefficient is 0.576519, meaning that, relative to the reference level (non-citizen) and with every other factor held at a fixed value (e.g. the reference levels Black, female, unemployed) and the continuous variables (year, age, and checks) held constant, being a citizen increases the log odds of release by 0.576519; equivalently, the odds of being released are multiplied by exp(0.576519) = 1.779833 (about 78% higher).
##### The checks coefficient is -0.364101, meaning that if every other factor is held constant and checks is increased by 1, the log odds of release decrease by 0.364101; equivalently, the odds of being released are multiplied by exp(-0.364101) = 0.694821, i.e. they decrease by about 30.5% (1 - 0.694821 = 0.305179).
##### The other variables (sex, year, age) are not significant.