In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler, MinMaxScaler
from sklearn.impute import KNNImputer
from sklearn.model_selection import cross_validate, GridSearchCV, train_test_split, RandomizedSearchCV
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, classification_report, log_loss, roc_auc_score

# Logistic regression (Classification of general recidivism)

## Load dataset

In [2]:
df = pd.read_csv("../../data-cleaned/compas-scores-two-years-clean.csv")

ProPublicas reasoning for filter `raw_data`:
>However not all of the rows are useable for the first round of analysis.
>There are a number of reasons remove rows because of missing data:
>* If the charge date of a defendants Compas scored crime was not within 30 days from when the person was arrested, we assume that because of data quality reasons, that we do not have the right offense.
>* We coded the recidivist flag -- `is_recid` -- to be -1 if we could not find a compas case at all.
>* In a similar vein, ordinary traffic offenses -- those with a `c_charge_degree` of 'O' -- will not result in Jail time are removed (only two of them).
>* We filtered the underlying data from Broward county to include only those rows representing people who had either recidivated in two years, or had at least two years outside of a correctional facility.

## Filter dataset

In [3]:
df = df[
        (df["days_b_screening_arrest"]<=30) 
        & (df["days_b_screening_arrest"]>=-30) 
        & (df["is_recid"]!=-1) 
        & (df["c_charge_degree"]!="O") 
      ].dropna(subset=['score_text'])

In [4]:
# Select columns of interest and store as new df
columns = ["id", "age", "age_cat", "sex", "race",  #demographics
           "juv_fel_count", "juv_misd_count", "juv_other_count", "priors_count", #criminal history
           "c_charge_degree", "c_charge_desc", #current charges
           "score_text", "decile_score", #compas results for current charges
           "r_charge_degree", "r_charge_desc", #commited crimes (general) within 2 years after COMPAS scoring 
           "two_year_recid"] #truth

df_clean = df[columns]

## Check values and dtypes of columns

In [5]:
df_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5304 entries, 0 to 6215
Data columns (total 16 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   id               5304 non-null   int64 
 1   age              5304 non-null   int64 
 2   age_cat          5304 non-null   object
 3   sex              5304 non-null   object
 4   race             5304 non-null   object
 5   juv_fel_count    5304 non-null   int64 
 6   juv_misd_count   5304 non-null   int64 
 7   juv_other_count  5304 non-null   int64 
 8   priors_count     5304 non-null   int64 
 9   c_charge_degree  5304 non-null   object
 10  c_charge_desc    5299 non-null   object
 11  score_text       5304 non-null   object
 12  decile_score     5304 non-null   int64 
 13  r_charge_degree  2122 non-null   object
 14  r_charge_desc    2095 non-null   object
 15  two_year_recid   5304 non-null   int64 
dtypes: int64(8), object(8)
memory usage: 704.4+ KB


In [6]:
# Check c_charge_desc for "arrest case no charge"

# df_clean = df_clean.dropna()
# df_clean[df_clean["c_charge_desc"].str.contains("arrest case no charge")]

In [7]:
# There are 358 unique charges in the dataset in total (wo NaN)
print(len(df_clean.c_charge_desc.unique()))

# There are 265 unqiue charges for felonies (23 of them also exist in misdemeanor) (wo NaN)
print(len(df_clean[df_clean["c_charge_degree"]=="F"][["c_charge_degree", "c_charge_desc"]].c_charge_desc.unique()))

# There are 125 unqiue charges for misdemeanors  (wo NaN)
print(len(df_clean[df_clean["c_charge_degree"]=="M"][["c_charge_degree", "c_charge_desc"]].c_charge_desc.unique()))

359
266
126


## Select relevant features

In [8]:
#Select features for Logistic regression model age or age_cat, exclude/ include race
columns_log = ["id", "age", "age_cat", "sex", "race",  #demographics
               "juv_fel_count", "juv_misd_count", "juv_other_count", "priors_count", #criminal history
               "c_charge_degree", "c_charge_desc", #current charges 
               "two_year_recid"] #truth

df_log = df_clean[columns_log]

### Test

In [18]:
df_clean_test = df_clean.copy()

In [10]:
#df_log_test = df_log.copy()

In [11]:
lookup_c_charge_degree = pd.merge(df_clean[["c_charge_desc"]], 
                df_clean[["r_charge_degree", "r_charge_desc"]], 
                left_on="c_charge_desc", 
                right_on="r_charge_desc")

In [12]:
lookup_c_charge_degree = lookup_c_charge_degree[["c_charge_desc", "r_charge_degree"]].drop_duplicates()
lookup_c_charge_degree.columns = ["c_charge_desc", "c_charge_degree_detailed"]
lookup_c_charge_degree

Unnamed: 0,c_charge_desc,c_charge_degree_detailed
0,Felony Battery w/Prior Convict,(F3)
352,Possession of Cocaine,(F3)
367,Possession of Cocaine,(M1)
28240,Battery,(M1)
28346,Battery,(M2)
...,...,...
243338,Deliver Cannabis 1000FTSch,(F2)
243339,Sex Batt Faml/Cust Vict 12-17Y,(F1)
243340,Possession Of Clonazepam,(F3)
243342,Deliver Cocaine 1000FT School,(F1)


In [13]:
lookup_c_charge_degree = lookup_c_charge_degree.reset_index(drop=True)

In [14]:
lookup_c_charge_degree.groupby("c_charge_desc").filter(lambda x: len(x) > 1)

Unnamed: 0,c_charge_desc,c_charge_degree_detailed
1,Possession of Cocaine,(F3)
2,Possession of Cocaine,(M1)
3,Battery,(M1)
4,Battery,(M2)
12,Susp Drivers Lic 1st Offense,(M2)
13,Susp Drivers Lic 1st Offense,(M1)
29,False Ownership Info/Pawn Item,(F3)
30,False Ownership Info/Pawn Item,(F2)
36,Petit Theft,(M2)
37,Petit Theft,(M1)


In [15]:
#keeping the harder degrees (based on online research)
rows_to_drop = [2, 4, 12, 13, 29, 36, 43, 54, 66, 87, 89, 111]

lookup_c_charge_degree = lookup_c_charge_degree.drop(index=rows_to_drop)

In [16]:
lookup_c_charge_degree.set_index("c_charge_desc", inplace=True)
lookup_c_charge_degree = lookup_c_charge_degree[lookup_c_charge_degree.index.notnull()]

In [19]:
test = df_clean_test.merge(lookup_c_charge_degree, how="left", left_on="c_charge_desc", right_index=True)

In [20]:
charges_to_label = test[test["c_charge_degree_detailed"].isnull()]

In [21]:
charges_to_label_list = charges_to_label.groupby("c_charge_desc")[["id"]].count().sort_values(by="id", ascending=False)

In [22]:
charges_to_label_list["c_charge_degree_detailed"]=""

In [23]:
charges_to_label_list.columns = ['count', 'c_charge_degree_detailed']

In [None]:
#charges_to_label_list.to_csv("charges_to_label.csv")

In [24]:
charges_to_label_list.index

Index(['arrest case no charge', 'Felony Driving While Lic Suspd',
       'Susp Drivers Lic 1st Offense', 'Aggravated Assault w/Firearm',
       'Poss Contr Subst W/o Prescript', 'Cruelty Toward Child',
       'Leaving Acc/Unattended Veh', 'Burglary With Assault/battery',
       'Corrupt Public Servant', 'Purchase Cannabis',
       ...
       'False Bomb Report', 'False Name By Person Arrest',
       'Interference with Custody', 'Falsely Impersonating Officer',
       'Aiding Escape', 'Aide/Abet Prostitution Lewdness',
       'Grand Theft of a Fire Extinquisher', 'Hiring with Intent to Defraud',
       'Interfere W/Traf Cont Dev RR', 'Abuse Without Great Harm'],
      dtype='object', name='c_charge_desc', length=192)

In [32]:
# import labeled data
pd.read_csv("charges_labeled_c.csv")

Unnamed: 0,c_charge_desc,count,c_charge_degree_detailed
0,arrest case no charge,684.0,0
1,Felony Driving While Lic Suspd,73.0,F3
2,Susp Drivers Lic 1st Offense,45.0,M2
3,Aggravated Assault w/Firearm,20.0,F3
4,Poss Contr Subst W/o Prescript,15.0,F3
...,...,...,...
187,Aide/Abet Prostitution Lewdness,1.0,M1
188,Grand Theft of a Fire Extinquisher,1.0,F3
189,Hiring with Intent to Defraud,1.0,F3
190,Interfere W/Traf Cont Dev RR,1.0,M2


## Create pipeline

## Baseline model