In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler, MinMaxScaler
from sklearn.impute import KNNImputer
from sklearn.model_selection import cross_validate, GridSearchCV, train_test_split, RandomizedSearchCV
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, classification_report, log_loss, roc_auc_score

# Logistic regression (Classification of general recidivism)

## Load dataset

In [2]:
df = pd.read_csv("../../data-cleaned/compas-scores-two-years-clean.csv")

ProPublicas reasoning for filter `raw_data`:
>However not all of the rows are useable for the first round of analysis.
>There are a number of reasons remove rows because of missing data:
>* If the charge date of a defendants Compas scored crime was not within 30 days from when the person was arrested, we assume that because of data quality reasons, that we do not have the right offense.
>* We coded the recidivist flag -- `is_recid` -- to be -1 if we could not find a compas case at all.
>* In a similar vein, ordinary traffic offenses -- those with a `c_charge_degree` of 'O' -- will not result in Jail time are removed (only two of them).
>* We filtered the underlying data from Broward county to include only those rows representing people who had either recidivated in two years, or had at least two years outside of a correctional facility.

## Filter dataset

In [3]:
df = df[
        (df["days_b_screening_arrest"]<=30) 
        & (df["days_b_screening_arrest"]>=-30) 
        & (df["is_recid"]!=-1) 
        & (df["c_charge_degree"]!="O") 
      ].dropna(subset=['score_text'])

In [4]:
# Select columns of interest and store as new df
columns = ["id", "age", "age_cat", "sex", "race",  #demographics
           "juv_fel_count", "juv_misd_count", "juv_other_count", "priors_count", #criminal history
           "c_charge_degree", "c_charge_desc", #current charges
           "score_text", "decile_score", #compas results for current charges
           "r_charge_degree", "r_charge_desc", #commited crimes (general) within 2 years after COMPAS scoring 
           "two_year_recid"] #truth

df_clean = df[columns]

## Check values and dtypes of columns

In [5]:
df_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5304 entries, 0 to 6215
Data columns (total 16 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   id               5304 non-null   int64 
 1   age              5304 non-null   int64 
 2   age_cat          5304 non-null   object
 3   sex              5304 non-null   object
 4   race             5304 non-null   object
 5   juv_fel_count    5304 non-null   int64 
 6   juv_misd_count   5304 non-null   int64 
 7   juv_other_count  5304 non-null   int64 
 8   priors_count     5304 non-null   int64 
 9   c_charge_degree  5304 non-null   object
 10  c_charge_desc    5299 non-null   object
 11  score_text       5304 non-null   object
 12  decile_score     5304 non-null   int64 
 13  r_charge_degree  2122 non-null   object
 14  r_charge_desc    2095 non-null   object
 15  two_year_recid   5304 non-null   int64 
dtypes: int64(8), object(8)
memory usage: 704.4+ KB


In [6]:
# Check c_charge_desc for "arrest case no charge"

# df_clean = df_clean.dropna()
# df_clean[df_clean["c_charge_desc"].str.contains("arrest case no charge")]

In [7]:
# There are 358 unique charges in the dataset in total (wo NaN)
print(len(df_clean.c_charge_desc.unique()))

# There are 265 unqiue charges for felonies (23 of them also exist in misdemeanor) (wo NaN)
print(len(df_clean[df_clean["c_charge_degree"]=="F"][["c_charge_degree", "c_charge_desc"]].c_charge_desc.unique()))

# There are 125 unqiue charges for misdemeanors  (wo NaN)
print(len(df_clean[df_clean["c_charge_degree"]=="M"][["c_charge_degree", "c_charge_desc"]].c_charge_desc.unique()))

359
266
126


## Select relevant features

In [8]:
#Select features for Logistic regression model age or age_cat, exclude/ include race
columns_log = ["id", "age", "age_cat", "sex", "race",  #demographics
               "juv_fel_count", "juv_misd_count", "juv_other_count", "priors_count", #criminal history
               "c_charge_degree", "c_charge_desc", #current charges 
               "two_year_recid"] #truth

df_log = df_clean[columns_log]

### Feature Engineering: `c_charge_degree_detailed`

In [9]:
# check unique values/ classes of charges 
df_clean.r_charge_degree.unique()

array([nan, '(F3)', '(M1)', '(F2)', '(M2)', '(MO3)', '(F1)', '(F6)',
       '(F7)', '(CO3)', '(F5)'], dtype=object)

In [10]:
# Create lookup table for c_charge_degree_detailed based on c_charge_desc
lookup_c_charge_degree = pd.merge(
    df_clean[["c_charge_desc"]], 
    df_clean[
        (df_clean["r_charge_degree"]!='(MO3)') & #filter out classes of felonies and misdemeanors who do not exist
        (df_clean["r_charge_degree"]!='(F6)') &
        (df_clean["r_charge_degree"]!='(F7)') &
        (df_clean["r_charge_degree"]!='(CO3)') & 
        (df_clean["r_charge_degree"]!='(F5)') &
        (df_clean["r_charge_degree"]!=np.nan)
        ][["r_charge_degree", "r_charge_desc"]], 
    left_on="c_charge_desc", 
    right_on="r_charge_desc")

In [11]:
#drop duplicated c_charge_desc so we end up with unique charges and degrees
lookup_c_charge_degree = lookup_c_charge_degree[["c_charge_desc", "r_charge_degree"]].drop_duplicates()
lookup_c_charge_degree.columns = ["c_charge_desc", "c_charge_degree_detailed"]
lookup_c_charge_degree

Unnamed: 0,c_charge_desc,c_charge_degree_detailed
0,Felony Battery w/Prior Convict,(F3)
352,Possession of Cocaine,(F3)
367,Possession of Cocaine,(M1)
28240,Battery,(M1)
28346,Battery,(M2)
...,...,...
243290,Deliver Cannabis 1000FTSch,(F2)
243291,Sex Batt Faml/Cust Vict 12-17Y,(F1)
243292,Possession Of Clonazepam,(F3)
243294,Deliver Cocaine 1000FT School,(F1)


In [12]:
#reset index 
lookup_c_charge_degree = lookup_c_charge_degree.reset_index(drop=True)

In [13]:
#Find charges which do have multiple charge_degrees
lookup_c_charge_degree.groupby("c_charge_desc").filter(lambda x: len(x) > 1)

Unnamed: 0,c_charge_desc,c_charge_degree_detailed
1,Possession of Cocaine,(F3)
2,Possession of Cocaine,(M1)
3,Battery,(M1)
4,Battery,(M2)
12,Susp Drivers Lic 1st Offense,(M2)
13,Susp Drivers Lic 1st Offense,(M1)
29,False Ownership Info/Pawn Item,(F3)
30,False Ownership Info/Pawn Item,(F2)
36,Petit Theft,(M2)
37,Petit Theft,(M1)


In [14]:
#Decide for one degree for each duplicate -> we keep the harder class degrees (based on online research)
rows_to_drop = [2, 4, 12, 13, 29, 36, 43, 53, 65, 88]

#drop rows based on index 
lookup_c_charge_degree = lookup_c_charge_degree.drop(index=rows_to_drop)

In [15]:
#check that there are no more duplicates 
lookup_c_charge_degree.groupby("c_charge_desc").filter(lambda x: len(x) > 1)

Unnamed: 0,c_charge_desc,c_charge_degree_detailed


In [16]:
#set index to c_charge_desc and drop row with c_charge_desc = NaN
lookup_c_charge_degree.set_index("c_charge_desc", inplace=True)
lookup_c_charge_degree = lookup_c_charge_degree[lookup_c_charge_degree.index.notnull()]

#Remove parenthesis from the labels 
lookup_c_charge_degree["c_charge_degree_detailed"] = lookup_c_charge_degree.c_charge_degree_detailed.str.strip("( )")

### Create list for charges which could not be matched automatically and need to be labeled manually

In [18]:
#Use lookup table to label charges based on description
charges_to_label = df_clean.merge(lookup_c_charge_degree, how="left", left_on="c_charge_desc", right_index=True)

In [19]:
#Filter for rows which could not be labeled by the current lookup table (because they did not exist in r_charge_desc)
charges_to_label = charges_to_label[charges_to_label["c_charge_degree_detailed"].isnull()]

In [20]:
# Create list of unique charges which could not be labeled. These charges need to be labeled manually (online research)
charges_to_label_list = charges_to_label.groupby("c_charge_desc")[["id"]].count().sort_values(by="id", ascending=False)
charges_to_label_list["c_charge_degree_detailed"]=""
charges_to_label_list.columns = ['count', 'c_charge_degree_detailed']

charges_to_label_list

Unnamed: 0_level_0,count,c_charge_degree_detailed
c_charge_desc,Unnamed: 1_level_1,Unnamed: 2_level_1
arrest case no charge,684,
Felony Driving While Lic Suspd,73,
Susp Drivers Lic 1st Offense,45,
Aggravated Assault w/Firearm,20,
Poss Contr Subst W/o Prescript,15,
...,...,...
False Name By Person Arrest,1,
Falsely Impersonating Officer,1,
Grand Theft of a Fire Extinquisher,1,
Hiring with Intent to Defraud,1,


In [21]:
# Export to .csv file so the charges can be labeled manually (only has to be executed once)
# charges_to_label_list.to_csv("charges_to_label.csv")

### Import manually labeld .csv file and combine with `lookup_c_charge_degree` to create master lookup table

In [22]:
# import labeled data
labeled_charges_man = pd.read_csv("charges_labeled_man.csv", index_col=0).set_index("c_charge_desc")[["c_charge_degree_detailed"]]

In [23]:
#Create master lookup table by combining manually labeled data with automatically labeled data
lookup_c_charge_degree_master = pd.concat([lookup_c_charge_degree, labeled_charges_man], axis=0)

In [25]:
# Save master lookup table as .csv so it can be imported directly (only has to be executed once)
#lookup_c_charge_degree_master.to_csv("labeled_charges_master.csv")

In [32]:
# Load master lookup table
labeled_charges_master = pd.read_csv("labeled_charges_master.csv", index_col="c_charge_desc")

In [33]:
# Map charge degree to all cases and store in new column "c_charge_degree_detailed"
df_clean = df_clean.merge(labeled_charges_master, how="left", left_on="c_charge_desc", right_index=True)

In [34]:
#Check NaN values (values which did not have a c_charge_desc in the original dataset)
df_clean[df_clean.c_charge_degree_detailed.isnull()]

Unnamed: 0,id,age,age_cat,sex,race,juv_fel_count,juv_misd_count,juv_other_count,priors_count,c_charge_degree,c_charge_desc,score_text,decile_score,r_charge_degree,r_charge_desc,two_year_recid,c_charge_degree_detailed
133,243,39,25 - 45,Male,Caucasian,1,0,0,1,M,,Low,4,,,0,
2572,4521,27,25 - 45,Female,Caucasian,0,0,0,0,F,,Medium,5,,,0,
4719,8299,27,25 - 45,Male,African-American,0,0,0,2,F,,Low,4,,,0,
4924,8679,27,25 - 45,Male,African-American,0,0,0,2,F,,Low,3,,,0,
6082,10737,69,Greater than 45,Female,Caucasian,0,0,0,5,F,,Low,3,,,0,


### Ordinal Encoding of  `c_charge_degree_detailed`

In [39]:
from sklearn.preprocessing import OrdinalEncoder

In [40]:
#Check unique categories in charge_degree_detailed
df_clean.c_charge_degree_detailed.unique()

array(['F3', 'M1', '0', 'F2', 'M2', nan, 'F1'], dtype=object)

In [154]:
# define order for ordinal encoding (least drastic to most drastic charge)
ord_c_charge_detail = [["0","M2","M1","F3","F2","F1"]]

In [167]:
encoder_ordinal = OrdinalEncoder(
    categories=ord_c_charge_detail,
    dtype= np.int64,
    handle_unknown="use_encoded_value",
    unknown_value=-1 # Considers unknown values as worse than "0"
)

In [173]:
test = df_clean.copy()

In [174]:
test["c_charge_degree_detailed_enc"] = encoder_ordinal.fit_transform(test[["c_charge_degree_detailed"]])

In [175]:
test

Unnamed: 0,id,age,age_cat,sex,race,juv_fel_count,juv_misd_count,juv_other_count,priors_count,c_charge_degree,c_charge_desc,score_text,decile_score,r_charge_degree,r_charge_desc,two_year_recid,c_charge_degree_detailed,c_charge_degree_detailed_enc
0,1,69,Greater than 45,Male,Other,0,0,0,0,F,Aggravated Assault w/Firearm,Low,1,,,0,F3,3
1,3,34,25 - 45,Male,African-American,0,0,0,0,F,Felony Battery w/Prior Convict,Low,3,(F3),Felony Battery (Dom Strang),1,F3,3
2,4,24,Less than 25,Male,African-American,0,0,1,4,F,Possession of Cocaine,Low,4,(M1),Driving Under The Influence,1,F3,3
5,7,44,25 - 45,Male,Other,0,0,0,0,M,Battery,Low,1,,,0,M1,2
6,8,41,25 - 45,Male,Caucasian,0,0,0,14,F,Possession Burglary Tools,Medium,6,(F2),Poss of Firearm by Convic Felo,1,F3,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6211,10995,20,Less than 25,Male,African-American,0,0,0,0,F,Possession of Cocaine,High,9,,,0,F3,3
6212,10996,23,Less than 25,Male,African-American,0,0,0,0,F,Deliver Cannabis,Medium,7,,,0,F3,3
6213,10997,23,Less than 25,Male,African-American,0,0,0,0,F,Leaving the Scene of Accident,Low,3,,,0,F3,3
6214,10999,57,Greater than 45,Male,Other,0,0,0,0,F,Aggravated Battery / Pregnant,Low,1,,,0,F2,4


### KNN Impute missing c_charge_desc (did not work, so probably drop them)

In [202]:
test_2 = test[['age', 'age_cat', 'sex', 'race', 'juv_fel_count','juv_misd_count', 'juv_other_count', 'priors_count', 'c_charge_degree', "c_charge_degree_detailed", "c_charge_degree_detailed_enc"]].copy()

In [204]:
# Impute missing values (5 cases)
from sklearn.impute import KNNImputer

knn_imputer = KNNImputer(missing_values=-1,n_neighbors=10)

test_2["c_charge_degree_detailed_enc_imp"]=knn_imputer.fit_transform(test_2[["c_charge_degree_detailed_enc"]])


In [212]:
test_2 = test_2.reset_index(drop=True)

In [213]:
test_2[test_2["c_charge_degree_detailed"].isna()]

Unnamed: 0,age,age_cat,sex,race,juv_fel_count,juv_misd_count,juv_other_count,priors_count,c_charge_degree,c_charge_degree_detailed,c_charge_degree_detailed_enc,c_charge_degree_detailed_enc_imp
114,39,25 - 45,Male,Caucasian,1,0,0,1,M,,-1,2.304397
2191,27,25 - 45,Female,Caucasian,0,0,0,0,F,,-1,2.304397
4031,27,25 - 45,Male,African-American,0,0,0,2,F,,-1,2.304397
4202,27,25 - 45,Male,African-American,0,0,0,2,F,,-1,2.304397
5183,69,Greater than 45,Female,Caucasian,0,0,0,5,F,,-1,2.304397


## Create pipeline

## Baseline model