In [1]:
import numpy as np
import pandas as pd
from datetime import datetime
import time

In [2]:
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate, GridSearchCV

In [272]:
import re
import random

In [4]:
# import scipy.stats

# import matplotlib.pyplot as plt
# %matplotlib inline

from sklearn.model_selection import train_test_split, cross_val_score, cross_validate, GridSearchCV
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import r2_score, mean_squared_error

from sklearn.base import BaseEstimator, ClassifierMixin
# from sklearn.feature_selection import RFE, RFECV

In [5]:
from sklearn.dummy import DummyClassifier, DummyRegressor

from sklearn.ensemble import RandomForestClassifier#, RandomForestRegressor
from sklearn.linear_model import LogisticRegression, Ridge, Lasso, ElasticNet, SGDClassifier, SGDRegressor

from catboost import CatBoostClassifier, CatBoostRegressor, Pool
from lightgbm import LGBMClassifier, LGBMRegressor
from xgboost import XGBClassifier, XGBRegressor

In [6]:
# Bizzare problem: It seems that when you import one of lbgm, xgboost, Ridge, Lasso, ElasticNet, SGDClassifier, SGDRegressor, 
# if the below options are enabled, it will wreck any operation on a dataframe (including h df.head(), causing it to hang indefinitely

# pd.options.display.max_rows = None  # to stop pandas from not displaying all columns because of screen width
# pd.options.display.max_columns = None  # to stop pandas from not displaying all columns because of screen width
# pd.options.display.max_colwidth = 100  # To prevent pandas from concatenating very long columns. Set to 0.

In [7]:
sentencing_processed = pd.read_csv("Sentencing_processed_data.csv",
                                  parse_dates=["DISPOSITION_DATE", "SENTENCE_DATE",
                                                 "INCIDENT_BEGIN_DATE", "INCIDENT_END_DATE",
                                                 "ARREST_DATE", "ARRAIGNMENT_DATE", "RECEIVED_DATE"],
                                  index_col=0)

  interactivity=interactivity, compiler=compiler, result=result)


In [67]:
sentencing = sentencing_processed.copy()

In [68]:
# Each crime may have multiple charges, so must ensure that two cases from the same crime do not end up in different tain/test splits
# because this will lead to data leaking to the test set.

# Note that during cross validation, I ignore this because it is too much trouble to do the splits properly, so the train scores will likely be inflated
# compared to the test scores, but since the test split is done correctly, the test score will be valid.

# Technically, I shouldn't need to do this if my understanding of how sentencing works (each case gets an independent sentence),
# but doing this to be on the safe side.

# sentencing_sorted = sentencing.sort_values(by=["CASE_ID"])
# random.seed(a=123, version=2)  # for reproduceability
# df_train = pd.DataFrame()
# df_test = pd.DataFrame()
# split_ratio = 0.2

# # estimated to take exactly 1 hour on 233k examples
# previous_id = 0
# last_set = 0; # "train" or "test"

# total_length = len(sentencing_sorted)

# for i in range(len(sentencing_sorted)):
#     if (i%100 == 0): print(i, time.time())
#     curr_line = sentencing_sorted.iloc[i]
#     if curr_line["CASE_ID"] == previous_id:
#         df_train.append(curr_line) if (last_set=="train") else df_test.append(curr_line)
#     else:
#         # sample random number to decide which dataset
#         if (random.random() < split_ratio):
#             last_set = "train"
#             df_train.append(curr_line)
#         else:
#             last_set = "test"
#             df_test.append(curr_line)
            
# print("Done split")

In [217]:
# Time encoding for "ARREST_DATE"
sentencing["month"] = sentencing["ARREST_DATE"].apply(lambda x: x.month)

# sin/cos for seasonality
sentencing["month_sin"] = np.sin(2*np.pi*sentencing["month"]/12)
sentencing["month_cos"] = np.cos(2*np.pi*sentencing["month"]/12)

# linear encoding
min_date = min(sentencing["ARREST_DATE"])
sentencing["days_number"] = (sentencing["ARREST_DATE"] - min_date).dt.days

In [218]:
df_train_1, df_test_1 = train_test_split(sentencing, test_size=0.2, random_state=123)
print(len(df_test_1) / (len(df_test_1) + len(df_train_1)))

0.2


In [219]:
len(df_train_1)

184852

In [220]:
y_train_type = df_train_1["categorical_sentence"]  # for categorical_sentence
y_train_length = df_train_1["sentence_period_years"]  # for sentence_period_years

y_test_type = df_test_1["categorical_sentence"]
y_test_length = df_test_1["sentence_period_years"]

In [221]:
# categorize features for preprocessing

# includes all features that are an outcome of the judicial process and not the crime except for any features
# that are used to classify the crime (since I have no other way of knowing what the crime is). 
drop_features = ["CASE_ID", "CASE_PARTICIPANT_ID", "CHARGE_ID", "CHARGE_VERSION_ID", "LENGTH_OF_CASE_in_Days", "SENTENCE_PHASE",
                "SENTENCE_TYPE", "COMMITMENT_TYPE", "CURRENT_SENTENCE", "SENTENCE_JUDGE",
                "CHARGE_DISPOSITION_REASON", "COURT_NAME", "COURT_FACILITY", "RECEIVED_DATE",
                "DISPOSITION_DATE", "SENTENCE_DATE", "INCIDENT_BEGIN_DATE", "INCIDENT_END_DATE", "ARRAIGNMENT_DATE",
                "ARREST_DATE", "month"]
# drop length of case since this is information from after sentencing
# Drop sentence/commitment type since it has been merged into categorical_sentence
# CHARGE_DISPOSITION_REASON - too many missing features
# ARREST_DATE is dropped because we use the processed date features. month is a intermediary.

numeric_features = ["AGE_AT_INCIDENT", "month_sin", "month_cos", "days_number", "CHARGE_COUNT"]

# features to be one-hot encoded
categorical_features = ["OFFENSE_CATEGORY", "DISPOSITION_CHARGED_OFFENSE_TITLE", "CHARGE_DISPOSITION",
                        "GENDER", "RACE", "UPDATED_OFFENSE_CATEGORY",
                        "DISPOSITION_CHARGED_CHAPTER", "DISPOSITION_CHARGED_ACT", "DISPOSITION_CHARGED_SECTION",
                        "DISPOSITION_CHARGED_CLASS", "INCIDENT_CITY", "LAW_ENFORCEMENT_AGENCY", "UNIT",
                        "DISPOSITION_CHARGED_AOIC", "PRIMARY_CHARGE"] #use one-hot encoding with drop first
# UNIT is department of police force which is involved
# AOIC refers to Administrative Office of the Illinois Courts ID
# PRIMARY_CHARGE is boolean

# ordinal encoding
ordinal_features = []


# what we are predicting (y)
target_raw = ["COMMITMENT_TERM", "COMMITMENT_UNIT"] # raw target; will be dropped
target_processed = ["categorical_sentence", "sentence_period_years"]

drop_features = drop_features + target_raw + target_processed

In [222]:
temp_a  = list(drop_features + numeric_features + categorical_features + ordinal_features)
temp_a.sort()

temp_b = list(sentencing.columns)
temp_b.sort()

assert (temp_a == temp_b), "Columns do not match"

In [223]:
# Drop target columns - skip if running again
# df_train = df_train.drop(columns=target_raw+target_processed, axis=1, errors='ignore')
# df_test = df_test.drop(columns=target_raw+target_processed, axis=1, errors='ignore')

df_train = df_train_1[numeric_features+categorical_features]
df_test = df_test_1[numeric_features+categorical_features]


df_train[numeric_features] = df_train[numeric_features].astype('float')  # ensure all numeric fields are float
# df_train["PRIMARY_CHARGE"] = df_train["PRIMARY_CHARGE"].astype(str)  # convert boolean to string
df_train[categorical_features] = df_train[categorical_features].astype(str)  # ensure no floats

df_test[numeric_features] = df_test[numeric_features].astype('float')
# df_test["PRIMARY_CHARGE"] = df_test["PRIMARY_CHARGE"].astype(str)
df_test[categorical_features] = df_test[categorical_features].astype(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


In [224]:
# Note: The code above will lead to missing data for predicting the sentence duration as you should know what kind of sentence is given.
# Ignoring for now, but this means that I should not predict duration without fixing this

In [225]:
# do nothing estimator; https://scikit-learn.org/stable/developers/develop.html
class Nothing(BaseEstimator, ClassifierMixin):

    def __init__(self, demo_param='demo'):
        self.demo_param = demo_param

    def fit(self, X, y):
        # Do nothing
        print()
        return self

    def predict(self, X):
        # Do nothing
        return None

    def transform(self, data):
        # return data without doing anything
        return data

In [226]:
passthrough_transformer = Pipeline([
    ('do_nothing', Nothing())
])

In [227]:
categorical_transformer_cat = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', missing_values=np.nan, fill_value='?')),
])

categorical_transformer_ohe = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', missing_values=np.nan, fill_value='?')),
    ('onehot', OneHotEncoder(sparse=False, handle_unknown='ignore'))
])

In [228]:
numeric_transformer_cat = Pipeline([
    ('imputer', SimpleImputer(strategy='median', missing_values=np.nan)),
    ('scaler', StandardScaler())
])

numeric_transformer_ohe = Pipeline([
    ('imputer', SimpleImputer(strategy='median', missing_values=np.nan)),
    ('scaler', StandardScaler())
])

In [229]:
preprocessor_cat = ColumnTransformer([
    ('numeric', numeric_transformer_cat, numeric_features),
    ('categorical', categorical_transformer_cat, categorical_features)
], remainder='drop')

preprocessor_ohe = ColumnTransformer([
    ('numeric', numeric_transformer_ohe, numeric_features),
    ('categorical', categorical_transformer_ohe, categorical_features)
], remainder='drop')

In [233]:
df_train.isna().sum()

AGE_AT_INCIDENT                      2383
month_sin                            3752
month_cos                            3752
days_number                          3752
CHARGE_COUNT                            0
OFFENSE_CATEGORY                        0
DISPOSITION_CHARGED_OFFENSE_TITLE       0
CHARGE_DISPOSITION                      0
GENDER                                  0
RACE                                    0
UPDATED_OFFENSE_CATEGORY                0
DISPOSITION_CHARGED_CHAPTER             0
DISPOSITION_CHARGED_ACT                 0
DISPOSITION_CHARGED_SECTION             0
DISPOSITION_CHARGED_CLASS               0
INCIDENT_CITY                           0
LAW_ENFORCEMENT_AGENCY                  0
UNIT                                    0
DISPOSITION_CHARGED_AOIC                0
PRIMARY_CHARGE                          0
dtype: int64

In [234]:
preprocessor_ohe.fit(df_train)

ColumnTransformer(n_jobs=None, remainder='drop', sparse_threshold=0.3,
                  transformer_weights=None,
                  transformers=[('numeric',
                                 Pipeline(memory=None,
                                          steps=[('imputer',
                                                  SimpleImputer(add_indicator=False,
                                                                copy=True,
                                                                fill_value=None,
                                                                missing_values=nan,
                                                                strategy='median',
                                                                verbose=0)),
                                                 ('scaler',
                                                  StandardScaler(copy=True,
                                                                 with_mean=True,
                       

In [235]:
preprocessor_cat.fit(df_train)

ColumnTransformer(n_jobs=None, remainder='drop', sparse_threshold=0.3,
                  transformer_weights=None,
                  transformers=[('numeric',
                                 Pipeline(memory=None,
                                          steps=[('imputer',
                                                  SimpleImputer(add_indicator=False,
                                                                copy=True,
                                                                fill_value=None,
                                                                missing_values=nan,
                                                                strategy='median',
                                                                verbose=0)),
                                                 ('scaler',
                                                  StandardScaler(copy=True,
                                                                 with_mean=True,
                       

In [236]:
preprocessor_cat.fit(df_train)
preprocessor_ohe.fit(df_train)

ohe = preprocessor_ohe.named_transformers_['categorical'].named_steps['onehot']
ohe_feature_names = list(ohe.get_feature_names(categorical_features))

new_columns_cat = numeric_features + categorical_features
new_columns_ohe = numeric_features + ohe_feature_names

In [288]:
X_train_cat = pd.DataFrame(preprocessor_cat.transform(df_train), index=df_train.index, columns=new_columns_cat)
X_test_cat  = pd.DataFrame(preprocessor_cat.transform(df_test), index=df_test.index,  columns=new_columns_cat)

X_train_ohe = pd.DataFrame(preprocessor_ohe.transform(df_train), index=df_train.index, columns=new_columns_ohe)
X_test_ohe  = pd.DataFrame(preprocessor_ohe.transform(df_test), index=df_test.index,  columns=new_columns_ohe)

regex = re.compile(r"\[|\]|<", re.IGNORECASE)
# replace any [, ], < in feature name since XGBoost has problems with it
X_train_ohe.columns = [regex.sub("_", col) if any(x in str(col) for x in set(('[', ']', '<'))) else col for col in X_train_ohe.columns.values]
X_test_ohe.columns = [regex.sub("_", col) if any(x in str(col) for x in set(('[', ']', '<'))) else col for col in X_test_ohe.columns.values]
# replace special characters since LightBGM has problems otherwise - this has problems with creating non unique features
# X_train_ohe.columns = ["".join (c if c.isalnum() else "_" for c in str(x)) for x in X_train_ohe.columns]
# X_test_ohe.columns = ["".join (c if c.isalnum() else "_" for c in str(x)) for x in X_test_ohe.columns]

- Note that I am not removing DISPOSITION_CHARGED_OFFENSE_TITLE and UPDATED_OFFENSE_CATEGORY, both of which are open to interpretation and therefore depend on the judgement to some extent.
- Try dropping all but Primary charge (in case the non primary charges have the same sentence, which would be out of proportion for smaller charges grouped together with more serious charges.
- Also leaving CHARGE_DISPOSITION. All cases in this dataset were convicted but the sentence might depend on whether the defendant plead guilty or not.


In [289]:
modelDC = DummyClassifier(strategy="most_frequent")

In [304]:
modelLR = LogisticRegression(max_iter=1000)

In [291]:
modelXG = XGBClassifier()

In [292]:
modelLB = LGBMClassifier()

In [293]:
modelCB = CatBoostClassifier(cat_features=categorical_features)

Run Time with n=100
- LogisticRegression - 0.5s (1)
- XGBoost - 10s (20)
- CatBoost - 90s (180)


- Logistic regression n=10000 - 80s

In [307]:
n=10000

print("DummyClassifier")
%timeit -n1 -r1 modelDC.fit(X_train_ohe.head(n), y_train_type.head(n))

print("\nLogisticRegression")
%timeit -n1 -r1 modelLR.fit(X_train_ohe.head(n), y_train_type.head(n))

# print("\nXGBoost")
# %timeit -n1 -r1 modelXG.fit(X_train_ohe.head(n), y_train_type.head(n))

# print("\nCatBoost")
# %timeit -n1 -r1 modelCB.fit(X_train_cat.head(n), y_train_type.head(n), verbose=1000)


# Has problems with the feature names
# print("\nLightBGM")
# %timeit -n1 -r1 modelLB.fit(X_train_ohe.head(n), y_train_type.head(n))

DummyClassifier
8.31 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)

LogisticRegression
1min 21s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [308]:
print("DummyClassifier")
print(f"Train: {modelDC.score(X_train_ohe.head(n), y_train_type.head(n))}")
print(f"Test: {modelDC.score(X_test_ohe.head(n), y_test_type.head(n))}")

print("\nLogisticRegression")
print(f"Train: {modelLR.score(X_train_ohe.head(n), y_train_type.head(n))}")
print(f"Test: {modelLR.score(X_test_ohe.head(n), y_test_type.head(n))}")

# print("\nXGBoost")
# print(f"Train: {modelXG.score(X_train_ohe.head(n), y_train_type.head(n))}")
# print(f"Test: {modelXG.score(X_test_ohe.head(n), y_test_type.head(n))}")

# print("\nCatBoost")
# print(f"Train: {modelCB.score(X_train_cat.head(n), y_train_type.head(n))}")
# print(f"Test: {modelCB.score(X_test_cat.head(n), y_test_type.head(n))}")


# print("\LightBGM")
# print(f"Train: {modelLB.score(X_train_ohe.head(n), y_train_type.head(n))}")
# print(f"Test: {modelLB.score(X_test_ohe.head(n), y_test_type.head(n))}")

DummyClassifier
Train: 0.5736
Test: 0.5771

LogisticRegression
Train: 0.7249
Test: 0.6601


In [None]:
DummyClassifi

In [389]:
X_train.dtypes

AGE_AT_INCIDENT                      object
month_sin                            object
month_cos                            object
days_number                          object
CHARGE_COUNT                         object
OFFENSE_CATEGORY                     object
DISPOSITION_CHARGED_OFFENSE_TITLE    object
CHARGE_DISPOSITION                   object
GENDER                               object
RACE                                 object
UPDATED_OFFENSE_CATEGORY             object
DISPOSITION_CHARGED_CHAPTER          object
DISPOSITION_CHARGED_ACT              object
DISPOSITION_CHARGED_SECTION          object
DISPOSITION_CHARGED_CLASS            object
INCIDENT_CITY                        object
LAW_ENFORCEMENT_AGENCY               object
UNIT                                 object
DISPOSITION_CHARGED_AOIC             object
PRIMARY_CHARGE                       object
dtype: object

In [390]:
X_train.head(1)

Unnamed: 0,AGE_AT_INCIDENT,month_sin,month_cos,days_number,CHARGE_COUNT,OFFENSE_CATEGORY,DISPOSITION_CHARGED_OFFENSE_TITLE,CHARGE_DISPOSITION,GENDER,RACE,UPDATED_OFFENSE_CATEGORY,DISPOSITION_CHARGED_CHAPTER,DISPOSITION_CHARGED_ACT,DISPOSITION_CHARGED_SECTION,DISPOSITION_CHARGED_CLASS,INCIDENT_CITY,LAW_ENFORCEMENT_AGENCY,UNIT,DISPOSITION_CHARGED_AOIC,PRIMARY_CHARGE
56210,0.489372,0.738924,1.2862,-0.75152,-0.252813,Narcotics,POSSESSION OF A CONTROLLED SUBSTANCE,Plea Of Guilty,Male,Black,Narcotics,720,570,402(c),4,Chicago,CHICAGO PD,District 10 - Ogden,5101110,True


Learning rate set to 0.07036
0:	learn: 1.5486981	total: 70.5ms	remaining: 1m 10s
100:	learn: 0.4962509	total: 6.51s	remaining: 58s
200:	learn: 0.2588810	total: 15s	remaining: 59.7s
300:	learn: 0.1577205	total: 23.6s	remaining: 54.8s
400:	learn: 0.1078402	total: 32.2s	remaining: 48.1s
500:	learn: 0.0821076	total: 40.8s	remaining: 40.6s
600:	learn: 0.0647056	total: 49.7s	remaining: 33s
700:	learn: 0.0533171	total: 58.6s	remaining: 25s
800:	learn: 0.0451912	total: 1m 8s	remaining: 17s
900:	learn: 0.0388740	total: 1m 17s	remaining: 8.54s
999:	learn: 0.0344301	total: 1m 26s	remaining: 0us


<catboost.core.CatBoostClassifier at 0x1cf31d5c128>

0.68

0.91