In [None]:
# restarting project with similar, but different dataset with more columns and more features

In [1]:
import numpy as np
import pandas as pd

In [2]:
df = pd.read_csv('NYPD_Complaint_Data_Current__Year_To_Date_.csv', delimiter = ',')

In [3]:
df.head()

Unnamed: 0,CMPLNT_NUM,ADDR_PCT_CD,BORO_NM,CMPLNT_FR_DT,CMPLNT_FR_TM,CMPLNT_TO_DT,CMPLNT_TO_TM,CRM_ATPT_CPTD_CD,HADEVELOPT,HOUSING_PSA,...,SUSP_SEX,TRANSIT_DISTRICT,VIC_AGE_GROUP,VIC_RACE,VIC_SEX,X_COORD_CD,Y_COORD_CD,Latitude,Longitude,Lat_Lon
0,857927015,10.0,MANHATTAN,01/29/2019,16:37:00,01/29/2019,16:45:00,COMPLETED,,,...,M,,UNKNOWN,UNKNOWN,M,984140.0,211709.0,40.747777,-74.000398,"(40.747777093, -74.000398443)"
1,479254687,101.0,QUEENS,03/29/2019,17:00:00,03/29/2019,17:10:00,COMPLETED,,,...,M,,25-44,BLACK,F,1054076.0,157437.0,40.598538,-73.74856,"(40.598537593, -73.748559596)"
2,320007604,41.0,BRONX,02/06/2019,02:00:00,,,COMPLETED,,,...,M,,UNKNOWN,UNKNOWN,D,1011589.0,237996.0,40.819886,-73.901227,"(40.819885621, -73.901226998)"
3,746022144,68.0,BROOKLYN,01/08/2019,22:49:00,01/08/2019,22:52:00,COMPLETED,,,...,M,,UNKNOWN,UNKNOWN,E,984439.0,166855.0,40.624663,-73.999321,"(40.624663129, -73.999320591)"
4,593941718,45.0,BRONX,03/17/2019,05:00:00,03/17/2019,05:20:00,COMPLETED,,,...,M,,25-44,BLACK HISPANIC,F,1027255.0,244817.0,40.838545,-73.844583,"(40.838544913, -73.844582889)"


In [4]:
# looks like a lot of null values, let's do a count. 220k rows so drop if possible
df.isna().sum()

CMPLNT_NUM                0
ADDR_PCT_CD               5
BORO_NM                 138
CMPLNT_FR_DT              0
CMPLNT_FR_TM              0
CMPLNT_TO_DT          29166
CMPLNT_TO_TM          29060
CRM_ATPT_CPTD_CD          0
HADEVELOPT           211657
HOUSING_PSA          205789
JURISDICTION_CODE       137
JURIS_DESC                0
KY_CD                     0
LAW_CAT_CD                0
LOC_OF_OCCUR_DESC     40032
OFNS_DESC                 9
PARKS_NM             220758
PATROL_BORO             137
PD_CD                   137
PD_DESC                 137
PREM_TYP_DESC           937
RPT_DT                    0
STATION_NAME         216165
SUSP_AGE_GROUP        52382
SUSP_RACE             52382
SUSP_SEX              52382
TRANSIT_DISTRICT     216165
VIC_AGE_GROUP             0
VIC_RACE                  0
VIC_SEX                   0
X_COORD_CD               22
Y_COORD_CD               22
Latitude                 22
Longitude                22
Lat_Lon                  22
dtype: int64

In [5]:
# some of the columns just have majority nan values - lets drop those columns
df = df.drop(columns=['STATION_NAME', 'TRANSIT_DISTRICT', 'PARKS_NM', 'HOUSING_PSA', 'HADEVELOPT'])

In [6]:
df.isna().sum()

CMPLNT_NUM               0
ADDR_PCT_CD              5
BORO_NM                138
CMPLNT_FR_DT             0
CMPLNT_FR_TM             0
CMPLNT_TO_DT         29166
CMPLNT_TO_TM         29060
CRM_ATPT_CPTD_CD         0
JURISDICTION_CODE      137
JURIS_DESC               0
KY_CD                    0
LAW_CAT_CD               0
LOC_OF_OCCUR_DESC    40032
OFNS_DESC                9
PATROL_BORO            137
PD_CD                  137
PD_DESC                137
PREM_TYP_DESC          937
RPT_DT                   0
SUSP_AGE_GROUP       52382
SUSP_RACE            52382
SUSP_SEX             52382
VIC_AGE_GROUP            0
VIC_RACE                 0
VIC_SEX                  0
X_COORD_CD              22
Y_COORD_CD              22
Latitude                22
Longitude               22
Lat_Lon                 22
dtype: int64

In [7]:
# more manageable now, now just drop the rows with nulls
df = df.dropna()

In [9]:
df.shape
# still plenty of data

(121909, 30)

In [10]:
# i want the target to be CRM_ATPT_CPTD_CD - whether crime was completed, attempted, or interrupted. 
# lets take a look at the value counts to see if it's a good target
df['CRM_ATPT_CPTD_CD'].describe()

count        121909
unique            2
top       COMPLETED
freq         119755
Name: CRM_ATPT_CPTD_CD, dtype: object

In [11]:
df['CRM_ATPT_CPTD_CD'].value_counts()

COMPLETED    119755
ATTEMPTED      2154
Name: CRM_ATPT_CPTD_CD, dtype: int64

In [12]:
df['CRM_ATPT_CPTD_CD'].value_counts(normalize=True)

COMPLETED    0.982331
ATTEMPTED    0.017669
Name: CRM_ATPT_CPTD_CD, dtype: float64

In [None]:
# lets find a new target, majority baseline is .98
# new target to try BORO_NM, where the crime took place

In [14]:
df['BORO_NM'].describe()

count       121909
unique           5
top       BROOKLYN
freq         35755
Name: BORO_NM, dtype: object

In [15]:
df['BORO_NM'].value_counts()

BROOKLYN         35755
MANHATTAN        31872
QUEENS           25501
BRONX            24006
STATEN ISLAND     4775
Name: BORO_NM, dtype: int64

In [None]:
# i think this could be interesting, personally i've always wondered if certain boroughs are more dangerous than others

In [16]:
df['BORO_NM'].value_counts(normalize=True)

BROOKLYN         0.293293
MANHATTAN        0.261441
QUEENS           0.209181
BRONX            0.196917
STATEN ISLAND    0.039169
Name: BORO_NM, dtype: float64

In [None]:
# BORO_NM will be the target. will obviously have to drop columns such as longitude and any other related columns for leakage

In [17]:
y = df['BORO_NM']
X = df.drop(columns='BORO_NM')

In [18]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)

In [19]:
y_train.value_counts(normalize=True)

BROOKLYN         0.293293
MANHATTAN        0.261435
QUEENS           0.209183
BRONX            0.196920
STATEN ISLAND    0.039169
Name: BORO_NM, dtype: float64

In [20]:
y_pred = ['BROOKLYN'] * len(y_test)

In [21]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_pred)

0.29329013206463783

In [None]:
# majority baseline, brooklyn, .29329

In [22]:
train, test = train_test_split(df)

train.shape, test.shape

((91431, 30), (30478, 30))

In [23]:
import eli5
from eli5.sklearn import PermutationImportance

In [24]:
train, val = train_test_split(train, train_size=0.80, test_size=0.20, 
                              stratify=train['BORO_NM'], random_state=42)

In [25]:
target = 'BORO_NM'
X_train = train.drop(columns=target)
y_train = train[target]
X_val = val.drop(columns=target)
y_val = val[target]
X_test = test

In [27]:
import category_encoders as ce
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline

In [28]:
transformers = make_pipeline(
    ce.OrdinalEncoder(), 
    SimpleImputer(strategy='median')
)

X_train_transformed = transformers.fit_transform(X_train)
X_val_transformed = transformers.transform(X_val)

model = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
model.fit(X_train_transformed, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=-1, oob_score=False, random_state=42, verbose=0,
                       warm_start=False)

In [29]:
import eli5
from eli5.sklearn import PermutationImportance

permuter = PermutationImportance(
    model, 
    scoring='accuracy', 
    n_iter=5, 
    random_state=42
)

permuter.fit(X_val_transformed, y_val)

PermutationImportance(cv='prefit',
                      estimator=RandomForestClassifier(bootstrap=True,
                                                       class_weight=None,
                                                       criterion='gini',
                                                       max_depth=None,
                                                       max_features='auto',
                                                       max_leaf_nodes=None,
                                                       min_impurity_decrease=0.0,
                                                       min_impurity_split=None,
                                                       min_samples_leaf=1,
                                                       min_samples_split=2,
                                                       min_weight_fraction_leaf=0.0,
                                                       n_estimators=100,
                                                     

In [30]:
feature_names = X_val.columns.tolist()
pd.Series(permuter.feature_importances_, feature_names).sort_values(ascending=False)

ADDR_PCT_CD          0.251217
PATROL_BORO          0.009471
Longitude            0.001673
Latitude             0.000131
Y_COORD_CD           0.000109
Lat_Lon              0.000077
X_COORD_CD           0.000055
PREM_TYP_DESC        0.000044
JURISDICTION_CODE    0.000000
JURIS_DESC           0.000000
CMPLNT_TO_DT         0.000000
CRM_ATPT_CPTD_CD     0.000000
CMPLNT_TO_TM         0.000000
LAW_CAT_CD           0.000000
CMPLNT_FR_TM         0.000000
CMPLNT_FR_DT         0.000000
KY_CD                0.000000
PD_CD                0.000000
LOC_OF_OCCUR_DESC    0.000000
OFNS_DESC            0.000000
PD_DESC              0.000000
RPT_DT               0.000000
SUSP_AGE_GROUP       0.000000
SUSP_RACE            0.000000
SUSP_SEX             0.000000
VIC_AGE_GROUP        0.000000
VIC_RACE             0.000000
VIC_SEX              0.000000
CMPLNT_NUM           0.000000
dtype: float64

In [31]:
eli5.show_weights(
    permuter, 
    top=None, # show permutation importances for all features
    feature_names=feature_names
)

Weight,Feature
0.2512  ± 0.0039,ADDR_PCT_CD
0.0095  ± 0.0014,PATROL_BORO
0.0017  ± 0.0003,Longitude
0.0001  ± 0.0001,Latitude
0.0001  ± 0.0001,Y_COORD_CD
0.0001  ± 0.0001,Lat_Lon
0.0001  ± 0.0001,X_COORD_CD
0.0000  ± 0.0000,PREM_TYP_DESC
0  ± 0.0000,JURISDICTION_CODE
0  ± 0.0000,JURIS_DESC


In [32]:
# okay lets drop the map/location related columns. i want to see if certain crimes happen in certain boro's
df = df.drop(columns=['ADDR_PCT_CD', 'PATROL_BORO', 'Longitude', 'Latitude', 'Y_COORD_CD', 'Lat_Lon', 'X_COORD_CD'])

In [33]:
# rerun code
y = df['BORO_NM']
X = df.drop(columns='BORO_NM')

In [34]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)

In [35]:
train, test = train_test_split(df)

In [36]:
train, val = train_test_split(train, train_size=0.80, test_size=0.20, 
                              stratify=train['BORO_NM'], random_state=42)

In [37]:
target = 'BORO_NM'
X_train = train.drop(columns=target)
y_train = train[target]
X_val = val.drop(columns=target)
y_val = val[target]
X_test = test

In [38]:
transformers = make_pipeline(
    ce.OrdinalEncoder(), 
    SimpleImputer(strategy='median')
)

X_train_transformed = transformers.fit_transform(X_train)
X_val_transformed = transformers.transform(X_val)

model = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
model.fit(X_train_transformed, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=-1, oob_score=False, random_state=42, verbose=0,
                       warm_start=False)

In [39]:
import eli5
from eli5.sklearn import PermutationImportance

permuter = PermutationImportance(
    model, 
    scoring='accuracy', 
    n_iter=5, 
    random_state=42
)

permuter.fit(X_val_transformed, y_val)

PermutationImportance(cv='prefit',
                      estimator=RandomForestClassifier(bootstrap=True,
                                                       class_weight=None,
                                                       criterion='gini',
                                                       max_depth=None,
                                                       max_features='auto',
                                                       max_leaf_nodes=None,
                                                       min_impurity_decrease=0.0,
                                                       min_impurity_split=None,
                                                       min_samples_leaf=1,
                                                       min_samples_split=2,
                                                       min_weight_fraction_leaf=0.0,
                                                       n_estimators=100,
                                                     

In [40]:
feature_names = X_val.columns.tolist()
pd.Series(permuter.feature_importances_, feature_names).sort_values(ascending=False)

VIC_RACE             0.048056
PREM_TYP_DESC        0.044687
SUSP_RACE            0.027539
VIC_SEX              0.010652
LOC_OF_OCCUR_DESC    0.006759
OFNS_DESC            0.005720
PD_CD                0.004987
SUSP_AGE_GROUP       0.004145
JURIS_DESC           0.003084
PD_DESC              0.003051
CMPLNT_FR_TM         0.002439
SUSP_SEX             0.002144
KY_CD                0.001597
CMPLNT_TO_TM         0.001597
VIC_AGE_GROUP        0.000951
RPT_DT               0.000897
JURISDICTION_CODE    0.000711
CRM_ATPT_CPTD_CD    -0.000448
CMPLNT_NUM          -0.000733
LAW_CAT_CD          -0.001225
CMPLNT_TO_DT        -0.001619
CMPLNT_FR_DT        -0.001903
dtype: float64

In [41]:
eli5.show_weights(
    permuter, 
    top=None, # show permutation importances for all features
    feature_names=feature_names
)

Weight,Feature
0.0481  ± 0.0060,VIC_RACE
0.0447  ± 0.0018,PREM_TYP_DESC
0.0275  ± 0.0044,SUSP_RACE
0.0107  ± 0.0047,VIC_SEX
0.0068  ± 0.0019,LOC_OF_OCCUR_DESC
0.0057  ± 0.0037,OFNS_DESC
0.0050  ± 0.0024,PD_CD
0.0041  ± 0.0024,SUSP_AGE_GROUP
0.0031  ± 0.0020,JURIS_DESC
0.0031  ± 0.0037,PD_DESC


In [42]:
# okay, now let's try and fit a model with this new information, will try to avoid police codes

target = 'BORO_NM'
features = ['VIC_RACE', 'PREM_TYP_DESC', 'SUSP_RACE', 'VIC_SEX', 'LOC_OF_OCCUR_DESC', 'OFNS_DESC', 'SUSP_AGE_GROUP', 'CMPLNT_FR_TM', 'SUSP_SEX']

X_train = train[features]
y_train = train[target]

X_test = test[features]
y_test = test[target]

In [43]:
pipeline = make_pipeline(
    ce.OrdinalEncoder(), 
    SimpleImputer(strategy='median'), 
    RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
)

pipeline.fit(X_train,y_train)
print('Validation Accuracy', pipeline.score(X_test, y_test))

Validation Accuracy 0.3660345167005709


In [None]:
# it's an improvement from the baseline, work on it further - moving on for now. think about potential feature engineering.

In [56]:
target = 'BORO_NM'
X_train = train.drop(columns=target)
y_train = train[target]
X_val = val.drop(columns=target)
y_val = val[target]
X_test = test

In [58]:
print('X_train shape', X_train.shape)
print('X_val shape', X_val.shape)
print('X_test shape', X_test.shape)

X_train shape (73144, 22)
X_val shape (18287, 22)
X_test shape (30478, 23)


In [57]:
import category_encoders as ce
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from xgboost import XGBClassifier

processor = make_pipeline(
    ce.OrdinalEncoder(), 
    SimpleImputer(strategy='median')
)

X_train_processed = processor.fit_transform(X_train)
X_val_processed = processor.transform(X_val)

eval_set = [(X_train_processed, y_train), 
            (X_val_processed, y_val)]

model = XGBClassifier(n_estimators=1000, n_jobs=-1)
model.fit(X_train_processed, y_train, eval_set=eval_set, eval_metric='auc', 
          early_stopping_rounds=10)

XGBoostError: [08:15:03] src/metric/rank_metric.cc:150: Check failed: preds.Size() == info.labels_.Size() (365720 vs. 73144) : label size predict size not match

In [52]:
row = X_test.iloc[[4]]

In [53]:
row

Unnamed: 0,VIC_RACE,PREM_TYP_DESC,SUSP_RACE,VIC_SEX,LOC_OF_OCCUR_DESC,OFNS_DESC,SUSP_AGE_GROUP,CMPLNT_FR_TM,SUSP_SEX
11976,WHITE,STREET,WHITE,F,FRONT OF,ASSAULT 3 & RELATED OFFENSES,45-64,09:05:00,F


In [54]:
import shap

explainer = shap.TreeExplainer(pipeline)
row_processed = processor.transform(row)
shap_values = explainer.shap_values(row_processed)

shap.initjs()
shap.force_plot(
    base_value=explainer.expected_value, 
    shap_values=shap_values, 
    features=row
)

Exception: Model type not yet supported by TreeExplainer: <class 'sklearn.pipeline.Pipeline'>