In [412]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from catboost import CatBoostRegressor
from sklearn.metrics import mean_absolute_error

In [413]:
train = pd.read_csv("/Users/jeremyhudsonchan/Dropbox/Files/Boston_College_Courses/Thesis/Data/Processed/2010-2014/train/officer_profile.csv", low_memory=False)
test = pd.read_csv("/Users/jeremyhudsonchan/Dropbox/Files/Boston_College_Courses/Thesis/Data/Processed/Testing/officer_profile.csv", low_memory=False)

In [414]:
train = train.drop(columns = ['OfficerFirst', 'OfficerLast', 'ApptDate'])
test = test.drop(columns = ['OfficerFirst', 'OfficerLast', 'ApptDate'])

In [415]:
train.columns, test.columns

(Index(['OfficerID', 'Gender', 'Race', 'Rank', 'Age', 'Beat', 'historic_counts',
        'prev_allegations', 'new_allegations', 'ApptYear', 'YearsInForce'],
       dtype='object'),
 Index(['OfficerID', 'Gender', 'Race', 'Rank', 'Age', 'Beat', 'historic_counts',
        'prev_allegations', 'new_allegations', 'ApptYear', 'YearsInForce'],
       dtype='object'))

In [416]:
# if beat is nan, remove row
print(train.shape)
train = train.dropna(subset=['Beat'])
print(train.shape)

(50218, 11)
(14021, 11)


In [417]:
print(test.shape)
test = test.dropna(subset=['Beat'])
print(test.shape)

(50218, 11)
(15380, 11)


In [418]:
# if there are nan values, drop the row
train = train.dropna()
test = test.dropna()

In [419]:
train["Beat"] = train["Beat"].astype(int)
test["Beat"] = test["Beat"].astype(int)

In [420]:
train.columns, test.columns

(Index(['OfficerID', 'Gender', 'Race', 'Rank', 'Age', 'Beat', 'historic_counts',
        'prev_allegations', 'new_allegations', 'ApptYear', 'YearsInForce'],
       dtype='object'),
 Index(['OfficerID', 'Gender', 'Race', 'Rank', 'Age', 'Beat', 'historic_counts',
        'prev_allegations', 'new_allegations', 'ApptYear', 'YearsInForce'],
       dtype='object'))

In [421]:
print(train.head())

   OfficerID Gender           Race             Rank  Age  Beat  \
0          4      M          White  Po As Detective   80   321   
1         13      M          White   Police Officer   69  1511   
2         17      M          Black   Police Officer   64  2525   
3         19      M  Asian/Pacific   Police Officer   73  1122   
4         34      M          Black   Police Officer   49  1212   

   historic_counts  prev_allegations  new_allegations  ApptYear  YearsInForce  
0               71                 2                0    1969.0          41.0  
1               75                 2                1    1982.0          28.0  
2              260                25                0    1989.0          21.0  
3                2                 4                1    1999.0          11.0  
4                3                11                1    1998.0          12.0  


In [422]:
# change column name of new_allegation to new_allegations
test = test.rename(columns={"new_allegation": "new_allegations"})

In [423]:
# new_allegation is target
X_train = train.drop(columns = ['new_allegations'])
y_train = train['new_allegations']
X_test = test.drop(columns = ['new_allegations'])
y_test = test['new_allegations']

In [424]:
model = CatBoostRegressor(iterations=2, depth=2, learning_rate=1, loss_function='RMSE', cat_features=['Gender', 'Race', 'Beat', "Rank"])

In [425]:
model.fit(X_train, y_train)

0:	learn: 2.3755760	total: 1.83ms	remaining: 1.83ms
1:	learn: 2.3183029	total: 3.36ms	remaining: 0us


<catboost.core.CatBoostRegressor at 0x1473fbb50>

In [426]:
model.get_feature_importance()

array([ 1.96469004,  0.        ,  0.        ,  0.        , 18.05545158,
        0.        ,  0.        , 79.97985838,  0.        ,  0.        ])

In [427]:
# get predicited results, compare to actual results
y_pred = model.predict(X_test)
# round to nearest integer
y_pred = np.round(y_pred)
mean_absolute_error(y_test, y_pred)

1.4415482349859192

In [428]:
# make y_pred and y_test into a dataframe
predictions = pd.DataFrame(y_pred)
actual = pd.DataFrame(y_test)
# combine y_pred and y_test into one dataframe
predictions = pd.concat([predictions, actual], axis=1)
predictions.columns = ['Predicted', 'Actual']
print(y_pred.shape)

(15269,)


In [429]:
# drop NaN values
predictions = predictions.dropna()

In [430]:
# get top 1% of actual values
top_1 = predictions.nlargest(int(predictions.shape[0] * 0.01), 'Actual')
top_1

Unnamed: 0,Predicted,Actual
6537,1.0,9.0
12290,1.0,9.0
3584,1.0,8.0
4257,4.0,8.0
5193,1.0,8.0
...,...,...
194,3.0,3.0
237,1.0,3.0
244,1.0,3.0
248,1.0,3.0


In [431]:
# get mean squared error of this top 1%
mean_absolute_error(top_1['Actual'], top_1['Predicted'])

3.0816326530612246