In [83]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
import matplotlib.pyplot as plt
import numpy as np
from catboost import CatBoostRegressor
from sklearn.metrics import mean_absolute_error

In [84]:
# Locations of the processed officer-profile tables (2010-2014 training directory and Testing directory).
TRAIN_PROFILE_CSV = "/Users/jeremyhudsonchan/Dropbox/Files/Boston_College_Courses/Thesis/Data/Processed/2010-2014/train/officer_profile.csv"
TEST_PROFILE_CSV = "/Users/jeremyhudsonchan/Dropbox/Files/Boston_College_Courses/Thesis/Data/Processed/Testing/officer_profile.csv"
# NOTE(review): these are absolute paths on one machine; consider a configurable data directory.
train = pd.read_csv(TRAIN_PROFILE_CSV, low_memory=False)
test = pd.read_csv(TEST_PROFILE_CSV, low_memory=False)

In [85]:
# Remove the officers' first/last name columns from both tables.
NAME_COLUMNS = ['OfficerFirst', 'OfficerLast']
train = train.drop(NAME_COLUMNS, axis=1)
test = test.drop(NAME_COLUMNS, axis=1)

In [86]:
# Display the column labels of both frames so their schemas can be compared.
tuple(frame.columns for frame in (train, test))

(Index(['OfficerID', 'Gender', 'Race', 'ApptDate', 'Rank', 'Age', 'Beat',
        'historic_counts', 'prev_allegations', 'new_allegations'],
       dtype='object'),
 Index(['OfficerID', 'Gender', 'Race', 'ApptDate', 'Rank', 'Age', 'Beat',
        'historic_counts', 'prev_allegations', 'new_allegation'],
       dtype='object'))

In [87]:
# Keep only training rows that have a Beat value; print the shape before and after.
train_with_beat = train.dropna(axis="index", subset=["Beat"])
print(train.shape, train_with_beat.shape, sep="\n")
train = train_with_beat

(50218, 10)
(14021, 10)


In [88]:
# Keep only test rows that have a Beat value; print the shape before and after.
test_with_beat = test.dropna(axis="index", subset=["Beat"])
print(test.shape, test_with_beat.shape, sep="\n")
test = test_with_beat

(50218, 10)
(15380, 10)


In [89]:
# Discard every row that still has a missing value in any column.
train, test = (frame.dropna() for frame in (train, test))

In [90]:
# Store Beat as an integer column in both frames.
for frame in (train, test):
    frame["Beat"] = frame["Beat"].astype(int)

In [91]:
# Parse the ApptDate column of both frames into pandas datetimes.
train["ApptDate"], test["ApptDate"] = (
    pd.to_datetime(frame["ApptDate"]) for frame in (train, test)
)

In [92]:
# Re-inspect the column labels of both frames; the target column name still differs between them here.
(train.columns, test.columns)

(Index(['OfficerID', 'Gender', 'Race', 'ApptDate', 'Rank', 'Age', 'Beat',
        'historic_counts', 'prev_allegations', 'new_allegations'],
       dtype='object'),
 Index(['OfficerID', 'Gender', 'Race', 'ApptDate', 'Rank', 'Age', 'Beat',
        'historic_counts', 'prev_allegations', 'new_allegation'],
       dtype='object'))

In [93]:
# Convert ApptDate from datetime to its integer representation.
# "int64" is spelled out because plain `int` can resolve to a 32-bit type on some
# platforms (e.g. Windows with NumPy 1.x), and pandas refuses to cast datetime64
# values to an integer type narrower than 64 bits.
# NOTE(review): the resulting integers are presumably nanoseconds since the Unix epoch — confirm for the pandas version in use.
train["ApptDate"] = train["ApptDate"].astype("int64")
test["ApptDate"] = test["ApptDate"].astype("int64")

In [94]:
# The test frame calls the target "new_allegation"; rename it to match the training frame's "new_allegations".
test = test.rename({"new_allegation": "new_allegations"}, axis="columns")

In [95]:
# Split each frame into a feature matrix (all other columns) and the target column.
TARGET = 'new_allegations'
X_train, y_train = train.drop(columns=[TARGET]), train[TARGET]
X_test, y_test = test.drop(columns=[TARGET]), test[TARGET]

In [96]:
# Columns passed to CatBoost as categorical features.
# NOTE(review): ApptDate has been cast to an integer timestamp by this point, so treating it as a category looks unusual — confirm this is intended.
CAT_FEATURES = ['Gender', 'Race', 'ApptDate', 'Beat', "Rank"]
# A deliberately tiny regressor: 2 boosting iterations, tree depth 2, learning rate 1, RMSE loss.
model = CatBoostRegressor(
    loss_function='RMSE',
    iterations=2,
    depth=2,
    learning_rate=1,
    cat_features=CAT_FEATURES,
)

In [97]:
# Train the regressor on the training features and target; the fitted model is the cell's displayed value.
model.fit(X_train, y=y_train)

0:	learn: 2.3836423	total: 3.21ms	remaining: 3.21ms
1:	learn: 2.3343250	total: 6ms	remaining: 0us


<catboost.core.CatBoostRegressor at 0x1507132e0>

In [98]:
# One importance score per feature, in the column order of X_train.
feature_importance = model.get_feature_importance()
feature_importance

array([ 0.        ,  0.58661328,  0.        ,  0.        ,  0.        ,
       17.3249927 ,  0.        ,  0.        , 82.08839403])

In [99]:
# Predict on the test features and round each prediction to the nearest integer,
# then report the mean absolute error against the actual targets.
y_pred = model.predict(X_test).round()
mean_absolute_error(y_true=y_test, y_pred=y_pred)

1.4763245792127841

In [100]:
# Turn predictions and actual targets into single-column frames named "y_pred" and "y_test".
# Both get a fresh 0..n-1 index so the column-wise concat pairs rows by position.
y_pred = pd.DataFrame(y_pred).rename(columns={0: "y_pred"}).reset_index(drop=True)
y_test = pd.DataFrame(y_test).rename(columns={"new_allegations": "y_test"}).reset_index(drop=True)
# From here on, y_pred is the combined two-column comparison frame.
y_pred = pd.concat([y_pred, y_test], axis="columns")
print(y_pred.shape)

(15269, 2)


In [101]:
# Print the first 30 rows of the prediction-vs-actual comparison frame.
print(y_pred.iloc[:30])

    y_pred  y_test
0      0.0     0.0
1      0.0     0.0
2      4.0     0.0
3      0.0     0.0
4      1.0     1.0
5      1.0     0.0
6      1.0     0.0
7      0.0     0.0
8      1.0     0.0
9      1.0     0.0
10     4.0     1.0
11     1.0     0.0
12     4.0     0.0
13     1.0     0.0
14     0.0     0.0
15     1.0     0.0
16     0.0     0.0
17     0.0     0.0
18     1.0     1.0
19     0.0     0.0
20     1.0     0.0
21     0.0     1.0
22     1.0     0.0
23     0.0     0.0
24     0.0     0.0
25     1.0     0.0
26     0.0     0.0
27     1.0     0.0
28     2.0     0.0
29     4.0     1.0
