In [43]:
# dependencies
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [2]:
# support methods

In [25]:
# main
# Load the exported complaints table (relative path).
dpa = pd.read_parquet("../../export/output/complaints.parquet")

basecols = ['allegation_id', 'allegations', 'finding',]
# metadata about the complaint/allegation
meta = ['report_type', 'dpa_added', 'occ_added']
# phrases that suggest something about the investigation into the allegation
proc = ['no_officer_id', 'default_finding', 'withdrawn', 'jlp', 'bwc', 'mediated']
# potential values for a use-of-force allegation: CA P.C. 835, SFPD G.O. 5.01
kw_g1 = ['intimidation', 'racial_bias', 'resisting', 'force', 'pursuit', 'swat', 'firearm', 'taser', 'crisis']
kw_g2 = ['home', 'minor',]

# Flag rows where any use-of-force keyword column is set. Vectorised row-wise OR
# instead of a Python-level apply.
# NOTE(review): assumes the kw_g1 columns are plain boolean flags without missing
# values -- TODO confirm.
dpa['uof_kw'] = dpa[kw_g1].any(axis=1)

# subset the data with mostly-cleaned allegations: keep only the five most
# frequent findings and drop rows without allegation text
top_findings = dpa.finding.value_counts().head(5).index
eligible = dpa.loc[dpa.finding.isin(top_findings)].dropna(subset=['allegations'])

# Seeded sample so that the split, the fitted model and the displayed rows are
# reproducible on Restart & Run All; capped so a smaller table does not raise.
picky = eligible.sample(n=min(10000, len(eligible)), random_state=42)

In [26]:
# 60/40 train/test partition of the sampled allegations, shuffled with a fixed seed
train, test = train_test_split(picky, train_size=.6, test_size=.4, shuffle=True, random_state=42)

In [27]:
# columns to show when inspecting rows: the base columns plus two flag columns
colgroup = [*basecols, 'dpa_added', 'uof_kw']

In [28]:
# preview the training rows, restricted to the inspection columns
train.loc[:, colgroup]

Unnamed: 0,allegation_id,allegations,finding,dpa_added,uof_kw
14218,7efd8ce74c9fa2f5,The officer conducted an incomplete investigat...,NS,False,False
26636,ed952af0588c7542,The officer failed to take required action.,Proper Conduct,False,False
24535,dbdd02473f97e705,The officer failed to take an Incident Report.,NS,False,False
7833,46197c25880d1fd2,The officer made inappropriate comments and be...,NS,False,False
1558,0e2a4470ada6f060,The officer handcuffed the complainant without...,Proper Conduct,False,False
...,...,...,...,...,...
15047,86152e6ce1a97413,The officer harassed the complainant’s son.,Unfounded,False,False
24927,df27de6042455e14,The officer cited the co-complainant without c...,NS,False,False
11922,6a7aaab75268c5b0,The officer spoke and behaved inappropriately.,Unfounded,False,False
26032,e8af03e38ec5f3e9,The officer used profanity toward the complain...,NS,False,False


In [30]:
# share of training rows with / without a use-of-force keyword flag
train['uof_kw'].value_counts(normalize=True)

uof_kw
False    0.801667
True     0.198333
Name: proportion, dtype: float64

In [31]:
# Logistic regression with scikit-learn's default settings; fitted in a later cell.
# NOTE(review): the recorded output at the end of this notebook shows every
# 'Sustained' test row predicted as 'NS'. With a single boolean feature the model
# may simply be predicting the most common finding -- confirm before interpreting.
model = LogisticRegression()

In [39]:
def _as_column(series):
    """Return the series' underlying values reshaped to a single column, shape (n, 1)."""
    return series.values.reshape(-1, 1)


# feature: the use-of-force keyword flag; target: the recorded finding
x_train, y_train = _as_column(train.uof_kw), _as_column(train.finding)
x_test, y_test = _as_column(test.uof_kw), _as_column(test.finding)

In [44]:
# the estimator expects a 1-D target, so flatten the (n, 1) column (C order is the default)
model.fit(X=x_train, y=y_train.ravel())

In [47]:
# Attach the predictions with .assign so a new frame is produced instead of
# writing a column into the frame returned by train_test_split (which can
# trigger pandas' SettingWithCopyWarning). Re-running this cell is idempotent.
test = test.assign(predicted_finding=model.predict(X=x_test))

In [49]:
# preview the held-out rows next to the model's predicted finding
test[[*colgroup, 'predicted_finding']]

Unnamed: 0,allegation_id,allegations,finding,dpa_added,uof_kw,predicted_finding
14305,7fb6c462a6b4e781,The officer failed to properly investigate.,Unfounded,False,False,NS
27753,f7a8d1a941292f41,The officers improperly seized the complainant...,Unfounded,False,False,NS
2086,12fa2e6ebd30cb5d,The officers failed to provide/summon medical ...,Proper Conduct,False,False,NS
9262,52c01bd445cec330,The officer made inappropriate comments and be...,NS,False,False,NS
5780,33ba6cc8ea583f5e,The officer failed to maintain required knowle...,Sustained,False,False,NS
...,...,...,...,...,...,...
19286,ac8c241f2503af98,The officers arrested the complainant without ...,Proper Conduct,False,False,NS
18657,a70baf41eb629bc2,The officer detained the complainant at gunpoi...,NS,False,True,NS
22235,c775936d8108ef20,The officer failed to properly investigate.,No Finding,False,False,NS
2014,124d91f5503de667,The officer failed to take required action.,Proper Conduct,False,True,NS


In [52]:
# full (untruncated) allegation text for one allegation id from the held-out set
test.loc[test['allegation_id'].eq('33ba6cc8ea583f5e'), 'allegations'].values

array(['The officer failed to maintain required knowledge.'], dtype=object)

In [55]:
# Sustained allegations whose predicted finding disagrees with the recorded one
is_sustained = test['finding'].eq('Sustained')
is_mispredicted = test['finding'].ne(test['predicted_finding'])
test.loc[is_sustained & is_mispredicted,
         ['allegation_id', 'allegations', 'finding', 'predicted_finding']]

Unnamed: 0,allegation_id,allegations,finding,predicted_finding
5780,33ba6cc8ea583f5e,The officer failed to maintain required knowle...,Sustained,NS
1688,0f70a29f9e2c4c44,The officers failed to take required action.,Sustained,NS
21096,bcec2a2054ffac56,The officer behaved inappropriately and made i...,Sustained,NS
16766,957b4b9559ad04c8,The officers failed to comply with DGO 7.04 Ch...,Sustained,NS
413,0390cd7e21d4ce7d,The officer failed to take required action,Sustained,NS
...,...,...,...,...
13785,7b3b75eb09039c8d,The officer misused police authority.,Sustained,NS
16584,93b38c76780e8619,The named officers failed to comply with DGO 1...,Sustained,NS
19346,ad32b1d9b332314f,The officer issued an invalid order.,Sustained,NS
20669,b9118d825b8b6dbc,"The officer failed to comply with DGO 10.11, B...",Sustained,NS


In [56]:
# distribution of predicted findings among rows whose recorded finding is 'Sustained'
test.loc[test['finding'].eq('Sustained'), 'predicted_finding'].value_counts()

predicted_finding
NS    223
Name: count, dtype: int64