# Notes
\
From TS:
> "This is a great start! I'd love to see the distribution of outcomes being predicted. Specifically **wondering how unbalanced the labels are (basically how rare are sustained findings)**, to think about sampling in such a way to **oversample the sustained findings for the train split**. And of course **more fine-grained features**, currently we have a single T/F for any of a collection of keywords expected to be relevant, but perhaps (1) distinct features for different keywords/phrases, or (2) denser topic models or (3) other embeddings. Just a thought."

Next to explore

> (2) denser topic models
>
> (3) other embeddings

# setup - general

In [1]:
# dependencies
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

In [2]:
# support methods
def group_finding(x):
    if x.sustained | x.mediated: return True
    return False


def combine_indicators(x):
    true = sorted([c for c in top_inds if x[c] == True])
    if not any(true): return 'other_factor(s)'
    return "|".join(true)

In [3]:
# main
raw = pd.read_parquet("../../export/output/complaints.parquet")

basecols = ['allegation_id', 'allegations', 'finding',]
# metadata about the complaint/allegation
meta = ['report_type', 'dpa_added', 'occ_added']
# phrases that suggest something about the investigation into the allegation
proc = ['no_officer_id', 'default_finding', 'withdrawn', 'jlp', 'bwc', 'mediated']
# potentials values for use of force allegation: CA P.C. 835, SFPD G.O. 5.01
kw_g1 = ['intimidation', 'racial_bias', 'resisting', 'force', 'pursuit', 'swat', 'firearm', 'taser', 'crisis']
kw_g2 = ['home', 'minor',]
raw['uof_kw'] = raw[kw_g1].apply(lambda x: any(x.values), axis=1)

In [4]:
raw['eligible'] = ~(raw.dpa_added | raw.occ_added | raw.withdrawn)
raw['sus_or_med'] = raw.apply(group_finding, axis=1)

# data subset options

In [5]:
categories = [
    'category_of_conduct',
    'finding'
]
base_ind = [c for c in raw.columns if 'wo_cause' in c]
indicators = base_ind + [
    'bias',
    'bwc',
    'crisis',
    'dishonesty',
    'failed_reqmt',
    'firearm',
    'force',
    'inapp_action',
    'jlp',
    'malignant_action',
    'minor',
    'pursuit',
    'racial_bias',
    'resisting',
    'sus_or_med',
    'unnec_force',
]
includecols = ['allegation_id',] + categories + indicators

In [6]:
# subset the data with mostly-cleaned allegations
dpa = raw.loc[raw.eligible, includecols].drop_duplicates().dropna()
picky = raw.loc[(
    raw.eligible) & (
    raw.finding.isin(raw.finding.value_counts().head(5).index)),
    includecols].sample(10000)

In [7]:
top_inds = [
    col for col,highprop in (
    dpa[indicators].sum() > 3000).to_dict().items()
    if highprop]
focuscols = ['allegation_id', 'sus_or_med'] + top_inds
dpa['factor_group'] = dpa[top_inds].apply(
    lambda x: combine_indicators(x), axis=1)

# setup - model

In [8]:
npos = dpa.sus_or_med.sum()
pos = dpa.loc[dpa.sus_or_med, focuscols]
neg = dpa.loc[~(dpa.sus_or_med), focuscols].sample(npos)
truesplit = pd.concat([pos, neg])

train, test = train_test_split(
    truesplit,
    test_size=.4, train_size=.6,
    random_state=42, shuffle=True)

In [9]:
model = LogisticRegression()

x_train = train[top_inds].to_numpy()
y_train = train.sus_or_med.values.reshape(-1, 1)
x_test = test[top_inds].to_numpy()
y_test = test.sus_or_med.values.reshape(-1, 1)

model.fit(X=x_train, y=np.ravel(y_train,order="c"))

test['predicted_finding'] = model.predict(X=x_test)

In [10]:
print("Accuracy:", accuracy_score(y_test, test.predicted_finding))
print(classification_report(y_test, test.predicted_finding))

Accuracy: 0.6247113163972287
              precision    recall  f1-score   support

       False       0.70      0.43      0.53       858
        True       0.59      0.81      0.69       874

    accuracy                           0.62      1732
   macro avg       0.64      0.62      0.61      1732
weighted avg       0.64      0.62      0.61      1732



# Preview data

In [11]:
dpa.shape[0]

23327

In [12]:
picky.shape[0]

10000

> **wondering how unbalanced the labels are (basically how rare are sustained findings)**

In [13]:
dpa.sus_or_med.value_counts(normalize=True)

sus_or_med
False    0.907232
True     0.092768
Name: proportion, dtype: float64

> think about sampling in such a way to **oversample the sustained findings for the train split**

In [14]:
truesplit.sus_or_med.value_counts(normalize=True)

sus_or_med
True     0.5
False    0.5
Name: proportion, dtype: float64

In [15]:
train.sus_or_med.value_counts(normalize=True)

sus_or_med
False    0.503082
True     0.496918
Name: proportion, dtype: float64

> **more fine-grained features**, currently we have a single T/F for any of a collection of keywords expected to be relevant, but perhaps (1) distinct features for different keywords/phrases

In [17]:
dpa[indicators].sum().sort_values(ascending=False)

failed_reqmt             6292
action_wo_cause          6090
inapp_action             5843
bwc                      3269
force                    3236
jlp                      3101
sus_or_med               2164
unnec_force              1989
detain_wo_cause          1950
malignant_action         1241
arrest_wo_cause          1171
cite_wo_cause            1073
firearm                   924
search_wo_cause           898
bias                      716
racial_bias               520
resisting                 392
minor                     385
entry_wo_cause            362
towed_wo_cause            236
dishonesty                230
tookproperty_wo_cause     187
pursuit                   142
crisis                     26
dtype: int64

In [18]:
dpa[top_inds].sum().sort_values(ascending=False)

failed_reqmt       6292
action_wo_cause    6090
inapp_action       5843
bwc                3269
force              3236
jlp                3101
dtype: int64

In [19]:
dpa[top_inds].value_counts().head(10)#.sort_values(ascending=False)

action_wo_cause  bwc    failed_reqmt  force  inapp_action  jlp  
False            False  False         False  True          False    4629
True             False  False         False  False         False    4578
False            False  True          False  False         False    4381
                        False         False  False         False    2463
                                      True   False         False    1811
True             False  False         False  False         True      661
False            True   False         False  True          False     502
                 False  True          False  False         True      498
                 True   True          False  False         True      472
                                                           False     465
Name: count, dtype: int64

In [20]:
dpa[focuscols].sample().T

Unnamed: 0,15395
allegation_id,890d6e351425b85b
sus_or_med,False
action_wo_cause,False
bwc,False
failed_reqmt,False
force,True
inapp_action,False
jlp,False


\
**BP says:**
- I'm wondering if the `sustained` indicator is being calculated accurately
- This looks like the thing I might've broken / not fixed last time related to the `findings` regrouping

In [21]:
raw[['sustained', 'sus_or_med', 'finding']].value_counts().head(10)

sustained  sus_or_med  finding                     
False      False       NS                              10071
                       Proper Conduct                   6338
                       NF                               2363
                       Unfounded                        2320
True       True        Sustained                        1143
False      True        Mediated                         1041
           False       NF/W                              883
                       Insufficient Evidence             647
                       Improper Conduct (Sustained)      210
True       True        Policy Failure                    171
Name: count, dtype: int64

# Review predictions

In [29]:
test.predicted_finding.value_counts(normalize=True)

predicted_finding
True     0.692841
False    0.307159
Name: proportion, dtype: float64

In [30]:
test[['sus_or_med', 'predicted_finding']].value_counts()

sus_or_med  predicted_finding
True        True                 712
False       True                 488
            False                370
True        False                162
Name: count, dtype: int64

In [24]:
truepos =  test.loc[(test.sus_or_med) & (test.predicted_finding)]
falsepos = test.loc[~(test.sus_or_med) & (test.predicted_finding)]
trueneg =  test.loc[~(test.sus_or_med | test.predicted_finding)]
falseneg = test.loc[(test.sus_or_med) & ~(test.predicted_finding)]

In [None]:
truepos.shape[0], falsepos.shape[0], trueneg.shape[0], falseneg.shape[0]

In [67]:
truepos_n = truepos[top_inds].sum(
    ).to_frame().rename(columns={0: 'n_truepos'}).T
truepos_perc = truepos[top_inds].sum().apply(
    lambda x: f"{x/truepos.shape[0]*100:.1f}%"
    ).to_frame().rename(columns={0: 'perc_truepos'}).T
trueneg_n = trueneg[top_inds].sum(
    ).to_frame().rename(columns={0: 'n_trueneg'}).T
trueneg_perc = trueneg[top_inds].sum().apply(
    lambda x: f"{x/trueneg.shape[0]*100:.1f}%"
    ).to_frame().rename(columns={0: 'perc_trueneg'}).T
falsepos_n = falsepos[top_inds].sum(
    ).to_frame().rename(columns={0: 'n_falsepos'}).T
falsepos_perc = falsepos[top_inds].sum().apply(
    lambda x: f"{x/falsepos.shape[0]*100:.1f}%"
    ).to_frame().rename(columns={0: 'perc_falsepos'}).T
falseneg_n = falseneg[top_inds].sum(
    ).to_frame().rename(columns={0: 'n_falseneg'}).T
falseneg_perc = falseneg[top_inds].sum().apply(
    lambda x: f"{x/falseneg.shape[0]*100:.1f}%"
    ).to_frame().rename(columns={0: 'perc_falseneg'}).T
results = pd.concat([
    truepos_n, truepos_perc,
    trueneg_n, trueneg_perc,
    falsepos_n, falsepos_perc,
    falseneg_n, falseneg_perc])

In [70]:
results.T.sort_values('n_truepos', ascending=False)

Unnamed: 0,n_truepos,perc_truepos,n_trueneg,perc_trueneg,n_falsepos,perc_falsepos,n_falseneg,perc_falseneg
failed_reqmt,325,45.6%,71,19.2%,146,29.9%,17,10.5%
inapp_action,225,31.6%,57,15.4%,161,33.0%,13,8.0%
action_wo_cause,164,23.0%,54,14.6%,181,37.1%,9,5.6%
force,46,6.5%,106,28.6%,16,3.3%,49,30.2%
bwc,0,0.0%,140,37.8%,0,0.0%,46,28.4%
jlp,0,0.0%,143,38.6%,0,0.0%,0,0.0%


In [75]:
truepos[top_inds].sum()

action_wo_cause    164
bwc                  0
failed_reqmt       325
force               46
inapp_action       225
jlp                  0
dtype: int64