TODO: this is not as expected - `dpa.finding.value_counts()`
- we should be grouping values like "IC/S" as "Improper Conduct; Sustained" or something better explained
this will affect classification accuracy negatively

# setup

### general

In [39]:
# dependencies
import re
import pandas as pd
import spacy
import spacy_udpipe
from rake_spacy import Rake

In [2]:
# support methods
def findkwphrase(chunks, skw, ekw):
    si = [i for i in range(len(chunks)) if (chunks[i] == skw.text) | (skw.text in chunks[i])][0]
    ei = [i for i in range(len(chunks)) if (chunks[i] == ekw.text) | (ekw.text in chunks[i])][0]
    if ei + 1 >= len(chunks): return ' '.join(chunks[si:])
    else: return ' '.join(chunks[si: ei + 1])


def findkwpos(pos, kws, skw, ekw):
    si = [i for i in range(len(kws)) if kws[i] == skw][0]
    ei = [i for i in range(len(kws)) if kws[i] == ekw][0]
    kwpos = pos[si: ei + 1]
    return kwpos


def findinfo(row):
    assert len(row.doc) == len(row.pos)
    chunks = row.doc.text.split()
    kws = row.kws
    skw, ekw = kws[0], kws[-1]
    phrase = findkwphrase(chunks=chunks, skw=skw, ekw=ekw)
    pos = findkwpos(pos=row.pos, kws=kws, skw=skw, ekw=ekw)
    return phrase, pos

def get_corecols():
    return [\
    'complaint_id', 'allegation_id', \
    'date_complained', 'date_completed', 'year_complained', 'year_completed',\
    'time_to_complete', 'ttc_group',\
    'report_type', 'n_complaint_pages',\
    'dpa_added', 'occ_added',\
    'allegations', 'findings_of_fact',\
    'category_of_conduct', 'finding', 'sustained', 'mediated', 'mediation_status', \
    'complaint_meta',\
    'allegation_text', \
    'pdf_url'
]


def get_kwcols():
    return [
        'allegation_id',
        'named_officers', 'no_officer_id',
        'default_finding', 'jlp',
        'resisting', 'force', 'bwc',
        'intimidation', 'racial_bias',
        'pursuit', 'swat', 'firearm', 'taser',
        'home', 'minor', 'crisis', 'missing_person',
        'action_wo_cause',
        'entry_wo_cause', 'search_wo_cause', 'towed_wo_cause', 'tookproperty_wo_cause',
        'cite_wo_cause', 'detain_wo_cause', 'arrest_wo_cause',
        'display_weapon', 'unnec_force', 'malignant_action',
        'dishonesty', 'bias',
        'inapp_action', 'malignant_action', 'failed_reqmt',
        'pdf_url'
    ]

In [3]:
# main
dpa = pd.read_parquet("../../export/output/complaints.parquet")
corecols = get_corecols()
kwcols = get_kwcols()

### batching & sampling

In [4]:
dpa.loc[dpa.finding.isin(('Proper Conduct', 'Sustained', 'Mediated')), [
    'allegation_id', 'allegations', 'finding', 'mediated']].dropna(subset='allegations').shape[0]

8521

In [5]:
# options for batching
# if processing data not in table
text = "\n\n".join(dpa.allegations.dropna().values)
batch0 = text[:100000]
# for processing as table
less = dpa[['allegation_id', 'allegations', 'finding', 'mediated']].head(1000)
# for a more selective table
picky = dpa.loc[dpa.finding.isin(dpa.finding.value_counts().head(5).index), [
    'allegation_id', 'allegations', 'finding', 'mediated']].dropna(subset='allegations').sample(10000)

### models

In [6]:
# p sure these nlp opts the same, at least as far as this example goes
#nlp = spacy_udpipe.load(lang="en")
nlp = spacy.load("en_core_web_sm")
r = Rake(nlp=nlp)

# process

### apply NLP model & unpack `spacy.tokens.doc.Doc` object

In [7]:
picky['doc'] = picky.allegations.apply(nlp)
picky['ents'] = picky.doc.apply(lambda x: x.ents if any(x.ents) else None)
picky['json'] = picky.doc.apply(lambda x: x.to_json())
picky['tokens'] = picky.json.apply(lambda x: x['tokens'])
picky['lemmas'] = picky.tokens.apply(lambda x: [token['lemma'] for token in x])
picky['pos'] = picky.tokens.apply(lambda x: [token['pos'] for token in x])
picky['dep'] = picky.tokens.apply(lambda x: [token['dep'] for token in x])

### apply `Rake` model & unpack

In [8]:
picky['ranklist'] = picky.doc.apply(lambda x: r.apply(x))

In [9]:
focus = picky[['allegation_id', 'doc', 'finding', 'mediated', 'pos', 'ranklist']].explode('ranklist')
focus['rank'] = focus.ranklist.apply(lambda x: x[0])
focus['kws'] = focus.ranklist.apply(lambda x: x[1])
focus['nkws'] = focus.kws.apply(len)
focus['kwinfo'] = focus[['doc', 'pos', 'kws']].apply(lambda x: findinfo(x), axis=1)
focus['kwphrase'] = focus.kwinfo.apply(lambda x: x[0])
focus['kwpos'] = focus.kwinfo.apply(lambda x: x[1])
intpos = ("NOUN", "ADJ", "VERB")
focus['posint'] = focus.kwpos.apply(lambda x: any((pos in intpos for pos in x)))
cands = focus.loc[(focus.nkws > 2) & (focus.posint)]

In [11]:
dpa[corecols].sample().T

Unnamed: 0,227
complaint_id,6b8ba8e0_38
allegation_id,01f4ce6eae57f5b1
date_complained,2005-05-11 00:00:00
date_completed,2005-10-24 00:00:00
year_complained,2005.0
year_completed,2005.0
time_to_complete,166 days 00:00:00
ttc_group,3 to 6 months
report_type,OCC
n_complaint_pages,2


In [12]:
picky.sample().T

Unnamed: 0,23570
allegation_id,d34447df77519be3
allegations,The officers searched the complainant’s reside...
finding,NS
mediated,False
doc,"(The, officers, searched, the, complainant, ’s..."
ents,
json,{'text': 'The officers searched the complainan...
tokens,"[{'id': 0, 'start': 0, 'end': 3, 'tag': 'DT', ..."
lemmas,"[the, officer, search, the, complainant, ’s, r..."
pos,"[DET, NOUN, VERB, DET, NOUN, PART, NOUN, CCONJ..."


In [13]:
cands.sample().T

Unnamed: 0,19409
allegation_id,adb134e5f19293c1
doc,"(The, officer, drove, improperly, .)"
finding,Proper Conduct
mediated,False
pos,"[DET, NOUN, VERB, ADV, PUNCT]"
ranklist,"(3.0, (officer, drove, improperly))"
rank,3.0
kws,"(officer, drove, improperly)"
nkws,3
kwinfo,"(officer drove improperly., [DET, NOUN, VERB])"


# Demo
### The `spacy.tokens.doc.Doc` object
As used in above code.

In [14]:
sample = cands.sample()

In [15]:
test = sample.doc.values[0]

In [16]:
test

The officer failed to properly process property.

In [17]:
type(test)

spacy.tokens.doc.Doc

In [18]:
test.ents

()

In [19]:
testjson = test.to_json()

In [20]:
testjson.keys()

dict_keys(['text', 'ents', 'sents', 'tokens'])

In [21]:
testjson['text']

'The officer failed to properly process property.'

In [22]:
len(testjson['text'])

48

In [23]:
testjson['sents']

[{'start': 0, 'end': 48}]

In [24]:
testjson['tokens'][0]

{'id': 0,
 'start': 0,
 'end': 3,
 'tag': 'DT',
 'pos': 'DET',
 'morph': 'Definite=Def|PronType=Art',
 'lemma': 'the',
 'dep': 'det',
 'head': 1}

In [25]:
[token['lemma'] for token in testjson['tokens']]

['the', 'officer', 'fail', 'to', 'properly', 'process', 'property', '.']

In [26]:
[token['pos'] for token in testjson['tokens']]

['DET', 'NOUN', 'VERB', 'PART', 'ADV', 'VERB', 'NOUN', 'PUNCT']

In [27]:
[token['dep'] for token in testjson['tokens']]

['det', 'nsubj', 'ROOT', 'aux', 'advmod', 'xcomp', 'dobj', 'punct']

# keyword rake

In [28]:
ranklist = r.apply(test)

In [29]:
ranklist

[(3.0, properly process property), (2.0, officer failed)]

In [30]:
sample.ranklist

17645    (3.0, (properly, process, property))
Name: ranklist, dtype: object

# review candidates

In [31]:
cands.nkws.value_counts()

nkws
3    1635
4     421
5      54
6      15
7       1
Name: count, dtype: int64

In [41]:
cands.loc[(cands.finding.str.contains("sust", flags=re.I)), 'kwphrase'].value_counts().head(10)

kwphrase
officer behaved inappropriately                      12
Body Worn Cameras.                                   10
properly process property.                            8
officer behaved inappropriately.                      6
Language Access Services                              4
accurate incident report.                             4
Limited English Proficient                            4
Traffic Stop Data Collection Program Information.     3
officers unlawfully entered                           3
incomplete incident report.                           3
Name: count, dtype: int64

In [42]:
cands.loc[(cands.finding.str.contains("NS", flags=re.I)), 'kwphrase'].value_counts().head(10)

kwphrase
officer behaved inappropriately              119
officer behaved inappropriately.              78
properly process property.                    49
officer behaved inappropriately and/or        41
officers behaved inappropriately              32
officer exhibited inappropriate behavior.     26
officers behaved inappropriately.             23
biased policing based                         21
officer drove improperly.                     18
racially derogatory comment.                  17
Name: count, dtype: int64