In [1]:
# dependencies
import pandas as pd
import pke

In [2]:
# support methods

In [3]:
# main
complaints = pd.read_parquet("../../export/output/complaints.parquet")
complaints['other_finding'] = ~(complaints.sustained | complaints.mediated)

In [4]:
idcols = [
    'fileid',
    'complaint_id',
    'allegation_id',
]
corecols = idcols + [
    'year_complained',
    'year_completed',
    'month_complained',
    'month_completed',
    'date_complained',
    'date_completed',
    'time_to_complete',
    'ttc_group',
    'report_type',
    'dpa_added',
    'occ_added',
    'outside_jurisdiction',
    'complaint_meta',
    'n_complaint_pages',
    'allegation_start_page',
    'allegation_text',
    'allegations',
    'category_of_conduct',
    'category_of_conduct_original',
    'finding',
    'finding_original',
    'findings_of_fact',
    'mediated',
    'sustained',
    'other_finding',
    'txt_file',
    'pdf_url',
]
indcols = idcols + [col for col in complaints.columns if col not in corecols]

In [5]:
complaints[corecols].sample().T

Unnamed: 0,14378
fileid,b51e32b1
complaint_id,b51e32b1_18
allegation_id,80f5603868f66dc2
year_complained,2013.0
year_completed,2014.0
month_complained,9.0
month_completed,5.0
date_complained,2013-09-12 00:00:00
date_completed,2014-05-29 00:00:00
time_to_complete,259 days 00:00:00


In [6]:
complaints.allegation_text.sample().values

array(['SUMMARY OF ALLEGATION #12: The officer wrote an inaccurate and incomplete statement. CATEGORY OF CONDUCT: ND                       FINDING: U             DEPT. ACTION: FINDINGS OF FACT: The co-complainant stated that the named officer’s statement attached to the incident report was inaccurate. She denied that the scene was out of control and denied that the officer had his star on his outermost clothing and identified himself. The officer denied the allegation. He stated he announced himself as a police officer and wore his police tar on his outermost clothing. A photo taken by a witness at the scene indicated the named officer had his police star on his outermost clothing while at the scene. Department records indicated that an officer on patrol viewed a physical altercation in progress and as the officer attempted to break up the altercation, other parties interfered prompting the officer to request additional units. The evidence proved that the act alleged in the complaint d

In [7]:
complaints[['allegation_text', 'sustained', 'mediated', 'finding']]

Unnamed: 0,allegation_text,sustained,mediated,finding
0,SUMMARY OF ALLEGATION #1: The officer detained...,False,True,Mediated
1,SUMMARY OF ALLEGATION #1: The officer detained...,False,False,NS
2,SUMMARY OF ALLEGATIONS #3 continued: At 9:33 p...,False,False,
3,SUMMARY OF ALLEGATIONS 1 and 2: The officers u...,False,False,NS
5,SUMMARY OF ALLEGATION #4: The officer lied. CA...,False,False,No Finding
...,...,...,...,...
28560,SUMMARY OF ALLEGATION #1: The officer failed t...,False,False,Proper Conduct
28561,SUMMARY OF ALLEGATIONS #3-4: The officers fail...,False,False,NS
28562,SUMMARY OF ALLEGATION #4: The officer conducte...,False,False,Withdrawn
28563,SUMMARY OF ALLEGATION #3: The officer used pro...,False,False,NS


In [18]:
# initialize keyphrase extraction model, here TopicRank
topic_extractor = pke.unsupervised.TopicRank()
multi_extractor = pke.unsupervised.MultipartiteRank()
textr_extractor = pke.unsupervised.TextRank()
topic2_extractor = pke.unsupervised.TopicalPageRank()

In [26]:
nrows = complaints.shape[0]
batches = []
for i in range(0, nrows, 900):
    if i + 900 > nrows: end = nrows
    else: end = i + 900
    newbatch = "\n".join(complaints.iloc[i:end].allegation_text.values)
    batches.append(newbatch)

In [60]:
def joinstr(x):
    if pd.isna(x.allegations):
        print("missing allegations")
    if pd.isna(x.findings_of_fact):
        print("missing findings of fact")
        print(x.allegation_id)
        print(x.allegation_text)
    return "\n".join((x.allegations, x.findings_of_fact))

In [None]:
complaints

In [62]:
assert not complaints.allegation_id.duplicated().any()

In [61]:
complaints[['allegation_id', 'allegations', 'allegation_text', 'findings_of_fact']].apply(
    lambda x: joinstr(x), axis=1
)

missing findings of fact
0002469b0c5c4fee
SUMMARY OF ALLEGATIONS #3 continued: At 9:33 p.m., the victim called 911, stated that she and her roommate had called earlier and that she wa getting more scared because of it being an escalating domestic situation. Dispatch informed her that the officers were on scene. One of the named officers understood the call for service involved a boyfriend and girlfriend arguing and the boyfriend was outside, ringing the doorbell. The officers found the ex-boyfriend sitting on the front teps of the apartment, between the locked metal gate and the front door. The officers pat-searched the ex-boyfriend for weapons and found none. One of the named officers recalled that the ex-boyfriend appeared somewhat under the influence. The victim spoke with one of the named officers, said that she and her ex-boyfriend had argued earlier in the evening and she had asked him to move out. According to one of the named officers, she denied that her ex-boyfriend was tryin

TypeError: sequence item 1: expected str instance, NoneType found

In [36]:
topic_extractor.candidate_selection?

[0;31mSignature:[0m [0mtopic_extractor[0m[0;34m.[0m[0mcandidate_selection[0m[0;34m([0m[0mpos[0m[0;34m=[0m[0;32mNone[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Selects longest sequences of nouns and adjectives as keyphrase
candidates.

Args:
    pos (set): the set of valid POS tags, defaults to ('NOUN',
        'PROPN', 'ADJ').
[0;31mFile:[0m      ~/opt/miniconda3/lib/python3.12/site-packages/pke/unsupervised/graph_based/topicrank.py
[0;31mType:[0m      method

In [28]:
topic_extractor.load_document(input=batches[0], language='en')
topic_extractor.candidate_selection()
topic_extractor.candidate_weighting()

In [32]:
keyphrases = topic_extractor.get_n_best(n=20)

In [33]:
keyphrases

[('officer', 0.09580989480011193),
 ('complainant', 0.07312538923822846),
 ('finding', 0.03664249718169942),
 ('allegation', 0.030333566540713026),
 ('conduct', 0.022895513113075607),
 ('action', 0.021692063980007243),
 ('fact', 0.020000942717285402),
 ('category', 0.019344470768015997),
 ('summary', 0.018834734372133982),
 ('dept', 0.01780724432152054),
 ('date', 0.01469508558882679),
 ('evidence', 0.011663137173917076),
 ('complaint', 0.011358533998833915),
 ('vehicle', 0.008251570790537162),
 ('witnesses', 0.00808166104391796),
 ('citizen complaints', 0.008010195787387124),
 ('incident', 0.007741829453879393),
 ('insufficient evidence', 0.007004951329708432),
 ('completion', 0.006630582415737723),
 ('act', 0.005197017012686152)]