In [1]:
# dependencies
import re
import pandas as pd

In [2]:
# support methods
def get_real_cols():
    """Return the ordered column names used to select the `data` frame.

    A fresh list is built on every call, so callers may mutate the result
    without affecting later calls.
    """
    columns = (
        'complaint_id',
        'allegation_id',
        'date_complained',
        'date_completed',
        'easy_date_complained',
        'easy_date_completed',
        'report_type',
        'n_complaint_pages',
        'dpa_added',
        'occ_added',
        'named_officers',
        'allegations',
        'findings_of_fact',
        'category_of_conduct',
        'finding',
        'sustained',
        'mediation_status',
        'outside_jurisdiction',
        'withdrawn',
        'complaint_meta',
        'allegation_text',
        'pdf_url',
    )
    return list(columns)


def get_kw_cols():
    """Return the ordered column names used to select the `kws` frame.

    A fresh list is built on every call, so callers may mutate the result
    without affecting later calls.
    """
    columns = (
        'allegation_id',
        'no_officer_id',
        'default_finding',
        'jlp',
        'resisting',
        'force',
        'bwc',
        'intimidation',
        'racial_bias',
        'pursuit',
        'swat',
        'firearm',
        'taser',
        'home',
        'minor',
        'crisis',
        'missing_person',
        'pdf_url',
    )
    return list(columns)

In [3]:
# main
# Column subsets for the two working frames.
datacols = get_real_cols()
kwcols = get_kw_cols()

# Two parquet inputs, both addressed relative to the notebook's directory.
complaints = pd.read_parquet("../../clean/output/complaints.parquet")
exported = pd.read_parquet("../output/complaints.parquet")

# Independent copies, so edits to `data` / `kws` never write back to `complaints`.
data = complaints.loc[:, datacols].copy()
kws = complaints.loc[:, kwcols].copy()

# Row subsets of `data` reused by the sections below (columns unchanged).
sustained = data[data["sustained"] == 1]
added = data[data["dpa_added"] | data["occ_added"]]
mediated = data[data["mediation_status"].notna()]
officers = data[data["named_officers"].notna()]

## unique allegations

In [4]:
# True when every row carries a distinct allegation_id (a missing id counts as one value).
data["allegation_id"].nunique(dropna=False) == len(data)

True

In [5]:
# Vectorised check; bool() keeps the displayed result a plain Python bool.
bool(data["allegation_id"].duplicated().any())

False

In [6]:
# Number of rows whose allegation_id repeats an earlier row's id.
data["allegation_id"].duplicated().sum()

0

In [7]:
# Allegation ids that occur more than once (an empty array when ids are unique).
dup_allids = data.loc[data.allegation_id.duplicated(), 'allegation_id'].unique()
# Show every row that shares a duplicated id, instead of filtering on a single
# hard-coded id; the result is an empty frame when there are no duplicates.
data.loc[data.allegation_id.isin(dup_allids)]

Unnamed: 0,complaint_id,allegation_id,date_complained,date_completed,easy_date_complained,easy_date_completed,report_type,n_complaint_pages,dpa_added,occ_added,...,findings_of_fact,category_of_conduct,finding,sustained,mediation_status,outside_jurisdiction,withdrawn,complaint_meta,allegation_text,pdf_url


In [8]:
# Same duplicate-id check, limited to rows where outside_jurisdiction is False.
bool(
    data.loc[data["outside_jurisdiction"].eq(False), "allegation_id"]
    .duplicated()
    .any()
)

False

In [9]:
# Does any allegation_text repeat an earlier row's text?
bool(data["allegation_text"].duplicated().any())

True

In [10]:
# Count the repeated texts; the first occurrence of each text is not included.
data["allegation_text"][data["allegation_text"].duplicated()].value_counts()

allegation_text
SUMMARY OF ALLEGATION #1: This complaint raises matters not rationally within DPA jurisdiction. CATEGORY OF CONDUCT:               FINDING:      IO-2       DEPT. ACTION: FINDINGS OF FACT: This complaint raises matters not rationally within DPA jurisdiction.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       

## rates of missing fields

In [11]:
# Per-column non-null counts and dtypes, used to read off how often each field is missing.
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 26740 entries, 0 to 27803
Data columns (total 22 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   complaint_id          26740 non-null  object        
 1   allegation_id         26740 non-null  object        
 2   date_complained       26530 non-null  object        
 3   date_completed        26552 non-null  object        
 4   easy_date_complained  26482 non-null  datetime64[ns]
 5   easy_date_completed   26499 non-null  datetime64[ns]
 6   report_type           26740 non-null  object        
 7   n_complaint_pages     26559 non-null  object        
 8   dpa_added             26740 non-null  bool          
 9   occ_added             26740 non-null  bool          
 10  named_officers        13 non-null     object        
 11  allegations           26740 non-null  object        
 12  findings_of_fact      26284 non-null  object        
 13  category_of_conduct  

## report type

In [12]:
# Number of allegations per report_type value.
data["report_type"].value_counts()

report_type
OCC    17259
DPA     9481
Name: count, dtype: int64

## complaint dates

In [13]:
# Range and quartiles of the parsed complaint dates.
data["easy_date_complained"].describe()

count                            26482
mean     2012-09-22 09:32:25.321350400
min                2001-07-17 00:00:00
25%                2007-01-26 00:00:00
50%                2013-11-16 00:00:00
75%                2018-03-02 00:00:00
max                2023-06-07 00:00:00
Name: easy_date_complained, dtype: object

## completed dates

In [14]:
# Range and quartiles of the parsed completion dates.
data["easy_date_completed"].describe()

count                            26499
mean     2013-05-16 01:17:00.129061376
min                2004-12-31 00:00:00
25%                2007-08-27 00:00:00
50%                2014-07-09 00:00:00
75%                2018-09-10 00:00:00
max                2023-06-23 00:00:00
Name: easy_date_completed, dtype: object

## sustained allegations

In [15]:
# Same per-column summary, restricted to the rows where `sustained == 1`.
sustained.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1144 entries, 222 to 27680
Data columns (total 22 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   complaint_id          1144 non-null   object        
 1   allegation_id         1144 non-null   object        
 2   date_complained       1136 non-null   object        
 3   date_completed        1132 non-null   object        
 4   easy_date_complained  1135 non-null   datetime64[ns]
 5   easy_date_completed   1132 non-null   datetime64[ns]
 6   report_type           1144 non-null   object        
 7   n_complaint_pages     1138 non-null   object        
 8   dpa_added             1144 non-null   bool          
 9   occ_added             1144 non-null   bool          
 10  named_officers        0 non-null      object        
 11  allegations           1144 non-null   object        
 12  findings_of_fact      1136 non-null   object        
 13  category_of_conduct 

## DPA- or OCC-added allegations

In [16]:
# `added` (built in the main cell) already holds the rows where dpa_added or
# occ_added is set, so read the allegation text straight from it.
added["allegation_text"].values

array(['SUMMARY OF DPA-ADDED ALLEGATIONS #1-3: The officers failed to activate their body-worn cameras as required. CATEGORY OF CONDUCT:                    ND        FINDING:           IC/S       DEPT. ACTION: FINDINGS OF FACT: During the investigation, DPA found that the named officers failed to timely activate his body-worn camera, as required. Named Officer #1 stated he activated his body-worn camera in the elevator on his way to speak with the complainant. He admitted that he did not activate his camera enroute to the scene. Named Officer #2 stated that he did not activate his body-worn camera enroute to this call and said he activated it in the elevator. He did not remember why he did not activate it while enroute but once he realized it was not activated, he immediately activated it in the elevator on the way to meet the complainant. The officer did state that he met with the front desk worker of the Hostel when he first arrived on scene. Named Officer #3 stated he did not activa