# goal
\
demo the OEMC dataset ahead of analysis

- note that the way I format districts may not be the best approach. Currently, I:\
    (1) preserve the original field,\
    (2) add a numeric version that removes padding 0s and formats the value as an int,\
    (3) add a non-numeric version that preserves values with a potentially meaningful non-numeric pattern, e.g. "CW3".\
  The idea was to use the numeric version because it matches the district numbering in the open data portal, but that might not be the right call.
- note also that `keywords.yml` (loaded below as `kwrules`) is not an exhaustive grouping of the `init_type` and `fin_type` values in the data and could be expanded
- I don't know what "EL CHECK" is

# setup

In [1]:
# dependencies
import yaml
import re
import numpy as np
import pandas as pd

In [2]:
# support methods
def readyaml(fname):
    """Parse the YAML file at `fname` with `yaml.safe_load` and return the result."""
    with open(fname, 'r') as handle:
        return yaml.safe_load(handle)


# Exclusive upper bound for each time-to-dispatch bucket, checked in order.
_TTD_BUCKETS = (
    (pd.Timedelta(0), 'Dispatch before call'),
    (pd.Timedelta(minutes=5), 'Dispatch under 5 minutes'),
    (pd.Timedelta(minutes=15), 'Dispatch under 15 minutes'),
    (pd.Timedelta(minutes=30), 'Dispatch under 30 minutes'),
    (pd.Timedelta(minutes=60), 'Dispatch under 1 hour'),
    (pd.Timedelta(minutes=120), 'Dispatch under 2 hours'),
    (pd.Timedelta(minutes=360), 'Dispatch under 6 hours'),
    (pd.Timedelta(days=.5), 'Dispatch under 12 hours'),
    (pd.Timedelta(days=1), 'Dispatch under 24 hours'),
    (pd.Timedelta(days=2), 'Dispatch under 48 hours'),
)


def group_timedelta(td):
    """Label a call-to-dispatch duration with a human-readable bucket.

    Missing values map to 'No dispatch reported', negative durations to
    'Dispatch before call', and anything at or beyond two days to
    'Dispatch 48 hours or later'. Every other value gets the label of the
    first bucket whose upper bound it falls strictly below.
    """
    if pd.isna(td):
        return 'No dispatch reported'
    for upper_bound, label in _TTD_BUCKETS:
        if td < upper_bound:
            return label
    return 'Dispatch 48 hours or later'


def format_district(v):
    """Convert a zero-padded district code into its integer district number.

    Examples: "002" -> 2, "010" -> 10, "100" -> 100, "CW3" -> nan.

    Parameters
    ----------
    v : str
        Raw district value. Surrounding whitespace is ignored.

    Returns
    -------
    int or float
        The district number with leading zeros removed, or `np.nan` when
        the value is not a string, contains anything other than decimal
        digits, or consists only of zeros / is empty.
    """
    if not isinstance(v, str):
        return np.nan
    # Trim before validating so whitespace-padded codes still parse.
    digits = v.strip()
    if not digits.isdecimal():
        return np.nan
    # Only *leading* zeros are padding. Removing every "00" substring would
    # corrupt values such as "100" (-> 1) and crash on "00".
    digits = digits.lstrip('0')
    if not digits:
        # All-zero codes carry no district number; treat them as missing.
        return np.nan
    return int(digits)


# data prep ahead of analysis
def add_suppcols(df):
    """Return a copy of `df` with supplementary datetime, indicator and district columns.

    Reads `call_date`, `disp_date`, `close_date`, `init_type` and `district`
    from the input and leaves `df` itself untouched. Progress messages are
    printed as each group of columns is added.
    """
    def _nonnumeric_or_none(code):
        # Keep only district codes that are not purely digits (e.g. "CW3").
        return None if code.isdigit() else code

    prepped = df.copy()

    print('adding supplementary datetime fields')
    called = prepped['call_date']
    dispatched = prepped['disp_date']
    prepped['year_called'] = called.dt.year
    prepped['date_of_call'] = called.dt.date
    prepped['time_to_dispatch'] = dispatched - called
    prepped['ttd_group'] = prepped['time_to_dispatch'].apply(group_timedelta)

    print('adding supplemental indicator fields')
    prepped['dispatch_reported'] = dispatched.notna()
    prepped['closure_reported'] = prepped['close_date'].notna()
    prepped['no_initial_type'] = prepped['init_type'].isna()

    print('adding supplemental district fields')
    district = prepped['district']
    prepped['numeric_district'] = district.apply(format_district)
    prepped['nonnumeric_district'] = district.apply(_nonnumeric_or_none)
    return prepped


# regular expression fields
def add_ovcol(df):
    """Return a copy of `df` with boolean 'ON VIEW' indicator columns.

    `init_on_view` and `fin_on_view` are True where `init_type` / `fin_type`
    contain "(OV)", "[OV]", or "ON VIEW" with any number of spaces between
    the two words. Matching is case-sensitive; missing values count as False.
    """
    flagged = df.copy()
    print("adding indicator for reported 'ON VIEW' activity")
    onview_re = "|".join([re.escape("(OV)"), re.escape("[OV]"), "ON[ ]*VIEW"])
    column_pairs = (('init_type', 'init_on_view'), ('fin_type', 'fin_on_view'))
    for source_col, indicator_col in column_pairs:
        flagged[indicator_col] = flagged[source_col].str.contains(onview_re, na=False)
    return flagged


def group_events(df, rules):
    """Return a copy of `df` with keyword-based `event_group` and `event_type` columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain `init_type` and `fin_type` string columns.
    rules : dict
        Mapping of group name -> {category name -> list of keywords}.

    Returns
    -------
    pandas.DataFrame
        Copy of `df` where a row is assigned a group/category when any of
        that group's/category's keywords appears (case-insensitively, as a
        literal substring) in `init_type` or `fin_type`. Later groups and
        categories overwrite earlier ones; unmatched rows get 'other'.
    """
    out = df.copy()
    out['event_group'] = None
    out['event_type'] = None

    def _matches(keywords):
        # Keywords are matched literally, so escape any regex metacharacters.
        pattern = "|".join(re.escape(kw) for kw in keywords)
        return (out.init_type.str.contains(pattern, na=False, flags=re.I)
                | out.fin_type.str.contains(pattern, na=False, flags=re.I))

    # Iterate the `rules` argument rather than a module-level global, so the
    # function uses whatever rules the caller actually passed in.
    for kwgroup, groupinfo in rules.items():
        print(f"adding `event_group` category '{kwgroup}'")
        group_keywords = [kw for kws in groupinfo.values() for kw in (kws or [])]
        # An empty keyword list would build an empty pattern, which matches
        # every row; skip it instead of mislabelling the whole dataset.
        if group_keywords:
            out.loc[_matches(group_keywords), 'event_group'] = kwgroup
        for cat, kws in groupinfo.items():
            print(f"adding `event_type` category '{cat}'")
            if kws:
                out.loc[_matches(kws), 'event_type'] = cat

    out['event_group'] = out['event_group'].fillna('other')
    out['event_type'] = out['event_type'].fillna('other')
    return out


def add_flaggedcol(df):
    """Return a copy of `df` with selected event types relabelled and flagged.

    Rows whose `init_type` or `fin_type` contains one of a fixed set of
    keywords (case-sensitive, literal match) have `event_type` overwritten
    with 'shotspotter_reclass?'. A boolean `flagged` column marks every row
    carrying that label.
    """
    reclass_label = 'shotspotter_reclass?'
    keywords = ('FIREWORKS', 'GENERIC (CHANGE PRIORITY)', 'SUSPECTED 911 MISUSE')
    pattern = "|".join(map(re.escape, keywords))

    result = df.copy()
    init_hit = result['init_type'].str.contains(pattern, na=False)
    fin_hit = result['fin_type'].str.contains(pattern, na=False)
    result.loc[init_hit | fin_hit, 'event_type'] = reclass_label
    result['flagged'] = result['event_type'] == reclass_label
    return result

In [3]:
# main
# load the imported dispatch records and the hand-written keyword rules
oemc = pd.read_parquet("../../data/OEMC_MP/import/output/oemc_dispatch.parquet")
kwrules = readyaml("../../data/shared/hand/keywords.yml")

# sanity checks on the raw import before any columns are added
assert len(oemc) > 12_000_000
assert oemc.fin_type.notna().all()

# run the prep steps in order; each returns a new frame
oemc = (
    oemc
    .pipe(add_suppcols)
    .pipe(add_ovcol)
    .pipe(group_events, rules=kwrules)
    .pipe(add_flaggedcol)
)

oemc.to_parquet("../../data/OEMC_MP/export/output/oemc-prepped.parquet")

adding supplementary datetime fields
adding supplemental indicator fields
adding supplemental district fields
adding indicator for reported 'ON VIEW' activity
adding `event_group` category 'surveil'
adding `event_type` category 'shotspotter'
adding `event_type` category 'hunchlab'
adding `event_type` category 'licplate'
adding `event_type` category 'pod'
adding `event_type` category 'general'
adding `event_type` category 'traffic'
adding `event_type` category 'patrol'
adding `event_group` category 'help'
adding `event_type` category 'gun'
adding `event_type` category 'mp'
adding `event_type` category 'injury_report'
adding `event_type` category 'noinjury_report'


# prep for analysis

- I thought this stuff was done in the version in `Chi-MP-data-story`, but it looks like I actually added these fields later and the version in the public repo is only lightly processed, not prepped for analysis.

**NOTE:** I'm not sure whether the numeric district approach is right or needs some tweaking. I'm open to feedback, and this should be reviewed before (or when) these fields are used.

In [4]:
# most common initial/final type pairings among events typed as 'gun'
type_cols = ['event_group', 'event_type', 'init_type', 'fin_type']
is_gun = oemc['event_type'] == 'gun'
oemc.loc[is_gun, type_cols].fillna("NO INITIAL TYPE").value_counts().head(50)

event_group  event_type  init_type                    fin_type                
help         gun         SHOTS FIRED                  SHOTS FIRED                 71365
                         SHOTS FIRED (OV)             SHOTS FIRED (OV)            12671
                         PERSON SHOT                  PERSON SHOT                  8165
                         SHOTS FIRED                  PERSON SHOT                   173
                         NO INITIAL TYPE              SHOTS FIRED (OV)              120
                         SHOTS FIRED (OV)             SHOT SPOTTER                   94
                         TRAFFIC STOP (OV)            SHOTS FIRED (OV)               90
                         SHOT SPOTTER                 SHOTS FIRED (OV)               78
                                                      PERSON SHOT                    72
                         PERSON SHOT                  SHOTS FIRED                    56
                         SHOTS FIRED (OV)

In [5]:
# most common group/type/initial/final combinations across all records
type_cols = ['event_group', 'event_type', 'init_type', 'fin_type']
oemc.loc[:, type_cols].fillna("NO INITIAL TYPE").value_counts().head(50)

event_group  event_type            init_type                  fin_type                 
surveil      traffic               TRAFFIC STOP (OV)          TRAFFIC STOP (OV)            2612639
             patrol                Community Interaction      Community Interaction         782470
other        other                 DISTURBANCE                DISTURBANCE                   541972
help         noinjury_report       AUTO ACCIDENT PD           AUTO ACCIDENT PD              452511
                                   DOMESTIC DISTURBANCE       DOMESTIC DISTURBANCE          412691
surveil      general               MISSION                    MISSION                       335607
help         noinjury_report       ALARM BURGLAR              ALARM BURGLAR                 298438
other        other                 EL CHECK [OV]              EL CHECK [OV]                 279423
help         noinjury_report       CHECK WELL BEING           CHECK WELL BEING              248942
surveil      traffic 

In [6]:
# number of records with no initial type
oemc['init_type'].isna().sum()

np.int64(11921)

In [7]:
# final types recorded for the records that have no initial type
oemc['fin_type'][oemc['init_type'].isna()].value_counts()

fin_type
TRAFFIC STOP (OV)        6429
SHOT SPOTTER             2956
STREET STOP               533
Community Interaction     526
DISTURBANCE               164
                         ... 
CHILD LEFT ALONE            1
ANIMAL ABUSE                1
CRIMINAL TRESPASS IP        1
PROSTITUTION                1
THEFT JO                    1
Name: count, Length: 102, dtype: int64

# preview data

In [8]:
n_records = len(oemc)
qa = (
    "Q:\tHow many records are in the OEMC dispatch data?\n"
    f"A:\t{n_records:,} records"
)
print(qa)

Q:	How many records are in the OEMC dispatch data?
A:	12,159,582 records


In [9]:
# show one randomly chosen record, with fields as rows
oemc.sample(n=1).transpose()

Unnamed: 0,1705284
event_no,1820102739
district,002
call_date,2018-07-20 06:35:00
disp_date,NaT
on_date,NaT
clear_date,NaT
close_date,2018-07-20 06:35:00
init_priority,3D
init_type,CRIM DAM. TO PROP RPT
fin_type,CRIM DAM. TO PROP RPT
