This file creates the datasets, generates pre-trend variables, merges LEOKA and UCR data, and exports versions of the data for use in regressions 1 and 2 (no force size) and regression 3 (incorporates force size data).

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import zscore

In [4]:
#import LEOKA (law enforcement officers killed and assaulted) data

leokas = ['ucr_leoka_monthly_1960_2020_dta/leoka_monthly_2017.dta',
          'ucr_leoka_monthly_1960_2020_dta/leoka_monthly_2016.dta',
          'ucr_leoka_monthly_1960_2020_dta/leoka_monthly_2015.dta',
          'ucr_leoka_monthly_1960_2020_dta/leoka_monthly_2014.dta',
          'ucr_leoka_monthly_1960_2020_dta/leoka_monthly_2013.dta']
leoka_17, leoka_16, leoka_15, leoka_14, leoka_13 = pd.read_stata(leokas[0]), pd.read_stata(leokas[1]), pd.read_stata(leokas[2]), pd.read_stata(leokas[3]), pd.read_stata(leokas[4])

dfs = [leoka_13, leoka_14, leoka_15, leoka_16, leoka_17]
dfs = [i.iloc[:,:-198] for i in dfs]

for j in dfs:
    print(j.shape)

(266424, 58)
(267972, 58)
(270288, 58)
(271740, 58)
(273408, 58)


In [3]:
df = pd.concat(dfs, ignore_index=True)
df.shape

(1349832, 58)

In [4]:
#map months to integers up to 60 for ease of computation
month_map = {'january': 1, 'february': 2, 'march': 3, 'april': 4, 'may': 5, 'june': 6, 'july': 7, 'august': 8, 'september': 9, 'october': 10, 'november': 11, 'december': 12}
df['month_code'] = df['month'].map(month_map) + (df['year'] - 2013)*12

In [5]:
df.head(10)

Unnamed: 0,ori,agency_name,state,state_abb,number_of_months_reported,year,month,date,ori9,fips_state_code,...,assaults_with_injury_knife,assaults_with_injury_oth_weap,assaults_with_injury_unarmed,assaults_with_injury_total,assaults_no_injury_gun,assaults_no_injury_knife,assaults_no_injury_oth_weap,assaults_no_injury_unarmed,assaults_no_injury_total,month_code
0,AK00101,anchorage,alaska,AK,12,2013,january,2013-01-01,AK0010100,2,...,0,0,0,0,1,1,1,27,30,1
1,AK00101,anchorage,alaska,AK,12,2013,february,2013-02-01,AK0010100,2,...,0,1,1,2,0,0,9,13,22,2
2,AK00101,anchorage,alaska,AK,12,2013,march,2013-03-01,AK0010100,2,...,0,1,2,3,0,0,6,19,25,3
3,AK00101,anchorage,alaska,AK,12,2013,april,2013-04-01,AK0010100,2,...,0,0,0,0,2,0,2,15,19,4
4,AK00101,anchorage,alaska,AK,12,2013,may,2013-05-01,AK0010100,2,...,0,0,0,0,0,0,6,17,23,5
5,AK00101,anchorage,alaska,AK,12,2013,june,2013-06-01,AK0010100,2,...,0,0,0,0,2,2,3,17,24,6
6,AK00101,anchorage,alaska,AK,12,2013,july,2013-07-01,AK0010100,2,...,0,2,1,3,0,1,0,15,16,7
7,AK00101,anchorage,alaska,AK,12,2013,august,2013-08-01,AK0010100,2,...,0,0,0,0,0,0,3,13,16,8
8,AK00101,anchorage,alaska,AK,12,2013,september,2013-09-01,AK0010100,2,...,0,0,0,0,0,0,1,11,12,9
9,AK00101,anchorage,alaska,AK,12,2013,october,2013-10-01,AK0010100,2,...,0,0,3,3,2,0,3,14,19,10


In [6]:
#indicators for officers killed and officers ever assaulted
df['killed_indicator'] = np.where(df['officers_killed_total'] > 0, 1, 0)

df['total_assaults'] = df['assaults_no_injury_total'] + df['assaults_with_injury_total']
officer_assaulted = (df['total_assaults'] > 0)

In [9]:
#restrict sample to officers ever assaulted
ori_to_keep = df.loc[officer_assaulted, 'ori'].unique()
df = df[df['ori'].isin(ori_to_keep)]
df.shape
df.columns

(532356, 61)

In [17]:
#add sizes for all municipalities according to first observation per municipality
df['first_pop'] = df.groupby('ori')['population'].transform('first')
df['first_employment'] = df.groupby('ori')['total_employees_officers'].transform('first')

In [18]:
#column transformations

#drop 0 pop rows to avoid division by 0
df = df.loc[df['first_pop'] > 0]

#force size mechanism construction
median_condition = df['first_employment'] > df['first_employment'].median()
trueval, falseval = 1, 0
df['employment_median_indicator'] = np.where(median_condition, trueval, falseval)

#proportion mechanism
df['employment_pop_proportion'] = df['first_employment']/df['first_pop']
prop_median_condition = df['employment_pop_proportion'] > df['employment_pop_proportion'].median()
df['employment_pop_prop_indicator'] = np.where(prop_median_condition, trueval, falseval)

In [19]:
df = df.sort_values(by=['ori', 'month_code'])

In [21]:
#create post and pre indicators (for both shooting and implicitly killing)
for i in range(10):
    df[f'PreviousTreatment_{i}'] = df.groupby('ori')['total_assaults'].shift(i)
    df[f'post_{i}'] = df[f'PreviousTreatment_{i}'] > 0

    #don't do the idxmin thing, shouldn't matter since things are NaN where edge case shifts; drop the NaNs by removing the PreviousTreament_i columns

    df[f'post_{i}'] = df[f'post_{i}'].astype(int)
    df = df.drop(columns=[f'PreviousTreatment_{i}'])

In [22]:
#make three pre-treatment month indicators (checking pre-trends)
for i in range(1, 4, 1):
    df[f'FutureTreatment_{i}'] = df.groupby('ori')['total_assaults'].shift(-1*i)
    df[f'pre_{i}'] = df[f'FutureTreatment_{i}'] > 0
    df[f'pre_{i}'] = df[f'pre_{i}'].astype(int)
    df = df.drop(columns=[f'FutureTreatment_{i}'])

In [25]:
#import UCR (uniform crime reporting) files for data on civilian arrests

ucrs = ['ucr_arrests_monthly_index_1974_2018_dta/ucr_arrests_monthly_index_crimes_age_2017.dta',
        'ucr_arrests_monthly_index_1974_2018_dta/ucr_arrests_monthly_index_crimes_age_2016.dta',
        'ucr_arrests_monthly_index_1974_2018_dta/ucr_arrests_monthly_index_crimes_age_2015.dta',
        'ucr_arrests_monthly_index_1974_2018_dta/ucr_arrests_monthly_index_crimes_age_2014.dta',
        'ucr_arrests_monthly_index_1974_2018_dta/ucr_arrests_monthly_index_crimes_age_2013.dta']

ucr_17, ucr_16, ucr_15, ucr_14, ucr_13 = pd.read_stata(ucrs[0]), pd.read_stata(ucrs[1]), pd.read_stata(ucrs[2]), pd.read_stata(ucrs[3]), pd.read_stata(ucrs[4])

In [26]:
ucr_dfs = [ucr_13, ucr_14, ucr_15, ucr_16, ucr_17]
ucr = pd.concat(ucr_dfs, ignore_index=True)

ucr['total_arrests'] = ucr['theft_tot_arrests'] + ucr['robbery_tot_arrests'] + ucr['rape_tot_arrests'] + ucr['murder_tot_arrests'] + ucr['mtr_veh_theft_tot_arrests'] + ucr['burglary_tot_arrests'] + ucr['arson_tot_arrests'] + ucr['agg_assault_tot_arrests']
ucr['log_total_arrests'] = np.log(ucr['total_arrests'] + 1)

  ucr['total_arrests'] = ucr['theft_tot_arrests'] + ucr['robbery_tot_arrests'] + ucr['rape_tot_arrests'] + ucr['murder_tot_arrests'] + ucr['mtr_veh_theft_tot_arrests'] + ucr['burglary_tot_arrests'] + ucr['arson_tot_arrests'] + ucr['agg_assault_tot_arrests']
  result = getattr(ufunc, method)(*inputs, **kwargs)
  ucr['log_total_arrests'] = np.log(ucr['total_arrests'] + 1)


In [27]:
ucr['month_code'] = ucr['month'].map(month_map) + (ucr['year'] - 2013)*12
df = df.drop_duplicates(subset=['ori', 'month_code'])
ucr = ucr.drop_duplicates(subset=['ori', 'month_code'])

  ucr['month_code'] = ucr['month'].map(month_map) + (ucr['year'] - 2013)*12


In [28]:
ucr.shape
merged = pd.merge(df, ucr, on=['ori', 'month_code', 'year'], how='left')
df.shape, ucr.shape, merged.shape

((465876, 80), (846711, 632), (465876, 709))

In [29]:
#data to run first two stage regressions (no time horizon; 3-month pre-trended event study)
merged.to_stata('event_study.dta')

In [30]:
#dataset creation for stage 3
overall_conditions = [(merged['post_0'] + merged['post_1'] + merged['post_2'] + merged['post_4'] + merged['post_5'] + merged['post_6'] +merged['post_7'] 
                       + merged['post_8'] + merged['post_9'] >= 1),
                      (merged['pre_1'] + merged['pre_2'] + merged['pre_3'] >= 1)]
overall_condition_values = [1, 0] #2 is default value
merged['post_overall'] = np.select(overall_conditions, overall_condition_values, default=2)

In [31]:
avg_data = merged.loc[merged['post_overall'] < 2]

In [32]:
merged.shape, avg_data.shape

((465876, 710), (270223, 710))

In [33]:
avg_data.to_stata('DDD.dta')