In [175]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px

In [176]:
df = pd.read_csv('/Users/jeremyhudsonchan/Dropbox/Files/Boston_College_Courses/Thesis/Data/2000-2016/allegations.csv', low_memory=False)

In [177]:
df.columns

Index(['CRID', 'OfficerID', 'OfficeFirst', 'OfficerLast', 'AllegationCode',
       'Category', 'Allegation', 'RecommendedFinding', 'RecommendedOutcome',
       'FinalFinding', 'FinalOutcome', 'Finding', 'Outcome', 'Beat',
       'Location', 'Add1', 'Add2', 'City', 'IncidentDate', 'StartDate',
       'EndDate', 'InvestigatorName', 'InvestigatorRank', 'Latitude',
       'Longitude'],
      dtype='object')

In [178]:
df.shape

(138677, 25)

In [179]:
len(pd.unique(df['CRID']))

75017

In [180]:
# add beat count column
df["Beat Count"] = 0

In [181]:
# if beat is nan, drop
df = df.dropna(subset=['Beat'])

In [182]:
# Drop all allegations that are not Use Of Force
df = df[df['Category'] == 'Use Of Force']

In [183]:
# Count how many times each beat appears throughout the dataset, and add that count to the Beat Count column, for each row, matching the beat
# use value counts on beat column
beat_counts = df['Beat'].value_counts()
beat_counts

531.0     302
1134.0    301
713.0     290
1822.0    262
3100.0    251
         ... 
235.0       4
1221.0      3
1655.0      2
1653.0      2
1935.0      1
Name: Beat, Length: 274, dtype: int64

In [184]:
# match beat to beat count
for index, row in df.iterrows():
    df.loc[index, 'Beat Count'] = beat_counts[row['Beat']]

In [185]:
df.head()

Unnamed: 0,CRID,OfficerID,OfficeFirst,OfficerLast,AllegationCode,Category,Allegation,RecommendedFinding,RecommendedOutcome,FinalFinding,...,Add2,City,IncidentDate,StartDate,EndDate,InvestigatorName,InvestigatorRank,Latitude,Longitude,Beat Count
8797,259002,3055,Cornelius,Brown,05A,Use Of Force,Excessive Force / On Duty - Injury,NS,600.0,NS,...,S MARSHFIELD,CHICAGO IL,2000-01-01 00:00:00,2000-01-01,2001-02-09,,,41.734875,-87.66449,50
8798,259002,19347,Kenneth,Molesky,05A,Use Of Force,Excessive Force / On Duty - Injury,NS,600.0,NS,...,S MARSHFIELD,CHICAGO IL,2000-01-01 00:00:00,2000-01-01,2001-02-09,,,41.734875,-87.66449,50
8804,259011,10864,Mark,Grohovena,05D,Use Of Force,Excessive Force / Off Duty - No Injury,NS,600.0,NS,...,W 43RD PLACE,CHICAGO IL 60609,2000-01-01 00:00:00,2000-01-01,2000-12-11,,,41.815088,-87.64315,142
8807,259013,20651,James,Norwood,05C,Use Of Force,Excessive Force / Off Duty - Injury,NS,600.0,NS,...,W 105TH,CHICAGO IL 60643,2000-01-01 00:00:00,2000-01-01,2001-02-01,,,41.703681,-87.625984,105
8808,259013,3055,Cornelius,Brown,05A,Use Of Force,Excessive Force / On Duty - Injury,NS,600.0,NS,...,W 105TH,CHICAGO IL 60643,2000-01-01 00:00:00,2000-01-01,2001-02-01,,,41.703681,-87.625984,105


In [186]:
# calculate proportion of beat count to total number of allegations
# normalize beat count with sum of all beat counts
df['Beat Count'] = df['Beat Count'] / df['Beat Count'].sum()

In [187]:
df.head()

Unnamed: 0,CRID,OfficerID,OfficeFirst,OfficerLast,AllegationCode,Category,Allegation,RecommendedFinding,RecommendedOutcome,FinalFinding,...,Add2,City,IncidentDate,StartDate,EndDate,InvestigatorName,InvestigatorRank,Latitude,Longitude,Beat Count
8797,259002,3055,Cornelius,Brown,05A,Use Of Force,Excessive Force / On Duty - Injury,NS,600.0,NS,...,S MARSHFIELD,CHICAGO IL,2000-01-01 00:00:00,2000-01-01,2001-02-09,,,41.734875,-87.66449,1.8e-05
8798,259002,19347,Kenneth,Molesky,05A,Use Of Force,Excessive Force / On Duty - Injury,NS,600.0,NS,...,S MARSHFIELD,CHICAGO IL,2000-01-01 00:00:00,2000-01-01,2001-02-09,,,41.734875,-87.66449,1.8e-05
8804,259011,10864,Mark,Grohovena,05D,Use Of Force,Excessive Force / Off Duty - No Injury,NS,600.0,NS,...,W 43RD PLACE,CHICAGO IL 60609,2000-01-01 00:00:00,2000-01-01,2000-12-11,,,41.815088,-87.64315,5.1e-05
8807,259013,20651,James,Norwood,05C,Use Of Force,Excessive Force / Off Duty - Injury,NS,600.0,NS,...,W 105TH,CHICAGO IL 60643,2000-01-01 00:00:00,2000-01-01,2001-02-01,,,41.703681,-87.625984,3.7e-05
8808,259013,3055,Cornelius,Brown,05A,Use Of Force,Excessive Force / On Duty - Injury,NS,600.0,NS,...,W 105TH,CHICAGO IL 60643,2000-01-01 00:00:00,2000-01-01,2001-02-01,,,41.703681,-87.625984,3.7e-05


In [188]:
# get list of unique officers in the dataset
officer_list = pd.unique(df['OfficerID'])

In [189]:
# permutation test
# probabilities are in df["Beat Count"]
officers = pd.read_csv('/Users/jeremyhudsonchan/Dropbox/Files/Boston_College_Courses/Thesis/Data/2000-2016/officer_profile.csv', low_memory=False)

In [190]:
# map beat of offcers in officer_list to officers dataframe
officers['Beat'] = 0
for index, row in officers.iterrows():
    if row['OfficerID'] in officer_list:
        officers.loc[index, 'Beat'] = df.loc[df['OfficerID'] == row['OfficerID'], 'Beat'].iloc[0]

In [191]:
# if beat is 0, drop
officers = officers[officers['Beat'] != 0]

In [192]:
officers.head()

Unnamed: 0,OfficerID,OfficerFirst,OfficerLast,Gender,Race,ApptDate,Unit,Rank,Star,Age,Beat
1,13,Dale,Abbott,M,White,30284.0,,Police Officer,,69,1655
4,34,Floyd,Abron,M,Black,35975.0,,Police Officer,,49,1134
5,38,Abdalla,Abuzanat,M,Asian/Pacific,35975.0,,Po As Detective,,51,813
6,44,Marco,Acevedo,M,Hispanic,34947.0,,Po As Detective,,53,1433
7,58,Manuel,Acevedo,M,Hispanic,30151.0,,Police Officer,,66,1023


In [195]:
new_df = pd.DataFrame(columns=['OfficerID', 'Beat', 'Beat Count'])

In [196]:
# permutation test
# probabilities are in df["Beat Count"]
i = 0
while i < len(df) - 1:
    # using probabilities in df["Beat Count"], randomly select a beat
    beat = np.random.choice(df['Beat'], p=df['Beat Count'])
    # get all officers in that beat
    officers_in_beat = officers[officers['Beat'] == beat]
    # choose a random officer from that beat
    officer = np.random.choice(officers_in_beat['OfficerID'])
    # add officer to new dataframe, use concat
    new_df = pd.concat([new_df, officers[officers['OfficerID'] == officer]])
    # if the CRID is the same for next how many rows, add those officers to the new dataframe
    if df.iloc[i]['CRID'] == df.iloc[i+1]['CRID']:
        # choose an additional officer from the beat excluding the first officer
        officers_in_beat = officers_in_beat[officers_in_beat['OfficerID'] != officer]
        officer = np.random.choice(officers_in_beat['OfficerID'])
        # add officer to new dataframe
        new_df = pd.concat([new_df, officers[officers['OfficerID'] == officer]])
        i += 1
    i += 1
    
    

In [202]:
pd.unique(df["Beat"]).shape

(274,)

In [203]:
# get number of beats in value counts
vc = df['Beat'].value_counts()
len(vc)

274

In [204]:
len(df), len(new_df)

(22852, 182025)

In [205]:
# plot histogram value_counts of old df['Beat], expand graph to see all beats
# use plotly.express
fig = px.histogram(df, x="Beat", nbins=len(vc))
fig.show()

In [206]:
# plot new_df same as above
fig = px.histogram(new_df, x="Beat", nbins=len(vc))
fig.show()