In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px

In [3]:
# Read the allegations CSV into `df`. low_memory=False makes pandas parse the
# file in one pass instead of in chunks, which avoids mixed-dtype columns.
allegations_csv = '/Users/jeremyhudsonchan/Dropbox/Files/Boston_College_Courses/Thesis/Data/2000-2016/allegations.csv'
df = pd.read_csv(filepath_or_buffer=allegations_csv, low_memory=False)

In [4]:
# Display the column labels of the allegations frame.
df.columns

Index(['CRID', 'OfficerID', 'OfficeFirst', 'OfficerLast', 'AllegationCode',
       'Category', 'Allegation', 'RecommendedFinding', 'RecommendedOutcome',
       'FinalFinding', 'FinalOutcome', 'Finding', 'Outcome', 'Beat',
       'Location', 'Add1', 'Add2', 'City', 'IncidentDate', 'StartDate',
       'EndDate', 'InvestigatorName', 'InvestigatorRank', 'Latitude',
       'Longitude'],
      dtype='object')

In [5]:
# Display the (rows, columns) size of the allegations frame as loaded.
df.shape

(138677, 25)

In [6]:
# Number of distinct CRID values. dropna=False counts a missing CRID as its
# own value, which is what len(pd.unique(...)) does as well.
df['CRID'].nunique(dropna=False)

75017

In [7]:
# Create the "Beat Count" column as a zero-filled placeholder; a later cell
# overwrites it with the number of rows recorded for each row's beat.
df = df.assign(**{"Beat Count": 0})

In [8]:
# Keep only the rows whose Beat is present; rows with a missing Beat are removed.
has_beat = df['Beat'].notna()
df = df[has_beat]

In [9]:
# Restrict the frame to rows whose Category is exactly 'Use Of Force'.
is_use_of_force = df['Category'].eq('Use Of Force')
df = df[is_use_of_force]

In [10]:
# Tally how many rows each beat has. The result is a Series indexed by beat,
# ordered from the most to the least frequent beat, and is the lookup table
# used to fill the "Beat Count" column in the next cell.
beat_column = df['Beat']
beat_counts = beat_column.value_counts(sort=True, ascending=False)
beat_counts

531.0     302
1134.0    301
713.0     290
1822.0    262
3100.0    251
         ... 
235.0       4
1221.0      3
1655.0      2
1653.0      2
1935.0      1
Name: Beat, Length: 274, dtype: int64

In [11]:
# Give every row the number of rows recorded for its beat.
# `beat_counts` is indexed by beat, so Series.map performs the whole lookup in
# one vectorized pass instead of one Python-level `.loc` write per row.
df['Beat Count'] = df['Beat'].map(beat_counts)

In [12]:
# Preview the first five rows to check that "Beat Count" now holds per-beat counts.
df.head()

Unnamed: 0,CRID,OfficerID,OfficeFirst,OfficerLast,AllegationCode,Category,Allegation,RecommendedFinding,RecommendedOutcome,FinalFinding,...,Add2,City,IncidentDate,StartDate,EndDate,InvestigatorName,InvestigatorRank,Latitude,Longitude,Beat Count
8797,259002,3055,Cornelius,Brown,05A,Use Of Force,Excessive Force / On Duty - Injury,NS,600.0,NS,...,S MARSHFIELD,CHICAGO IL,2000-01-01 00:00:00,2000-01-01,2001-02-09,,,41.734875,-87.66449,50
8798,259002,19347,Kenneth,Molesky,05A,Use Of Force,Excessive Force / On Duty - Injury,NS,600.0,NS,...,S MARSHFIELD,CHICAGO IL,2000-01-01 00:00:00,2000-01-01,2001-02-09,,,41.734875,-87.66449,50
8804,259011,10864,Mark,Grohovena,05D,Use Of Force,Excessive Force / Off Duty - No Injury,NS,600.0,NS,...,W 43RD PLACE,CHICAGO IL 60609,2000-01-01 00:00:00,2000-01-01,2000-12-11,,,41.815088,-87.64315,142
8807,259013,20651,James,Norwood,05C,Use Of Force,Excessive Force / Off Duty - Injury,NS,600.0,NS,...,W 105TH,CHICAGO IL 60643,2000-01-01 00:00:00,2000-01-01,2001-02-01,,,41.703681,-87.625984,105
8808,259013,3055,Cornelius,Brown,05A,Use Of Force,Excessive Force / On Duty - Injury,NS,600.0,NS,...,W 105TH,CHICAGO IL 60643,2000-01-01 00:00:00,2000-01-01,2001-02-01,,,41.703681,-87.625984,105


In [13]:
# Rescale "Beat Count" so that the column sums to 1 across all rows.
# NOTE(review): the denominator is the row-wise sum of the per-beat counts
# (a beat's count is added once for every row of that beat), not the number
# of allegations - confirm this is the intended normalisation.
total_beat_count = df['Beat Count'].sum()
df['Beat Count'] = df['Beat Count'].div(total_beat_count)

In [14]:
# Preview the first five rows after "Beat Count" was rescaled.
df.head()

Unnamed: 0,CRID,OfficerID,OfficeFirst,OfficerLast,AllegationCode,Category,Allegation,RecommendedFinding,RecommendedOutcome,FinalFinding,...,Add2,City,IncidentDate,StartDate,EndDate,InvestigatorName,InvestigatorRank,Latitude,Longitude,Beat Count
8797,259002,3055,Cornelius,Brown,05A,Use Of Force,Excessive Force / On Duty - Injury,NS,600.0,NS,...,S MARSHFIELD,CHICAGO IL,2000-01-01 00:00:00,2000-01-01,2001-02-09,,,41.734875,-87.66449,1.8e-05
8798,259002,19347,Kenneth,Molesky,05A,Use Of Force,Excessive Force / On Duty - Injury,NS,600.0,NS,...,S MARSHFIELD,CHICAGO IL,2000-01-01 00:00:00,2000-01-01,2001-02-09,,,41.734875,-87.66449,1.8e-05
8804,259011,10864,Mark,Grohovena,05D,Use Of Force,Excessive Force / Off Duty - No Injury,NS,600.0,NS,...,W 43RD PLACE,CHICAGO IL 60609,2000-01-01 00:00:00,2000-01-01,2000-12-11,,,41.815088,-87.64315,5.1e-05
8807,259013,20651,James,Norwood,05C,Use Of Force,Excessive Force / Off Duty - Injury,NS,600.0,NS,...,W 105TH,CHICAGO IL 60643,2000-01-01 00:00:00,2000-01-01,2001-02-01,,,41.703681,-87.625984,3.7e-05
8808,259013,3055,Cornelius,Brown,05A,Use Of Force,Excessive Force / On Duty - Injury,NS,600.0,NS,...,W 105TH,CHICAGO IL 60643,2000-01-01 00:00:00,2000-01-01,2001-02-01,,,41.703681,-87.625984,3.7e-05


In [15]:
# Array of the distinct OfficerID values present in the filtered allegations,
# in order of first appearance.
officer_list = df['OfficerID'].unique()

In [16]:
# Bucket the allegation rows by CRID: every group holds the rows (one per
# accused officer) that share the same CRID.
df_grouped = df.groupby(by='CRID')

In [17]:
# GroupBy.head() returns up to the first five rows of every CRID group,
# combined into a single frame in the original row order.
df_grouped.head()

Unnamed: 0,CRID,OfficerID,OfficeFirst,OfficerLast,AllegationCode,Category,Allegation,RecommendedFinding,RecommendedOutcome,FinalFinding,...,Add2,City,IncidentDate,StartDate,EndDate,InvestigatorName,InvestigatorRank,Latitude,Longitude,Beat Count
8797,259002,3055,Cornelius,Brown,05A,Use Of Force,Excessive Force / On Duty - Injury,NS,600.0,NS,...,S MARSHFIELD,CHICAGO IL,2000-01-01 00:00:00,2000-01-01,2001-02-09,,,41.734875,-87.664490,0.000018
8798,259002,19347,Kenneth,Molesky,05A,Use Of Force,Excessive Force / On Duty - Injury,NS,600.0,NS,...,S MARSHFIELD,CHICAGO IL,2000-01-01 00:00:00,2000-01-01,2001-02-09,,,41.734875,-87.664490,0.000018
8804,259011,10864,Mark,Grohovena,05D,Use Of Force,Excessive Force / Off Duty - No Injury,NS,600.0,NS,...,W 43RD PLACE,CHICAGO IL 60609,2000-01-01 00:00:00,2000-01-01,2000-12-11,,,41.815088,-87.643150,0.000051
8807,259013,20651,James,Norwood,05C,Use Of Force,Excessive Force / Off Duty - Injury,NS,600.0,NS,...,W 105TH,CHICAGO IL 60643,2000-01-01 00:00:00,2000-01-01,2001-02-01,,,41.703681,-87.625984,0.000037
8808,259013,3055,Cornelius,Brown,05A,Use Of Force,Excessive Force / On Duty - Injury,NS,600.0,NS,...,W 105TH,CHICAGO IL 60643,2000-01-01 00:00:00,2000-01-01,2001-02-01,,,41.703681,-87.625984,0.000037
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
138652,1080707,403,Regan,Allen,05A,Use Of Force,Excessive Force / On Duty - Injury,,,,...,S JEFFERY BLVD,CHICAGO IL,2016-04-16 00:00:00,2016-05-27,2016-09-30,,,41.771969,-87.576156,0.000038
138657,1081176,9597,Julio,Garcia,S021,Use Of Force,Miscellaneous,,,UN,...,East 71ST ST,CHICAGO ILLINOIS 60649,2016-06-28 00:00:00,2016-06-28,2017-03-03,,,41.766444,-87.575869,0.000038
138658,1080321,2425,Stephen,Borozan,S021,Use Of Force,Miscellaneous,,,UN,...,South MICHIGAN AVE,CHICAGO ILLINOIS 60628,2016-04-27 00:00:00,2016-04-29,2017-04-11,Alexis L. Serio,,41.688973,-87.620994,0.000108
138671,1079449,21483,Joseph,Oskvarek,05A,Use Of Force,Excessive Force / On Duty - Injury,,,UN,...,,,2016-01-10 00:00:00,2016-03-01,,,,,,0.000029


In [18]:
# Load the officer profile table; later cells attach a beat and a sampling
# weight to it for the permutation / urn simulation.
# NOTE(review): an earlier note here said the probabilities are in
# df["Beat Count"], but the visible simulation cell draws its weights from
# officers['Probability'] - confirm which one is intended.
officer_profile_csv = '/Users/jeremyhudsonchan/Dropbox/Files/Boston_College_Courses/Thesis/Data/2000-2016/officer_profile.csv'
officers = pd.read_csv(filepath_or_buffer=officer_profile_csv, low_memory=False)

In [19]:
# Preview the first five officer profile rows.
officers.head()

Unnamed: 0,OfficerID,OfficerFirst,OfficerLast,Gender,Race,ApptDate,Unit,Rank,Star,Age
0,4,Carmel,Abbate,M,White,25209.0,,Po As Detective,,80
1,13,Dale,Abbott,M,White,30284.0,,Police Officer,,69
2,17,Moulay,Abdullah,M,Black,32868.0,,Police Officer,,64
3,19,Restituto,Abejero,M,Asian/Pacific,36507.0,,Police Officer,,73
4,34,Floyd,Abron,M,Black,35975.0,,Police Officer,,49


In [20]:
# Attach a beat to every officer profile row: the Beat of that officer's first
# row in the filtered allegations frame `df`. Officers that never appear in
# `df` get the sentinel 0, which the next cell filters out.
#
# A lookup table (first Beat per OfficerID) plus Series.map replaces the
# per-row loop, which ran a full boolean scan of `df` for every officer.
#
# NOTE(review): the rendered previews show repeated OfficerID rows in
# `officers` (e.g. 29258 appears twice). Each copy becomes a separate entry in
# its beat's sampling pool - confirm whether the profile table should be
# de-duplicated on OfficerID first.
first_beat_by_officer = (
    df.dropna(subset=['OfficerID'])
      .drop_duplicates(subset='OfficerID', keep='first')
      .set_index('OfficerID')['Beat']
)
officers['Beat'] = (
    officers['OfficerID']
    .map(first_beat_by_officer)
    .fillna(0)
    # Beats display as whole numbers (531.0, 1134.0, ...), so an integer
    # column is kept; this assumes no fractional beat codes exist.
    .astype('int64')
)

In [21]:
# Discard officers still carrying the sentinel beat 0, i.e. those that were
# not matched to any row of the filtered allegations.
has_real_beat = officers['Beat'].ne(0)
officers = officers[has_real_beat]

In [22]:
# Persist the officer table with its new Beat column. The row index is
# written too, because to_csv keeps the index by default.
officers_plus_beat_csv = '/Users/jeremyhudsonchan/Dropbox/Files/Boston_College_Courses/Thesis/Data/Processed/2000-2016/officers_plus_beat.csv'
officers.to_csv(path_or_buf=officers_plus_beat_csv)

In [23]:
# Zero-filled placeholder for each officer's sampling weight; the real values
# are assigned per beat in a later cell.
officers = officers.assign(Probability=0)

In [24]:
# Preview the officer rows with the added Beat and (still zero) Probability columns.
officers.head()

Unnamed: 0,OfficerID,OfficerFirst,OfficerLast,Gender,Race,ApptDate,Unit,Rank,Star,Age,Beat,Probability
1,13,Dale,Abbott,M,White,30284.0,,Police Officer,,69,1655,0
4,34,Floyd,Abron,M,Black,35975.0,,Police Officer,,49,1134,0
5,38,Abdalla,Abuzanat,M,Asian/Pacific,35975.0,,Po As Detective,,51,813,0
6,44,Marco,Acevedo,M,Hispanic,34947.0,,Po As Detective,,53,1433,0
7,58,Manuel,Acevedo,M,Hispanic,30151.0,,Police Officer,,66,1023,0


In [25]:
# Bucket the officer rows by Beat; each group is the pool of officers that the
# simulation samples from for complaints in that beat.
officers_grouped = officers.groupby(by='Beat')

In [26]:
# Give every officer row the weight 1 / (number of officer rows in its beat),
# so that the weights inside each beat add up to one.
rows_per_beat = officers['Beat'].map(officers['Beat'].value_counts())
officers['Probability'] = 1 / rows_per_beat

In [27]:
# GroupBy.head() shows up to the first five officer rows of every beat group,
# now carrying the per-beat uniform Probability.
officers_grouped.head()

Unnamed: 0,OfficerID,OfficerFirst,OfficerLast,Gender,Race,ApptDate,Unit,Rank,Star,Age,Beat,Probability
1,13,Dale,Abbott,M,White,30284.0,,Police Officer,,69,1655,0.333333
4,34,Floyd,Abron,M,Black,35975.0,,Police Officer,,49,1134,0.001513
5,38,Abdalla,Abuzanat,M,Asian/Pacific,35975.0,,Po As Detective,,51,813,0.004785
6,44,Marco,Acevedo,M,Hispanic,34947.0,,Po As Detective,,53,1433,0.007463
7,58,Manuel,Acevedo,M,Hispanic,30151.0,,Police Officer,,66,1023,0.007874
...,...,...,...,...,...,...,...,...,...,...,...,...
52747,20866,Daniel,O Connor,M,White,34583.0,,Police Officer,,58,1655,0.333333
53748,29258,Nigel,Valentine,M,Black,39202.0,,Police Officer,,43,1225,0.250000
64848,29258,Nigel,Valentine,M,Black,39202.0,,Police Officer,,43,1225,0.250000
66610,12256,Randall,Hiller,M,White,31768.0,,Sergeant Of Police,,62,1653,0.166667


In [204]:
# Polya urn ("rich get richer") simulation.
#
# For every complaint (one CRID group) draw as many officer rows as the
# complaint has rows, without replacement, from the officers whose 'Beat'
# equals the complaint's beat, weighted by officers['Probability'].
# `new_df` ends up with one row per drawn officer: OfficerID, Beat, how often
# the officer was drawn ('Occurences') and the officer's weight.
#
# Draw counts are accumulated in a dict and turned into a DataFrame once at
# the end; growing `new_df` with pd.concat inside the loop (plus a membership
# scan per draw) was quadratic in the number of drawn officers.
#
# NOTE(review): `alpha` is added only from an officer's second draw onwards,
# and with alpha = 0 the weights never change - confirm both are intended.
# NOTE(review): sample(replace=False) raises ValueError when a complaint has
# more rows than its beat's officer pool.
SEED = 0  # fixed so that Restart & Run All reproduces the same draws
rng = np.random.default_rng(SEED)
alpha = 0  # weight added to an officer on every repeat draw
RESULT_COLUMNS = ['OfficerID', 'Beat', 'Occurences', 'Probability']

tallies = {}  # OfficerID -> record with the RESULT_COLUMNS keys
for crid, complaint_rows in df_grouped:
    # one draw per officer named in the complaint
    num_officers = len(complaint_rows)
    beat = complaint_rows['Beat'].iloc[0]
    # complaints in a beat without any mapped officer are skipped
    if beat not in officers_grouped.groups:
        continue
    pool = officers_grouped.get_group(beat)
    drawn = pool.sample(
        n=num_officers,
        replace=False,
        weights=pool['Probability'],
        random_state=rng,
    )
    for officer_id, officer_beat, weight in zip(drawn['OfficerID'], drawn['Beat'], drawn['Probability']):
        tally = tallies.get(officer_id)
        if tally is None:
            # first draw: record the officer with the weight it was drawn at
            tallies[officer_id] = {
                'OfficerID': officer_id,
                'Beat': officer_beat,
                'Occurences': 1,
                'Probability': weight,
            }
            continue
        # repeat draw: count it and reinforce the officer's weight
        tally['Occurences'] += 1
        tally['Probability'] += alpha
        if alpha:
            # update the weights that later draws will read from `officers`
            officers.loc[officers['OfficerID'] == officer_id, 'Probability'] += alpha

new_df = pd.DataFrame(list(tallies.values()), columns=RESULT_COLUMNS)

In [205]:
# The ten officer rows with the largest sampling weight after the simulation.
officers_by_weight_desc = officers.sort_values(by='Probability', ascending=False)
top_10 = officers_by_weight_desc.iloc[:10]
top_10

Unnamed: 0,OfficerID,OfficerFirst,OfficerLast,Gender,Race,ApptDate,Unit,Rank,Star,Age,Beat,Probability
52519,18862,Nyls,Meredith,M,Black,39272.0,,Po Asgn Evid. Techni,,45,531,6185.950749
56508,18862,Nyls,Meredith,M,Black,39272.0,,Po Asgn Evid. Techni,,45,531,6185.950749
48069,18862,Nyls,Meredith,M,Black,39272.0,,Po Asgn Evid. Techni,,45,531,6185.950749
63763,18862,Nyls,Meredith,M,Black,39272.0,,Po Asgn Evid. Techni,,45,531,6185.950749
60209,18862,Nyls,Meredith,M,Black,39272.0,,Po Asgn Evid. Techni,,45,531,6185.950749
43319,18862,Nyls,Meredith,M,Black,39272.0,,Po Asgn Evid. Techni,,45,531,6185.950749
57374,26267,Derrick,Shinn,M,Black,31733.0,,Sergeant Of Police,,61,713,6135.920715
53431,26267,Derrick,Shinn,M,Black,31733.0,,Sergeant Of Police,,61,713,6135.920715
25936,26267,Derrick,Shinn,M,Black,31733.0,,Sergeant Of Police,,61,713,6135.920715
39437,26267,Derrick,Shinn,M,Black,31733.0,,Sergeant Of Police,,61,713,6135.920715


In [206]:
# The ten officer rows with the smallest sampling weight after the simulation.
officers_by_weight_asc = officers.sort_values(by='Probability', ascending=True)
bottom_10 = officers_by_weight_asc.iloc[:10]
bottom_10

Unnamed: 0,OfficerID,OfficerFirst,OfficerLast,Gender,Race,ApptDate,Unit,Rank,Star,Age,Beat,Probability
24110,12509,Brian,Hood,M,Black,35618.0,,Police Officer,,59,531,0.00149
50134,32303,Scott,Rotkvich,M,White,31607.0,,Sergeant Of Police,,57,531,0.00149
38673,21548,Milton,Owens,M,Black,31880.0,,Sergeant Of Police,,63,531,0.00149
37299,12741,Robert,Hughes,M,Black,36976.0,,Police Officer,,50,531,0.00149
23903,11105,Kevin,Gyrion,M,White,31880.0,,Lieutenant Of Police,,59,531,0.00149
4864,28051,Linda,Szefc,F,White,32868.0,,Sergeant Of Police,,64,531,0.00149
41084,3549,Joseph,Byrne,M,White,39321.0,,Police Officer,,40,531,0.00149
44072,23936,Steven,Ridgner,M,Black,32868.0,,Police Officer,,70,531,0.00149
44078,23954,Damone,Riggins,M,Black,36809.0,,Police Officer,,62,531,0.00149
9477,21548,Milton,Owens,M,Black,31880.0,,Sergeant Of Police,,63,531,0.00149


In [207]:
# Distribution of draw counts: for each value of 'Occurences', how many
# officers were drawn exactly that many times.
occurrences = new_df['Occurences']
new_df_vc = occurrences.value_counts(sort=True, ascending=False)
new_df_vc

1      81
2      19
5       8
45      7
7       7
       ..
55      1
126     1
161     1
83      1
10      1
Name: Occurences, Length: 142, dtype: int64

In [208]:
# Label the distribution for plotting (index = number of allegations,
# values = number of officers) and order it by ascending allegation count.
new_df_vc = (
    new_df_vc
    .rename('Number of Officers')
    .rename_axis('Number of Allegations')
    .sort_index(ascending=True)
)
new_df_vc

Number of Allegations
1      81
2      19
3       5
4       5
5       8
       ..
249     1
258     1
264     1
290     1
293     1
Name: Number of Officers, Length: 142, dtype: int64

In [209]:
# Load the two previously saved distributions that the urn-model result is
# plotted against: the observed data and the permutation test.
original_csv = "/Users/jeremyhudsonchan/Dropbox/Files/Boston_College_Courses/Thesis/Code/Undergraduate-Thesis/Tests/Results/original.csv"
permutation_csv = "/Users/jeremyhudsonchan/Dropbox/Files/Boston_College_Courses/Thesis/Code/Undergraduate-Thesis/Tests/Results/permutation_test.csv"
original = pd.read_csv(filepath_or_buffer=original_csv)
perm = pd.read_csv(filepath_or_buffer=permutation_csv)

In [210]:
# Overlay the three officers-per-allegation-count curves on one figure:
# the observed data, the permutation test and the Polya urn simulation.
# Each add_* / update_* call returns the figure, so the calls are chained.
fig = (
    px.line(title="Number of Allegations vs Number of Officers (Original, Permutation Test, Polya Urn Model)")
    # 'Original' is added without an explicit mode, exactly as before
    .add_scatter(x=original['Number of Allegations'], y=original['Number of Officers'], name='Original')
    .add_scatter(x=perm['Number of Allegations'], y=perm['Number of Officers'], mode='lines', name='Permutation Test')
    .add_scatter(x=new_df_vc.index, y=new_df_vc, mode='lines', name='Polya Urn Model')
    # axis titles
    .update_xaxes(title_text='Number of Allegations')
    .update_yaxes(title_text='Number of Officers')
)
fig.show()