In [None]:
from google.colab import drive

drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
import pandas as pd
import time
import numpy as np
import plotly.graph_objs as go

# df = pd.read_csv(
#     '/Users/jeremyhudsonchan/Dropbox/Files/Boston_College_Courses/Thesis/Code/Undergraduate-Thesis/Preprocessing_Code/final/data/proportions.csv', low_memory=False)
# officers = pd.read_csv(
#     '/Users/jeremyhudsonchan/Dropbox/Files/Boston_College_Courses/Thesis/Code/Undergraduate-Thesis/Preprocessing_Code/final/data/perm_unique_officers.csv', low_memory=False)

df = pd.read_csv('/content/gdrive/MyDrive/Thesis_Data/proportions.csv')
officers = pd.read_csv('/content/gdrive/MyDrive/Thesis_Data/perm_unique_officers.csv')

In [None]:
print(df.columns)
print(df.head())
print(officers.columns)

Index(['CRID', 'OfficerID', 'OfficeFirst', 'OfficerLast', 'Category',
       'Allegation', 'Beat', 'IncidentDate', 'Beat Count', 'Beat Proportion'],
      dtype='object')
     CRID  OfficerID OfficeFirst OfficerLast      Category  \
0  259002       3055   Cornelius       Brown  Use Of Force   
1  259002      19347     Kenneth     Molesky  Use Of Force   
2  259011      10864        Mark   Grohovena  Use Of Force   
3  259013      20651       James     Norwood  Use Of Force   
4  259013       3055   Cornelius       Brown  Use Of Force   

                               Allegation    Beat         IncidentDate  \
0      Excessive Force / On Duty - Injury  2221.0  2000-01-01 00:00:00   
1      Excessive Force / On Duty - Injury  2221.0  2000-01-01 00:00:00   
2  Excessive Force / Off Duty - No Injury   925.0  2000-01-01 00:00:00   
3     Excessive Force / Off Duty - Injury  2232.0  2000-01-01 00:00:00   
4      Excessive Force / On Duty - Injury  2232.0  2000-01-01 00:00:00   

   Beat Cou

In [None]:
# choose only OfficerID and Beat columns
prep_df = df[['OfficerID', 'Beat', 'Beat Count', 'Beat Proportion']]
prep_officers = officers[['OfficerID', 'Beat']]
# group by CRID
df_grouped = df[['CRID', 'OfficerID', 'Beat', 'Beat Count', 'Beat Proportion']].groupby('CRID')
# make df_grouped into a numpy array
np_grouped = df_grouped.apply(lambda x: x.to_numpy())
# make np_grouped into a numpy array
np_grouped = np_grouped.to_numpy()

In [None]:
# make df into numpy
np_df = prep_df.to_numpy()
np_officers = prep_officers.to_numpy()
# make array dtype object float
np_df = np_df.astype(float)
np_officers = np_officers.astype(float)

In [None]:
# Column 0 is OfficerID, Column 1 is Beat
# ----------- Start of Original -----------
# get value counts of OfficerID
officer_counts = np.unique(np_df[:, 0], return_counts=True)
# get number of times each value appears in officer_counts
# sort in descending order
officer_counts = np.unique(officer_counts[1], return_counts=True)
# if number of complaints exceed 20, make all of the counts into a single bin
# make a copy of the original array to save for reference
officer_counts_copy = np.copy(officer_counts)
print(officer_counts)
# this is to prevent the graph from being too stretched
for i in range(len(officer_counts[0])):
    if officer_counts[0][i] > 10:
        officer_counts[1][9] += officer_counts[1][i]
        print(officer_counts[1][i])
        officer_counts[1][i] = 0
officer_counts = np.delete(officer_counts, np.s_[10:], axis=1)
# plot density curve
print(officer_counts)
fig = go.Figure()
fig.add_trace(go.Scatter(x=officer_counts[0], y=officer_counts[1], mode='lines', name='Data'))
fig.update_layout(title_text='Original Data')
fig.show()

(array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21, 22, 23, 24, 25, 26, 29, 30, 31, 35]), array([3593, 1734, 1006,  658,  404,  264,  193,  119,  111,   57,   47,
         41,   26,   13,   13,   12,    9,    4,    7,    5,    2,    1,
          1,    1,    1,    2,    1,    1,    1,    1]))
47
41
26
13
13
12
9
4
7
5
2
1
1
1
1
2
1
1
1
1
[[   1    2    3    4    5    6    7    8    9   10]
 [3593 1734 1006  658  404  264  193  119  111  246]]


In [None]:
# print sum of counts
print('Sum of Counts: ', np.sum(officer_counts[1]))

Sum of Counts:  8328


In [None]:
num_simulations = 500

In [None]:
# beta is the spillover proportion of which an officer's weight is increased based on other officers involved in the same incident
beta_values = np.arange(0.3, 0.55, 0.05)
best_mse = np.inf
best_urn_avg_complaints_dict = {}
best_beta = 0
alpha = 0.20

In [None]:
# ----------- Start of Spillover Polya Urn Model Finding Optimal Alpha -----------
start_time = time.time()
for beta in beta_values:
    print("Current:", beta, "Best:", best_beta, "Best MSE:", best_mse)
    weighted_np_officers = np_officers
    # add 0 column to weighted_np_officers
    weighted_np_officers = np.insert(weighted_np_officers, 2, 0, axis=1)
    weighted_np_officers = np.insert(weighted_np_officers, 3, 1, axis=1)
    # sort weighted_np_officers by values in second column
    weighted_np_officers = weighted_np_officers[weighted_np_officers[:, 1].argsort()]
    # for each unique value in the second column, get the number of times it appears
    unique_beats, beat_counts = np.unique(weighted_np_officers[:, 1], return_counts=True)
    # for each unique beat, make the sum of the third column equal to 1
    for beat in unique_beats:
        # get indices of beat
        beat_indices = np.where(weighted_np_officers[:, 1] == beat)
        # get number of officers in beat
        num_officers = beat_counts[np.where(unique_beats == beat)]
        # get weights for officers in beat
        weighted_np_officers[beat_indices, 2] = 1 / num_officers

    polya_urn_results = []
    for i in range(num_simulations):
        # print(i)
        # make 1d array of 0s, length of np_officers
        if i % 100 == 0:
          print(f"{beta}: {i}")
        polya_urn = np.array([])
        temp_weighted_np_officers = weighted_np_officers.copy()
        # for each element in np_grouped, get length of each individual element
        for incident in np_grouped:
            # get length of each individual element
            num_officers = len(incident)
            # get beat of group
            beat = incident[0][2]
            if beat not in weighted_np_officers[:, 1]:
                continue
            # get officers in beat
            officers_in_beat = temp_weighted_np_officers[temp_weighted_np_officers[:, 1] == beat]
            if len(officers_in_beat) < num_officers:
                continue
            # randomly sample officers without replacement from officers in beat
            p = officers_in_beat[:, 2] / sum(officers_in_beat[:, 2])
            sample_officers = np.random.choice(officers_in_beat[:, 0], num_officers, replace=False, p=p)
            # add sample officers to polya_urn
            polya_urn = np.append(polya_urn, sample_officers)
            # print(sample_officers)
            # selected officers weights go up by alpha
            for officer in sample_officers:
                temp_weighted_np_officers[temp_weighted_np_officers[:, 0] == officer, 2] += alpha
                temp_weighted_np_officers[temp_weighted_np_officers[:, 0] == officer, 3] += 1
            # if officer allegations go above 35, set weight to 0
            for officer in sample_officers:
              if sample_officers.size == 1:
                  break
              other_officers = sample_officers[sample_officers != officer]
              for other_officer in other_officers:
                  temp_weighted_np_officers[temp_weighted_np_officers[:, 0] == other_officer, 2] = beta * temp_weighted_np_officers[temp_weighted_np_officers[:, 0] == other_officer, 2] + (1 - beta) / (len(sample_officers) - 1) * temp_weighted_np_officers[temp_weighted_np_officers[:, 0] == officer, 2]
        # get value counts of OfficerID in polya_urn
        polya_urn_counts = 0
        polya_urn_counts = np.unique(polya_urn, return_counts=True)
        # get number of times each value appears in polya_urn_counts
        # sort in descending order
        polya_urn_counts = np.unique(polya_urn_counts[1], return_counts=True)
        # add polya_urn_counts to polya_urn_results
        polya_urn_results.append(polya_urn_counts)
    polya_urn_results = pd.DataFrame(polya_urn_results)
    polya_urn_results.columns = ['Number of Complaints', 'Counts']
    # add 0 to each datapoint in Number of Complaints
    polya_urn_results['Number of Complaints'] = polya_urn_results['Number of Complaints'].apply(lambda x: np.concatenate(([0], x)))
    # and insert the np.sum(officer_counts[1]) - np.sum(polya_urn_results['Counts']) into the second column
    polya_urn_results['Counts'] = polya_urn_results['Counts'].apply(lambda x: np.concatenate(([np.sum(officer_counts[1]) - np.sum(x)], x)))
    # add 1 to all the counts to match
    polya_urn_results['Number of Complaints'] = polya_urn_results['Number of Complaints'].apply(lambda x: x + 1)
    # explode the number of complaints column, then get max value
    urn_max_complaints = max(polya_urn_results['Number of Complaints'].explode())
    # print(urn_max_complaints)
    # create dictionary of number of complaints and counts, key is number of complaints, value is counts
    urn_complaints_dict = {}
    for i in range(1, 11):
        urn_complaints_dict[i] = []
    # for each row in polya_urn_results, get the list of Number of Complaints, then get the list of Counts, then map them to the dictionary
    for index, row in polya_urn_results.iterrows():
        # get list of number of complaints
        complaints = row['Number of Complaints']
        # get list of counts
        counts = row['Counts']
        # map them to the dictionary
        for i in range(len(complaints)):
            if complaints[i] < 10:
                urn_complaints_dict[complaints[i]].append(counts[i])
            else:
                urn_complaints_dict[10].append(counts[i])

    # if the list length is not equal to num_simulations, then add 0s to the list until it is equal to num_simulations
    for key in urn_complaints_dict:
        if len(urn_complaints_dict[key]) != num_simulations:
            urn_complaints_dict[key] = urn_complaints_dict[key] + [0] * (num_simulations - len(urn_complaints_dict[key]))
    # get the average of each list in the dictionary
    urn_avg_complaints_dict = {}
    for key in urn_complaints_dict:
        urn_avg_complaints_dict[key] = np.mean(urn_complaints_dict[key])
    # compare the average number of complaints to the actual number of complaints, get the mse
    mse = 0
    print(officer_counts[1], mse)
    for key in urn_avg_complaints_dict:
        # print('Key', key)
        print(beta, urn_avg_complaints_dict[key], officer_counts[1][key - 1])
        mse += (urn_avg_complaints_dict[key] - officer_counts[1][key - 1]) ** 2
    mse = mse / len(urn_avg_complaints_dict)
    print(beta, mse)
    if mse < best_mse:
        best_mse = mse
        best_beta = beta
        best_urn_avg_complaints_dict = urn_avg_complaints_dict
end_time = time.time()
print(end_time - start_time)

Current: 0.3 Best: 0 Best MSE: inf
0.3: 0
0.3: 100
0.3: 200
0.3: 300
0.3: 400
[3593 1734 1006  658  404  264  193  119  111  246] 0
0.3 3071.73 3593
0.3 1283.194 1734
0.3 838.208 1006
0.3 656.242 658
0.3 534.274 404
0.3 434.216 264
0.3 347.298 193
0.3 277.174 119
0.3 218.912 111
0.3 37.432741971704466 246
0.3 65302.269804144176
Current: 0.35 Best: 0.3 Best MSE: 65302.269804144176
0.35: 0
0.35: 100
0.35: 200
0.35: 300
0.35: 400
[3593 1734 1006  658  404  264  193  119  111  246] 0
0.35 3200.614 3593
0.35 1214.672 1734
0.35 800.17 1006
0.35 635.12 658
0.35 519.846 404
0.35 425.004 264
0.35 345.028 193
0.35 277.262 119
0.35 218.966 111
0.35 38.3298957640275 246
0.35 60884.33143893797
Current: 0.39999999999999997 Best: 0.35 Best MSE: 60884.33143893797
0.39999999999999997: 0
0.39999999999999997: 100
0.39999999999999997: 200
0.39999999999999997: 300
0.39999999999999997: 400
[3593 1734 1006  658  404  264  193  119  111  246] 0
0.39999999999999997 3327.016 3593
0.39999999999999997 1152.502 17

In [None]:
polya_urn_results = pd.DataFrame(polya_urn_results)
polya_urn_results.columns = ['Number of Complaints', 'Counts']

In [None]:
# add 0 to each datapoint in Number of Complaints
polya_urn_results['Number of Complaints'] = polya_urn_results['Number of Complaints'].apply(lambda x: np.concatenate(([0], x)))
# and insert the np.sum(officer_counts[1]) - np.sum(polya_urn_results['Counts']) into the second column
polya_urn_results['Counts'] = polya_urn_results['Counts'].apply(lambda x: np.concatenate(([np.sum(officer_counts[1]) - np.sum(x)], x)))
# add 1 to all the counts to match
polya_urn_results['Number of Complaints'] = polya_urn_results['Number of Complaints'].apply(lambda x: x + 1)

In [None]:
# Remark, Data here is ascending, number of complaints seems like it gets capped, will need to look into this
polya_urn_results.head(50)

Unnamed: 0,Number of Complaints,Counts
0,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[0, 3681, 993, 674, 530, 453, 403, 311, 283, 2..."
1,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[0, 3669, 1010, 650, 548, 426, 382, 335, 269, ..."
2,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[0, 3703, 987, 653, 507, 480, 379, 337, 278, 2..."
3,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[0, 3730, 1013, 644, 528, 419, 390, 316, 271, ..."
4,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[0, 3758, 941, 661, 531, 437, 377, 336, 269, 2..."
5,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[0, 3672, 1007, 649, 554, 420, 399, 344, 281, ..."
6,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[0, 3701, 980, 628, 577, 446, 385, 325, 271, 2..."
7,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[0, 3703, 1043, 631, 540, 420, 370, 301, 260, ..."
8,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[0, 3714, 980, 682, 517, 437, 382, 316, 277, 2..."
9,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[0, 3724, 932, 677, 543, 489, 365, 307, 273, 2..."


In [None]:
# explode the number of complaints column, then get max value
urn_max_complaints = max(polya_urn_results['Number of Complaints'].explode())
print(urn_max_complaints)
# create dictionary of number of complaints and counts, key is number of complaints, value is counts
urn_complaints_dict = {}
for i in range(1, 11):
    urn_complaints_dict[i] = []
# for each row in polya_urn_results, get the list of Number of Complaints, then get the list of Counts, then map them to the dictionary
for index, row in polya_urn_results.iterrows():
    # get list of number of complaints
    complaints = row['Number of Complaints']
    # get list of counts
    counts = row['Counts']
    # map them to the dictionary
    for i in range(len(complaints)):
        if complaints[i] < 10:
            urn_complaints_dict[complaints[i]].append(counts[i])
        else:
            urn_complaints_dict[10].append(counts[i])

# if the list length is not equal to num_simulations, then add 0s to the list until it is equal to num_simulations
for key in urn_complaints_dict:
    if len(urn_complaints_dict[key]) != num_simulations:
        urn_complaints_dict[key] = urn_complaints_dict[key] + [0] * (num_simulations - len(urn_complaints_dict[key]))

51


In [None]:
# get the average of each list in the dictionary
urn_avg_complaints_dict = {}
for key in urn_complaints_dict:
    urn_avg_complaints_dict[key] = np.mean(urn_complaints_dict[key])

In [None]:
# get the 95% confidence interval of each list in the dictionary
upper_urn_ci_complaints_dict = {}
lower_urn_ci_complaints_dict = {}
for key in urn_complaints_dict:
    # 95% confidence interval using quantile
    upper_urn_ci_complaints_dict[key] = np.quantile(urn_complaints_dict[key], 0.975)
    lower_urn_ci_complaints_dict[key] = np.quantile(urn_complaints_dict[key], 0.025)

In [None]:
# plot curve
# Number of Complaints are on the x-axis, Counts are on the y-axis
fig = go.Figure()
fig.add_trace(go.Scatter(x=list(urn_avg_complaints_dict.keys()), y=list(urn_avg_complaints_dict.values()), mode='lines', name='Polya Urn Model Results'))
# add line with color
fig.add_trace(go.Scatter(x=list(upper_urn_ci_complaints_dict.keys()), y=list(upper_urn_ci_complaints_dict.values()), mode='lines', name='Polya Urn Model 95% Confidence Interval', line=dict(color='rgb(66, 81, 245)', dash='dash')))
fig.add_trace(go.Scatter(x=list(lower_urn_ci_complaints_dict.keys()), y=list(lower_urn_ci_complaints_dict.values()), mode='lines', name='Polya Urn Model 95% Confidence Interval', line=dict(color='rgb(66, 81, 245)', dash='dash'), fill='tonexty', fillcolor='rgba(66, 81, 245,0.2)'))
fig.add_trace(go.Scatter(x=officer_counts[0], y=officer_counts[1], mode='lines', name='Data'))
fig.update_layout(title_text='Spillover Polya Urn Model Results')
fig.show()