In [1]:
import pandas as pd
import time
import plotly.express as px

df = pd.read_csv(
    '/Users/jeremyhudsonchan/Dropbox/Files/Boston_College_Courses/Thesis/Code/Undergraduate-Thesis/Preprocessing_Code/final/data/proportions.csv', low_memory=False)
officers = pd.read_csv(
    '/Users/jeremyhudsonchan/Dropbox/Files/Boston_College_Courses/Thesis/Code/Undergraduate-Thesis/Preprocessing_Code/final/data/perm_unique_officers.csv', low_memory=False)

# group by CRID
df_grouped = df.groupby('CRID')

In [2]:
# ----------- Start of Polya Urn Model -----------
weighted_officers = officers

weighted_officers['Weights'] = 0
weighted_officers['Occurences'] = 1

# groupby beat
weighted_officers_grouped = weighted_officers.groupby('Beat')
# make weights sum to one in each group
for name, group in weighted_officers_grouped:
    weighted_officers.loc[weighted_officers['Beat'] ==
                          name, 'Weights'] = 1 / len(group)

weighted_df = pd.DataFrame(
    columns=['OfficerID', 'Beat'])

polya_urn_results = []
# Add runtime
start_time = time.time()
for i in range(3):
    alpha = 0.1
    perm_df = pd.DataFrame(columns=['Occurences', 'Frequencies'])
    print(i)
    temp_weighted_officers = weighted_officers
    ts = 0
    for index, row in df_grouped:
        # get num officers in group
        num_officers = len(row)
        # get beat of group
        beat = row['Beat'].iloc[0]
        if beat not in weighted_officers_grouped.groups:
            continue
        # polya urn model, rich get richer
        weights = temp_weighted_officers.loc[temp_weighted_officers['Beat']
                                             == beat, 'Weights']
        # if num_officers > officers in beat print error
        if num_officers > len(weights):
            # ts += 1
            # print("troubleshoot", ts)
            continue
        # sample officers from the beat using the weights in the Weights column
        officers_sample = weighted_officers_grouped.get_group(
            beat).sample(n=num_officers, replace=False, weights=weights)
        # add officers to new dataframe
        for index, row in officers_sample.iterrows():
            # pd concat
            weighted_df = pd.concat([weighted_df, pd.DataFrame(
                [[row['OfficerID'], row['Beat']]], columns=['OfficerID', 'Beat'])])
            # Update officer occurences in weighted_officers
            temp_weighted_officers.loc[temp_weighted_officers['OfficerID'] ==
                                       row['OfficerID'], 'Occurences'] += 1
            # update officer weights in weighted_officers
            temp_weighted_officers.loc[temp_weighted_officers['OfficerID'] ==
                                       row['OfficerID'], 'Weights'] += alpha
    # if number of occurences > 35, make them equal to 35
    temp_weighted_officers['Occurences'] = temp_weighted_officers['Occurences'].apply(
        lambda x: 35 if x > 35 else x)
    occurences = temp_weighted_officers['Occurences'].value_counts()
    # add occurences to perm_df
    occurences.index.name = 'Number of Allegations'
    occurences.name = 'Number of Officers'
    # save it to polya_urn_results
    polya_urn_results.append(occurences)
end_time = time.time()

0
1
2


In [3]:
print("Runtime: " + str(end_time - start_time))

Runtime: 151.51331114768982


In [4]:
print(polya_urn_results)

[Number of Allegations
1     4044
2     1034
3      704
4      501
5      368
6      266
7      238
9      174
8      166
10     136
11     112
12     103
13      80
14      57
15      51
16      44
17      31
18      29
19      25
20      24
21      24
22      17
24      16
35      15
25      11
23      11
30      10
28       9
27       8
33       5
29       4
32       4
31       4
26       2
34       1
Name: Number of Officers, dtype: int64, Number of Allegations
1     3368
2      839
3      560
4      421
5      388
6      298
7      277
8      213
35     204
9      179
10     168
11     144
12     142
13     117
17     100
16      96
15      92
14      90
19      78
18      75
20      67
23      53
21      50
24      47
22      40
25      33
28      30
26      30
30      26
32      23
27      20
29      18
33      18
31      17
34       7
Name: Number of Officers, dtype: int64, Number of Allegations
1     2990
2      774
35     462
3      460
4      394
5      323
6      278
7     

In [5]:
# 95% confidence interval for polya urn model
polya_conf_intv = pd.concat(polya_urn_results).groupby(level=0).quantile(
    [0.025, 0.975]).unstack()

In [6]:
# get average 
polya_df_vc = pd.concat(polya_urn_results).groupby(level=0).mean()

In [7]:
print(polya_df_vc.index)

Int64Index([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
            18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34,
            35],
           dtype='int64', name='Number of Allegations')


In [8]:
print(polya_df_vc.values)

[3467.33333333  882.33333333  574.66666667  438.66666667  359.66666667
  280.66666667  257.66666667  195.33333333  182.          158.
  143.33333333  131.33333333  104.66666667   94.66666667   85.66666667
   81.33333333   80.66666667   63.           59.66666667   55.66666667
   46.66666667   38.33333333   41.33333333   37.           35.33333333
   26.66666667   30.66666667   26.33333333   21.66666667   28.33333333
   20.66666667   18.66666667   19.66666667   13.33333333  227.        ]


In [11]:
# plot polya_df_vc, polya_conf_intv
fig = px.line(labels={
    'x': 'Number of Allegations', 'y': 'Number of Officers'})
fig.add_scatter(x=polya_df_vc.index, y=polya_df_vc.values,
                mode='lines', name='Polya Urn Model')
fig.add_scatter(x=polya_conf_intv.index, y=polya_conf_intv[0.025],
                mode='lines', name='Polya Urn Model 95% Confidence Interval')
fig.add_scatter(x=polya_conf_intv.index, y=polya_conf_intv[0.975],
                mode='lines', name='Polya Urn Model 95% Confidence Interval')
fig.update_layout(title_text='Polya Urn Model Results')
# add legend for each line
fig.update_layout(showlegend=True)
fig.show()