
# Analysing Police Activity

In [889]:
import pandas as pd


# Cleaning

In [890]:
df = pd.read_csv('hartford.csv')
print("The data has {} columns and {} rows.".format(df.shape[0], df.shape[1]))
display(df.head())

The data has 18439 columns and 26 rows.


Unnamed: 0,raw_row_number,date,time,location,lat,lng,district,subject_age,subject_race,subject_sex,...,outcome,contraband_found,search_conducted,search_vehicle,search_basis,reason_for_stop,raw_subject_race_code,raw_subject_ethnicity_code,raw_search_authorization_code,raw_intervention_disposition_code
0,1,2013-10-13,15:21:00,LINNMORE ST AT ROGER,41.732189,-72.69976,SOUTH WEST,38.0,hispanic,female,...,citation,,False,False,,Stop Sign,W,H,N,I
1,2,2013-10-24,01:12:00,HAWTHORN STREET @ IMLAY STREET,41.764621,-72.695362,ASYLUM HILL,20.0,black,male,...,citation,,False,False,,Defective Lights,B,N,N,I
2,3,2013-10-26,10:06:00,NEW PARK MIRRILL,41.747837,-72.712933,PARKVILLE,26.0,white,female,...,citation,,False,False,,Traffic Control Signal,W,N,N,I
3,4,2013-10-26,18:06:00,nfew park at merrill st,41.748069,-72.712641,PARKVILLE,26.0,white,female,...,citation,,False,False,,Traffic Control Signal,W,N,N,I
4,5,2013-10-26,19:56:00,SUMMIT ST AT ZION ST,41.753945,-72.693278,FROG HOLLOW,39.0,white,male,...,citation,,False,False,,Stop Sign,W,N,N,I


In [891]:


df.columns

Index(['raw_row_number', 'date', 'time', 'location', 'lat', 'lng', 'district',
       'subject_age', 'subject_race', 'subject_sex', 'officer_id_hash',
       'department_name', 'type', 'arrest_made', 'citation_issued',
       'search_vehicle', 'search_basis', 'reason_for_stop',
       'raw_subject_race_code', 'raw_subject_ethnicity_code',
       'raw_search_authorization_code', 'raw_intervention_disposition_code'],
      dtype='object')

In [892]:
df = df[['subject_sex', 'arrest_made', 'citation_issued', 'warning_issued', 'outcome', 'reason_for_stop', 'subject_age', 'search_conducted', 'officer_id_hash']]
num_vars = df.columns[df.dtypes != 'object']
cat_vars = df.columns[df.dtypes == 'object']
print(num_vars)
print(cat_vars)

Index(['subject_sex', 'arrest_made', 'outcome', 'reason_for_stop',
       'officer_id_hash'],
      dtype='object')


In [893]:
df[num_vars].isnull().sum().sort_values(ascending=False)/df.shape[0]

subject_age         0.000217
citation_issued     0.000000
search_conducted    0.000000
dtype: float64

In [894]:
meanVal = df['subject_age'].mean()
df['subject_age'].fillna(value=meanVal, inplace=True)

In [895]:
df[num_vars].isnull().sum().sort_values(ascending=False)/df.shape[0]

citation_issued     0.0
subject_age         0.0
search_conducted    0.0
dtype: float64

In [896]:
df[cat_vars].isnull().sum().sort_values(ascending=False)/df.shape[0]

outcome            0.136775
arrest_made        0.000271
subject_sex        0.000000
reason_for_stop    0.000000
officer_id_hash    0.000000
dtype: float64

In [897]:
df[cat_vars] = df[cat_vars].fillna(value="Missing data")
df[cat_vars].isnull().sum().sort_values(ascending=False)/df.shape[0]


subject_sex        0.0
arrest_made        0.0
outcome            0.0
reason_for_stop    0.0
officer_id_hash    0.0
dtype: float64

# Do men or women speed more often?

In [898]:
speeding_df = df[df['reason_for_stop'] == 'Speed Related']
gender_counts = speeding_df.groupby(['subject_sex']).size().reset_index(name='Count')
gender_counts

Unnamed: 0,subject_sex,Count
0,female,1551
1,male,1983


In [899]:
male_num = gender_counts.iloc[1,1]
female_num = gender_counts.iloc[0,1]
total_num = gender_counts['Count'].sum()

print(f"Total amount of stops related to speeding is: {total_num}")
print(f"Of which male percentage is: {round(male_num / total_num * 100)}%,")
print(f"and female percentage is: {round(female_num / total_num * 100)}%")

Total amount of stops related to speeding is: 3534
Of which male percentage is: 56%,
and female percentage is: 44%


# Does gender affect who gets searched during a stop?

In [900]:

# Calculate the total stops by gender
gender_stops_counts = df['subject_sex'].value_counts().reset_index()
gender_stops_counts


Unnamed: 0,subject_sex,count
0,male,11426
1,female,7013


In [901]:
total_stops_male = gender_stops_counts.iloc[0,1]
total_stops_female = gender_stops_counts.iloc[1,1]
total_stops = df['subject_sex'].count()

searched_male_df = df[(df['search_conducted'] == True) & (df['subject_sex'] == 'male')].shape[0]
searched_female_df = df[(df['search_conducted'] == True) & (df['subject_sex'] == 'female')].shape[0]

print(f"Total amount of stops is: {total_stops}")
print(f"Of which male who got searched percentage is: {round(total_stops_male / total_stops * 100)}%,")
print(f"and female percentage is: {round(total_stops_female / total_stops * 100)}%")

Total amount of stops is: 18439
Of which male who got searched percentage is: 62%,
and female percentage is: 38%


In [903]:

officer_id_df = speeding_df.groupby(['officer_id_hash']).size().sort_values(ascending=False).reset_index(name='Amount of arrests')
officer_id_df

Unnamed: 0,officer_id_hash,Amount of arrests
0,df0b54e2c2,905
1,09739d95d7,530
2,720f053826,478
3,81f290e452,431
4,484fe0dd72,383
...,...,...
86,8a76987f67,1
87,8e96048f74,1
88,95d30edaab,1
89,96c5c7ab1a,1
