### Do the genders commit different violations?
#### Counting unique values
- `value_counts():` Counts the unique values in a Series
- Best suited for categorical data

In [1]:
import pandas as pd

ri = pd.read_csv('police.csv')

In [2]:
ri.stop_outcome.value_counts()

Citation            77092
Arrest Driver        2735
No Action             625
N/D                   607
Arrest Passenger      343
Name: stop_outcome, dtype: int64

In [3]:
ri.stop_outcome.value_counts().sum()

86539

In [4]:
ri.shape

(91741, 15)

### Expressing counts as proportions

In [5]:
ri.stop_outcome.value_counts(normalize=True)

Citation            0.890835
Arrest Driver       0.031604
No Action           0.007222
N/D                 0.007014
Arrest Passenger    0.003964
Name: stop_outcome, dtype: float64

### Filtering DataFrame rows

In [6]:
ri.driver_race.value_counts()

White       61872
Black       12285
Hispanic     9727
Asian        2390
Other         265
Name: driver_race, dtype: int64

In [7]:
white = ri[ri.driver_race == 'White']

In [8]:
white.shape

(61872, 15)

### Comparing stop outcomes for two groups

In [9]:
white.stop_outcome.value_counts(normalize=True)

Citation            0.902234
Arrest Driver       0.024017
No Action           0.007047
N/D                 0.006433
Arrest Passenger    0.002748
Name: stop_outcome, dtype: float64

In [10]:
asian = ri[ri.driver_race == 'Asian']

In [11]:
asian.stop_outcome.value_counts(normalize=True)

Citation            0.923013
Arrest Driver       0.017573
No Action           0.008368
N/D                 0.004184
Arrest Passenger    0.001674
Name: stop_outcome, dtype: float64

In [12]:
# Count the unique values in 'violation'
print(ri.violation.value_counts())

# Express the counts as proportions
print(ri.violation.value_counts(normalize=True))

Speeding               48424
Moving violation       16224
Equipment              10922
Other                   4410
Registration/plates     3703
Seat belt               2856
Name: violation, dtype: int64
Speeding               0.559563
Moving violation       0.187476
Equipment              0.126209
Other                  0.050960
Registration/plates    0.042790
Seat belt              0.033002
Name: violation, dtype: float64


In [13]:
# Create a DataFrame of female drivers
female = ri[ri.driver_gender == 'F']

# Create a DataFrame of male drivers
male = ri[ri.driver_gender == 'M']

# Compute the violations by female drivers (as proportions)
print(female.violation.value_counts(normalize=True))

# Compute the violations by male drivers (as proportions)
print(male.violation.value_counts(normalize=True))

Speeding               0.658114
Moving violation       0.138218
Equipment              0.105199
Registration/plates    0.044418
Other                  0.029738
Seat belt              0.024312
Name: violation, dtype: float64
Speeding               0.522243
Moving violation       0.206144
Equipment              0.134158
Other                  0.058985
Registration/plates    0.042175
Seat belt              0.036296
Name: violation, dtype: float64


### Does gender affect who gets a ticket for speeding?

### Filtering by multiple conditions

In [14]:
female = ri[ri.driver_gender == 'F']
female.shape

(23774, 15)

In [15]:
female_and_arrested = ri[(ri.driver_gender == 'F') &
                        (ri.is_arrested == True)]

- Each conditions is surrounded by parentheses
- Ampersand (&) represents the `and` operator

In [16]:
female_and_arrested.shape

(669, 15)

In [17]:
female_or_arrested = ri[(ri.driver_gender == 'F') |
                       (ri.is_arrested == True)]
female_or_arrested.shape

(26183, 15)

### Rules for filtering by multiple conditions
- Ampersand (&): only include rows that satisfy both conditions
- Pipe (|): include rows that satisfy either condition
- Each condition must be surrounded by parentheses
- Conditions can check for equality (==), inequality (!=), etc
- Can use more than two conditions

### Correlation, not causation
- Analyze the relationship between gender and stop outcome
 - Assess whether there is a correlation
- Not going to draw any conclusions about causation
 - Would need additional data and expertise
 - Exploring relationships only

In [18]:
# Create a DataFrame of female drivers stopped for speeding
female_and_speeding = ri[(ri.driver_gender == 'F') & (ri.violation == 'Speeding')]

# Create a DataFrame of male drivers stopped for speeding
male_and_speeding = ri[(ri.driver_gender == 'M') & (ri.violation == 'Speeding')]

# Compute the stop outcomes for female drivers (as proportions)
print(female_and_speeding.stop_outcome.value_counts(normalize = True))

# Compute the stop outcomes for male drivers (as proportions)
print(male_and_speeding.stop_outcome.value_counts(normalize = True))

Citation            0.952192
Arrest Driver       0.005752
N/D                 0.000959
Arrest Passenger    0.000639
No Action           0.000383
Name: stop_outcome, dtype: float64
Citation            0.944595
Arrest Driver       0.015895
Arrest Passenger    0.001281
No Action           0.001068
N/D                 0.000976
Name: stop_outcome, dtype: float64


### Does gender affect whose vehicle is searched?

### Math with boolean values

In [19]:
ri.isnull().sum()

state                     0
stop_date                 0
stop_time                 0
county_name           91741
driver_gender          5205
driver_race            5202
violation_raw          5202
violation              5202
search_conducted          0
search_type           88434
stop_outcome           5202
is_arrested            5202
stop_duration          5202
drugs_related_stop        0
district                  0
dtype: int64

In [20]:
import numpy as np

### Taking the mean of a Boolean series

In [21]:
ri.is_arrested.value_counts(normalize=True)

False    0.964432
True     0.035568
Name: is_arrested, dtype: float64

In [22]:
ri.is_arrested.mean()

0.03556777868937704

In [23]:
ri.is_arrested.dtype

dtype('O')

### Comparing groups using groupby
- Study the arrest rate by police district

In [24]:
ri.district.unique()

array(['Zone X4', 'Zone K3', 'Zone X1', 'Zone X3', 'Zone K1', 'Zone K2'],
      dtype=object)

In [25]:
ri[ri.district == 'Zone K1'].is_arrested.mean()

0.024346149210558034

In [26]:
ri[ri.district == 'Zone K2'].is_arrested.mean()

0.030800588834786546

In [27]:
ri['is_arrested'] = ri.is_arrested.astype('bool')

In [28]:
ri.groupby('district').is_arrested.mean()

district
Zone K1    0.067181
Zone K2    0.069522
Zone K3    0.072286
Zone X1    0.230044
Zone X3    0.082466
Zone X4    0.117674
Name: is_arrested, dtype: float64

### Grouping by multiple categories

In [29]:
ri.groupby(['district', 'driver_gender']).is_arrested.mean()

district  driver_gender
Zone K1   F                0.019169
          M                0.026588
Zone K2   F                0.022196
          M                0.034285
Zone K3   F                0.025156
          M                0.034961
Zone X1   F                0.019646
          M                0.024563
Zone X3   F                0.027188
          M                0.038166
Zone X4   F                0.042149
          M                0.049956
Name: is_arrested, dtype: float64

In [30]:
ri.groupby(['driver_gender', 'district']).is_arrested.mean()

driver_gender  district
F              Zone K1     0.019169
               Zone K2     0.022196
               Zone K3     0.025156
               Zone X1     0.019646
               Zone X3     0.027188
               Zone X4     0.042149
M              Zone K1     0.026588
               Zone K2     0.034285
               Zone K3     0.034961
               Zone X1     0.024563
               Zone X3     0.038166
               Zone X4     0.049956
Name: is_arrested, dtype: float64

In [32]:
# Check the data type of 'search_conducted'
print(ri.search_conducted.dtype)

# Calculate the search rate by counting the values
print(ri.search_conducted.value_counts(normalize=True))

# Calculate the search rate by taking the mean
print(ri.search_conducted.mean())

bool
False    88434
True      3307
Name: search_conducted, dtype: int64
0.03604713268876511
