# Data Analysis Project 3: Police Dataset

- https://youtu.be/GyUbo45mVSE&list=PLy3lFw0OTlutzXFVwttrtaRGEEyLEdnpy

In [1]:
import numpy as np
import pandas as pd

In [2]:
df = pd.read_csv("data/police_data.csv")
df.head()

Unnamed: 0,stop_date,stop_time,country_name,driver_gender,driver_age_raw,driver_age,driver_race,violation_raw,violation,search_conducted,search_type,stop_outcome,is_arrested,stop_duration,drugs_related_stop
0,1/2/2005,1:55,,M,1985.0,20.0,White,Speeding,Speeding,False,,Citation,False,0-15 Min,False
1,1/18/2005,8:15,,M,1965.0,40.0,White,Speeding,Speeding,False,,Citation,False,0-15 Min,False
2,1/23/2005,23:15,,M,1972.0,33.0,White,Speeding,Speeding,False,,Citation,False,0-15 Min,False
3,2/20/2005,17:15,,M,1986.0,19.0,White,Call for Service,Other,False,,Arrest Driver,True,16-30 Min,False
4,3/14/2005,10:00,,F,1984.0,21.0,White,Speeding,Speeding,False,,Citation,False,0-15 Min,False


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 65535 entries, 0 to 65534
Data columns (total 15 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   stop_date           65535 non-null  object 
 1   stop_time           65535 non-null  object 
 2   country_name        0 non-null      float64
 3   driver_gender       61474 non-null  object 
 4   driver_age_raw      61481 non-null  float64
 5   driver_age          61228 non-null  float64
 6   driver_race         61475 non-null  object 
 7   violation_raw       61475 non-null  object 
 8   violation           61475 non-null  object 
 9   search_conducted    65535 non-null  bool   
 10  search_type         2479 non-null   object 
 11  stop_outcome        61475 non-null  object 
 12  is_arrested         61475 non-null  object 
 13  stop_duration       61475 non-null  object 
 14  drugs_related_stop  65535 non-null  bool   
dtypes: bool(2), float64(3), object(10)
memory usage: 6.6+

## Q1: Remove the column that only contains missing values
**(Data Cleaning)**

In [4]:
df.isnull().sum() / len(df) * 100

stop_date               0.000000
stop_time               0.000000
country_name          100.000000
driver_gender           6.196689
driver_age_raw          6.186007
driver_age              6.572061
driver_race             6.195163
violation_raw           6.195163
violation               6.195163
search_conducted        0.000000
search_type            96.217288
stop_outcome            6.195163
is_arrested             6.195163
stop_duration           6.195163
drugs_related_stop      0.000000
dtype: float64

In [5]:
df.drop(columns=['country_name', 'search_type'], inplace=True)

In [6]:
df.sample(3)

Unnamed: 0,stop_date,stop_time,driver_gender,driver_age_raw,driver_age,driver_race,violation_raw,violation,search_conducted,stop_outcome,is_arrested,stop_duration,drugs_related_stop
23388,1/28/2008,10:25,F,1981.0,27.0,White,Speeding,Speeding,False,Citation,False,0-15 Min,False
64435,10/15/2012,22:28,M,1988.0,24.0,White,Speeding,Speeding,True,Citation,False,16-30 Min,False
17068,5/19/2007,6:27,M,1965.0,42.0,Black,Speeding,Speeding,False,Citation,False,0-15 Min,False


## Q2: For Speeding, were Men or Women stopped more often?
**(Filtering, Value Counts)**

In [7]:
df[df['violation']=='Speeding']['driver_gender'].value_counts()

M    25517
F    11686
Name: driver_gender, dtype: int64

## Q3: Does gender affect who gets searched during a stop?
**(Groupby)**

In [8]:
df.groupby('driver_gender')[['search_conducted']].sum()

Unnamed: 0_level_0,search_conducted
driver_gender,Unnamed: 1_level_1
F,366
M,2113


## Q4: What is the mean stop_duration?
**(Mapping, datetype casting)**

In [9]:
df['stop_duration'].value_counts()

0-15 Min     47379
16-30 Min    11448
30+ Min       2647
2                1
Name: stop_duration, dtype: int64

In [10]:
stop_duration_dict = {"0-15 Min":8, "16-30 Min":24, "30+ Min":45}
df['stop_duration'] = df['stop_duration'].map(stop_duration_dict)

In [11]:
df.head()

Unnamed: 0,stop_date,stop_time,driver_gender,driver_age_raw,driver_age,driver_race,violation_raw,violation,search_conducted,stop_outcome,is_arrested,stop_duration,drugs_related_stop
0,1/2/2005,1:55,M,1985.0,20.0,White,Speeding,Speeding,False,Citation,False,8.0,False
1,1/18/2005,8:15,M,1965.0,40.0,White,Speeding,Speeding,False,Citation,False,8.0,False
2,1/23/2005,23:15,M,1972.0,33.0,White,Speeding,Speeding,False,Citation,False,8.0,False
3,2/20/2005,17:15,M,1986.0,19.0,White,Call for Service,Other,False,Arrest Driver,True,24.0,False
4,3/14/2005,10:00,F,1984.0,21.0,White,Speeding,Speeding,False,Citation,False,8.0,False


In [12]:
print("Mean Stop Duration:")
df['stop_duration'].mean()

Mean Stop Duration:


12.572778735725672

## Q5: Compare the age distributions for each violations
**(Groupby, describe)**

In [13]:
df.groupby('violation')[['driver_age']].describe()

Unnamed: 0_level_0,driver_age,driver_age,driver_age,driver_age,driver_age,driver_age,driver_age,driver_age
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
violation,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
Equipment,6507.0,31.682957,11.380671,16.0,23.0,28.0,39.0,81.0
Moving violation,11876.0,36.736443,13.25835,15.0,25.0,35.0,47.0,86.0
Other,3477.0,40.362381,12.754423,16.0,30.0,41.0,50.0,86.0
Registration/plates,2240.0,32.656696,11.15078,16.0,24.0,30.0,40.0,74.0
Seat belt,3.0,30.333333,10.214369,23.0,24.5,26.0,34.0,42.0
Speeding,37120.0,33.262581,12.615781,15.0,23.0,30.0,42.0,88.0
