# Police Dataset
Here, the data from a Police Check Post is given.
The data is available as a CSV file. The Dataset will be analyzed using Pandas tools.

In [176]:
import pandas as pd
data = pd.read_csv(r'.\file.csv')
data.head()

Unnamed: 0,stop_date,stop_time,country_name,driver_gender,driver_age_raw,driver_age,driver_race,violation_raw,violation,search_conducted,search_type,stop_outcome,is_arrested,stop_duration,drugs_related_stop
0,1/2/2005,1:55,,M,1985.0,20.0,White,Speeding,Speeding,False,,Citation,False,0-15 Min,False
1,1/18/2005,8:15,,M,1965.0,40.0,White,Speeding,Speeding,False,,Citation,False,0-15 Min,False
2,1/23/2005,23:15,,M,1972.0,33.0,White,Speeding,Speeding,False,,Citation,False,0-15 Min,False
3,2/20/2005,17:15,,M,1986.0,19.0,White,Call for Service,Other,False,,Arrest Driver,True,16-30 Min,False
4,3/14/2005,10:00,,F,1984.0,21.0,White,Speeding,Speeding,False,,Citation,False,0-15 Min,False


## Data Cleaning
### 1. Remove the column that only contains missing values

In [178]:
#data.shape --> (65535, 15)
data.isnull().sum() #means remove country_name

stop_date                 0
stop_time                 0
country_name          65535
driver_gender          4061
driver_age_raw         4054
driver_age             4307
driver_race            4060
violation_raw          4060
violation              4060
search_conducted          0
search_type           63056
stop_outcome           4060
is_arrested            4060
stop_duration          4060
drugs_related_stop        0
dtype: int64

In [179]:
data.drop(columns = 'country_name', inplace = True)

In [180]:
#data.isnull().sum() function validates removal was completed

### 2. For speeding, were men or women stopped more often (for speeding)

In [182]:
#data.head()
#Method 1, looking at all violation types
data.groupby('driver_gender').count()
#Conclusion --> F: 16310, M: 45164 --> Men were stopped 2.76 times more often

Unnamed: 0_level_0,stop_date,stop_time,driver_age_raw,driver_age,driver_race,violation_raw,violation,search_conducted,search_type,stop_outcome,is_arrested,stop_duration,drugs_related_stop
driver_gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
F,16310,16310,16309,16275,16310,16310,16310,16310,366,16310,16310,16310,16310
M,45164,45164,45164,44947,45164,45164,45164,45164,2113,45164,45164,45164,45164


In [183]:
#Method 2, looking only at Speeding stops
data[data['violation'] == 'Speeding'].driver_gender.value_counts()
#Conclusion --> F: 11686, M: 25517 --> Men were stopped 2.76 times more often

driver_gender
M    25517
F    11686
Name: count, dtype: int64

### 3. Does Gender affect who gets searched during a stop?

In [185]:
data.groupby('driver_gender').search_conducted.sum()
#Conclusion --> F: 366, M: 2113 --> Men are searched more often

driver_gender
F     366
M    2113
Name: search_conducted, dtype: int64

### 4. What is the mean stop_duration

In [187]:
#have to remap the mean stop durations to allow for mean function to work
#data.head()
data['stop_duration'].value_counts()

stop_duration
0-15 Min     47379
16-30 Min    11448
30+ Min       2647
2                1
Name: count, dtype: int64

In [188]:
data['stop_duration'] = data['stop_duration'].map( {'0-15 Min' : 7.5, '16-30 Min' : 24, '30+ Min' : 45, '2':2})

In [189]:
data['stop_duration'].value_counts()

stop_duration
7.5     47379
24.0    11448
45.0     2647
2.0         1
Name: count, dtype: int64

In [190]:
#data.head()
data['stop_duration'].mean()
#Mean stop duration is 12.187254981699878

12.187254981699878

### 5. Compare the age distributions for each violation type

In [208]:
#data.head()
#decided to use the describe method grouped by violation
data.groupby('violation').driver_age.describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
violation,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Equipment,6507.0,31.682957,11.380671,16.0,23.0,28.0,39.0,81.0
Moving violation,11876.0,36.736443,13.25835,15.0,25.0,35.0,47.0,86.0
Other,3477.0,40.362381,12.754423,16.0,30.0,41.0,50.0,86.0
Registration/plates,2240.0,32.656696,11.15078,16.0,24.0,30.0,40.0,74.0
Seat belt,3.0,30.333333,10.214369,23.0,24.5,26.0,34.0,42.0
Speeding,37120.0,33.262581,12.615781,15.0,23.0,30.0,42.0,88.0


In [222]:
#Regarding speeding, with average age around 33, was curious about the time of day as a secondary point of interest
data[data['violation'] == 'Speeding'].stop_time.value_counts()

stop_time
9:00     178
11:00    177
10:00    176
10:30    168
9:30     157
        ... 
5:43       1
5:37       1
4:46       1
5:01       1
4:09       1
Name: count, Length: 1386, dtype: int64