### Police Dataset
Here, the data from a police check post is given.

This data is available as a CSV file. We are going to analyze this data set using the Pandas DataFrame.

In [1]:
import numpy as np
import pandas as pd


In [3]:
# Load the police check-post records from the CSV file into a DataFrame.
data = pd.read_csv(filepath_or_buffer='Police_Data.csv')

In [4]:
# Preview the first five records.
data.head(5)

Unnamed: 0,stop_date,stop_time,country_name,driver_gender,driver_age_raw,driver_age,driver_race,violation_raw,violation,search_conducted,search_type,stop_outcome,is_arrested,stop_duration,drugs_related_stop
0,1/2/2005,1:55,,M,1985.0,20.0,White,Speeding,Speeding,False,,Citation,False,0-15 Min,False
1,1/18/2005,8:15,,M,1965.0,40.0,White,Speeding,Speeding,False,,Citation,False,0-15 Min,False
2,1/23/2005,23:15,,M,1972.0,33.0,White,Speeding,Speeding,False,,Citation,False,0-15 Min,False
3,2/20/2005,17:15,,M,1986.0,19.0,White,Call for Service,Other,False,,Arrest Driver,True,16-30 Min,False
4,3/14/2005,10:00,,F,1984.0,21.0,White,Speeding,Speeding,False,,Citation,False,0-15 Min,False


In [5]:
# (number of rows, number of columns) of the loaded frame.
data.shape

(65535, 15)

## Instruction (For Data Cleaning)
## 1. Remove the column that contains only missing values

In [8]:
# For each column, report whether every one of its values is missing.
data.isnull().all()

stop_date             False
stop_time             False
country_name           True
driver_gender         False
driver_age_raw        False
driver_age            False
driver_race           False
violation_raw         False
violation             False
search_conducted      False
search_type           False
stop_outcome          False
is_arrested           False
stop_duration         False
drugs_related_stop    False
dtype: bool

#### From the above output we can see that `country_name` contains only null values, so we need to remove it

In [10]:
# Remove every column that contains nothing but missing values
# (per the check above, that is only `country_name`).
# Re-assigning the result, rather than dropping a hard-coded name in place,
# keeps this cell re-runnable: a second execution is a no-op instead of a
# KeyError for the already-removed column.
data = data.dropna(axis='columns', how='all')

In [11]:
# Confirm the column is gone by previewing the first three records.
data.head(3)

Unnamed: 0,stop_date,stop_time,driver_gender,driver_age_raw,driver_age,driver_race,violation_raw,violation,search_conducted,search_type,stop_outcome,is_arrested,stop_duration,drugs_related_stop
0,1/2/2005,1:55,M,1985.0,20.0,White,Speeding,Speeding,False,,Citation,False,0-15 Min,False
1,1/18/2005,8:15,M,1965.0,40.0,White,Speeding,Speeding,False,,Citation,False,0-15 Min,False
2,1/23/2005,23:15,M,1972.0,33.0,White,Speeding,Speeding,False,,Citation,False,0-15 Min,False


## Question (Based on Filtering + Value Counts)
## 2. For speeding, were men or women stopped more often?

In [21]:
# Keep only the stops whose violation is 'Speeding', then preview them.
speed_violation = data.loc[data['violation'].eq('Speeding')]
speed_violation.head(5)

Unnamed: 0,stop_date,stop_time,driver_gender,driver_age_raw,driver_age,driver_race,violation_raw,violation,search_conducted,search_type,stop_outcome,is_arrested,stop_duration,drugs_related_stop
0,1/2/2005,1:55,M,1985.0,20.0,White,Speeding,Speeding,False,,Citation,False,0-15 Min,False
1,1/18/2005,8:15,M,1965.0,40.0,White,Speeding,Speeding,False,,Citation,False,0-15 Min,False
2,1/23/2005,23:15,M,1972.0,33.0,White,Speeding,Speeding,False,,Citation,False,0-15 Min,False
4,3/14/2005,10:00,F,1984.0,21.0,White,Speeding,Speeding,False,,Citation,False,0-15 Min,False
6,4/1/2005,17:30,M,1969.0,36.0,White,Speeding,Speeding,False,,Citation,False,0-15 Min,False


In [22]:
# Count the speeding stops per driver gender.
speed_violation['driver_gender'].value_counts()

M    25517
F    11686
Name: driver_gender, dtype: int64

## Question (Groupby)
## 3. Does gender affect who gets searched during a stop?

In [28]:
# Number of stops per gender in which a search was conducted:
# summing the True/False `search_conducted` flags counts the True values.
data.groupby('driver_gender').search_conducted.sum()

driver_gender
F     366
M    2113
Name: search_conducted, dtype: int64

In [29]:
# Overall split between stops with and without a search.
data['search_conducted'].value_counts()

False    63056
True      2479
Name: search_conducted, dtype: int64

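### A total of 2479 searches were conducted: 366 on female drivers and 2113 on male drivers

Note: these are raw counts. Because the number of stops can differ by gender (for speeding alone, 25517 male vs. 11686 female stops above), compare the search *rate* per gender — e.g. `data.groupby('driver_gender')['search_conducted'].mean()` — before concluding whether gender affects who gets searched.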

## Question (Mapping + Data-type Casting)
## 4. What is the mean stop_duration?

In [32]:
# Inspect which duration labels occur and how often.
data['stop_duration'].value_counts()

0-15 Min     47379
16-30 Min    11448
30+ Min       2647
2                1
Name: stop_duration, dtype: int64

In [33]:
# Representative duration in minutes for each text bucket: 7.5 is the
# midpoint of '0-15 Min'; 24 and 45 are the values this analysis uses for
# the '16-30 Min' and open-ended '30+ Min' buckets.
duration_minutes = {'0-15 Min': 7.5, '16-30 Min': 24, '30+ Min': 45}

# Convert only while the column still holds the text labels. Mapping an
# already-converted (numeric) column through the string-keyed dict would
# turn every value into NaN, so the guard makes re-running this cell a no-op.
# Labels that are not in the dict (e.g. the stray '2') and missing values
# become NaN and are therefore ignored by later aggregations.
if not pd.api.types.is_numeric_dtype(data['stop_duration']):
    data['stop_duration'] = data['stop_duration'].map(duration_minutes)

In [34]:
# Check the converted stop_duration values on the first four records.
data.head(4)

Unnamed: 0,stop_date,stop_time,driver_gender,driver_age_raw,driver_age,driver_race,violation_raw,violation,search_conducted,search_type,stop_outcome,is_arrested,stop_duration,drugs_related_stop
0,1/2/2005,1:55,M,1985.0,20.0,White,Speeding,Speeding,False,,Citation,False,7.5,False
1,1/18/2005,8:15,M,1965.0,40.0,White,Speeding,Speeding,False,,Citation,False,7.5,False
2,1/23/2005,23:15,M,1972.0,33.0,White,Speeding,Speeding,False,,Citation,False,7.5,False
3,2/20/2005,17:15,M,1986.0,19.0,White,Call for Service,Other,False,,Arrest Driver,True,24.0,False


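### Mean Stop Duration

This is an estimate: each stop is represented by a single value for its duration bucket (7.5, 24 or 45 minutes), and rows with a missing or unrecognised duration (such as the single `2` entry) are excluded from the mean.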

In [36]:
# Average of the numeric stop durations (missing values are skipped).
data.stop_duration.mean()

12.187420698181345

## Question (Groupby, Describe)
## 5. Compare the age distributions for each violation

In [41]:
# Summary statistics of driver age, one row per violation type.
data.groupby('violation').driver_age.describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
violation,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Equipment,6507.0,31.682957,11.380671,16.0,23.0,28.0,39.0,81.0
Moving violation,11876.0,36.736443,13.25835,15.0,25.0,35.0,47.0,86.0
Other,3477.0,40.362381,12.754423,16.0,30.0,41.0,50.0,86.0
Registration/plates,2240.0,32.656696,11.15078,16.0,24.0,30.0,40.0,74.0
Seat belt,3.0,30.333333,10.214369,23.0,24.5,26.0,34.0,42.0
Speeding,37120.0,33.262581,12.615781,15.0,23.0,30.0,42.0,88.0
