In [1]:
import pandas as pd
import os

## Obtain a dataset from data.gov

https://catalog.data.gov/dataset

In [2]:
from six.moves import urllib

# where the datasets will be placed
ROOT_DATA = "../../../ROOT_DATA/data_gov/"

def fetch_data_from_URL(housing_url, file_name, sub_dir="tmp", root_path=ROOT_DATA):
    """Download a file once and return the path of the local copy.

    The file is stored at ``<root_path>/<sub_dir>/<file_name>``. If a file
    already exists there it is reused and nothing is downloaded.

    Parameters
    ----------
    housing_url : str
        URL to download. Despite the name, nothing here is specific to
        housing data; the value is passed straight to ``urlretrieve``.
    file_name : str
        Name of the file inside the target directory.
    sub_dir : str, optional
        Sub-directory of ``root_path`` that receives the file.
    root_path : str, optional
        Base directory for downloaded datasets.

    Returns
    -------
    str
        Path of the local file.

    Raises
    ------
    OSError
        If the target directory cannot be created.
    Exception
        Any error raised by ``urlretrieve`` is propagated after the
        partially written file has been removed.
    """
    placement_dir = os.path.join(root_path, sub_dir)
    # Create first and inspect the failure afterwards: a directory that shows
    # up between an "exists?" check and makedirs() must not be an error.
    try:
        os.makedirs(placement_dir)
    except OSError:
        if not os.path.isdir(placement_dir):
            raise
    placement_path = os.path.join(placement_dir, file_name)
    # only download if not already present
    if not os.path.isfile(placement_path):
        # Download under a temporary name so an interrupted transfer never
        # leaves a truncated file that later runs would mistake for a
        # complete download.
        partial_path = placement_path + ".part"
        try:
            urllib.request.urlretrieve(housing_url, partial_path)
        except BaseException:
            # BaseException so that interrupting the kernel also cleans up.
            if os.path.isfile(partial_path):
                os.remove(partial_path)
            raise
        os.rename(partial_path, placement_path)
    return placement_path

In [3]:
# .CSV data: traffic violations export from data.montgomerycountymd.gov,
# stored locally in the "traffic" sub-directory
traffic_csv_path = fetch_data_from_URL(
    housing_url="https://data.montgomerycountymd.gov/api/views/4mse-ku6q/rows.csv?accessType=DOWNLOAD",
    file_name="traffic_violations.csv",
    sub_dir="traffic",
)

In [4]:
# read entire file into a dataframe
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, which is the remedy pandas' mixed-dtype warning
# suggests. NOTE(review): presumably that is the warning this cell emitted on
# load -- confirm on re-run.
t_df = pd.read_csv(traffic_csv_path, low_memory=False)
# DataFrame.info() writes the summary itself and returns None, so wrapping it
# in print() only appends a stray "None" line to the output.
t_df.info()

  interactivity=interactivity, compiler=compiler, result=result)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1251972 entries, 0 to 1251971
Data columns (total 35 columns):
Date Of Stop               1251972 non-null object
Time Of Stop               1251972 non-null object
Agency                     1251972 non-null object
SubAgency                  1251962 non-null object
Description                1251963 non-null object
Location                   1251970 non-null object
Latitude                   1158178 non-null float64
Longitude                  1158178 non-null float64
Accident                   1251972 non-null object
Belts                      1251972 non-null object
Personal Injury            1251972 non-null object
Property Damage            1251972 non-null object
Fatal                      1251972 non-null object
Commercial License         1251972 non-null object
HAZMAT                     1251972 non-null object
Commercial Vehicle         1251972 non-null object
Alcohol                    1251972 non-null object
Work Zone         

In [5]:
# Charge codes as a Series, followed by the first six rows of their frequency table
cid_ds = t_df["Charge"]
print(cid_ds.value_counts().iloc[:6])

21-801.1      162237
21-201(a1)     99235
13-409(b)      59747
13-401(h)      43626
21-707(a)      39833
16-112(c)      32929
Name: Charge, dtype: int64


In [6]:
# custom function
# https://law.justia.com/codes/maryland/2005/gtr/13-401.html
# 13-401 seems to relate to ~registration
# 21-403 seems to relate to ~stop signs
# 21-801 seems to relate to ~speeding
def classify_charge(charge_id):
    """Map a charge code to a coarse, human-readable category.

    Parameters
    ----------
    charge_id : str
        Charge code such as ``"21-801.1"`` or ``"13-401(h)"``. Values without
        a ``startswith`` method (e.g. NaN or None from missing cells) are
        accepted and reported as unclassified.

    Returns
    -------
    str
        One of ``"speed related"``,
        ``"registration of a vehicle is suspended"``,
        ``"registration related"``, ``"stop sign"`` or ``"not classified"``.
    """
    # Missing values in a pandas column arrive as NaN (a float) or None; they
    # have no .startswith(), so classify them explicitly instead of raising
    # AttributeError in the middle of Series.apply().
    if not hasattr(charge_id, "startswith"):
        return "not classified"
    if charge_id.startswith("21-801"):
        return "speed related"
    # The exact code must be tested before the generic "13-401" prefix,
    # otherwise the prefix branch would claim it.
    if charge_id == "13-401(h)":
        return "registration of a vehicle is suspended"
    if charge_id.startswith("13-401"):
        return "registration related"
    if charge_id.startswith("21-403"):
        return "stop sign"
    return "not classified"

In [7]:
# Slice the 60 rows to display first and classify only those; the printed
# Series is the same as classifying the whole column and slicing afterwards,
# without calling classify_charge on every row.
print(cid_ds.head(60).apply(classify_charge))

0     registration of a vehicle is suspended
1                             not classified
2                                  stop sign
3                             not classified
4                             not classified
5                             not classified
6     registration of a vehicle is suspended
7                             not classified
8                             not classified
9                             not classified
10                            not classified
11                            not classified
12                            not classified
13                            not classified
14                            not classified
15                            not classified
16                            not classified
17                            not classified
18                            not classified
19                            not classified
20                            not classified
21                            not classified
22        

In [8]:
# using lambda
# head() is taken before apply() so the lambda runs only on the rows that are
# printed rather than on the whole column; the printed Series is unchanged.
print(cid_ds.head().apply(lambda charge_id: "stop sign" if charge_id.startswith("21-403") else "unsure"))

0       unsure
1       unsure
2    stop sign
3       unsure
4       unsure
Name: Charge, dtype: object
