In [1]:
import pandas as pd
import os

## Obtain a dataset from data.gov

https://catalog.data.gov/dataset

In [2]:
from six.moves import urllib

# where the datasets will be placed
ROOT_DATA = "../../../ROOT_DATA/data_gov/"

def fetch_data_from_URL(housing_url, file_name, sub_dir="tmp", root_path=ROOT_DATA):
    """Download ``housing_url`` once and return the local path of the cached copy.

    The file is stored at ``<root_path>/<sub_dir>/<file_name>``. If a file
    already exists at that path, nothing is downloaded and the existing path
    is returned.

    Parameters
    ----------
    housing_url : str
        URL to download from.
    file_name : str
        Name to give the downloaded file.
    sub_dir : str, optional
        Sub-directory of ``root_path`` that will hold the file.
    root_path : str, optional
        Root directory for downloaded datasets.

    Returns
    -------
    str
        Path of the local file.

    Raises
    ------
    Exception
        Whatever ``urlretrieve`` raises when the download fails; in that case
        no file is left at the returned location.
    """
    placement_dir = os.path.join(root_path, sub_dir)
    try:
        os.makedirs(placement_dir)
    except OSError:
        # The directory may already exist (possibly created concurrently);
        # only propagate the error if it is really missing.
        if not os.path.isdir(placement_dir):
            raise
    placement_path = os.path.join(placement_dir, file_name)
    # only download if not already present
    if not os.path.isfile(placement_path):
        # Download to a temporary name first so that an interrupted download
        # can never be mistaken for a complete, cached file on the next call.
        partial_path = placement_path + ".part"
        try:
            urllib.request.urlretrieve(housing_url, partial_path)
            os.rename(partial_path, placement_path)
        except BaseException:
            if os.path.isfile(partial_path):
                os.remove(partial_path)
            raise
    return placement_path

In [3]:
# Fetch the traffic-violations CSV (or reuse the copy already on disk)
# and keep the local path for the cells below.
traffic_csv_path = fetch_data_from_URL(
    housing_url="https://data.montgomerycountymd.gov/api/views/4mse-ku6q/rows.csv?accessType=DOWNLOAD",
    file_name="traffic_violations.csv",
    sub_dir="traffic",
)

In [4]:
# Read the entire file into a DataFrame.
# NOTE(review): the warning fragment in this cell's output looks like a
# mixed-type DtypeWarning; low_memory=False makes pandas infer each column's
# dtype from the whole file rather than chunk by chunk, which avoids it.
t_df = pd.read_csv(traffic_csv_path, low_memory=False)
# DataFrame.info() prints its summary itself and returns None, so wrapping it
# in print() would only add a stray "None" line to the output.
t_df.info()

  interactivity=interactivity, compiler=compiler, result=result)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1251972 entries, 0 to 1251971
Data columns (total 35 columns):
Date Of Stop               1251972 non-null object
Time Of Stop               1251972 non-null object
Agency                     1251972 non-null object
SubAgency                  1251962 non-null object
Description                1251963 non-null object
Location                   1251970 non-null object
Latitude                   1158178 non-null float64
Longitude                  1158178 non-null float64
Accident                   1251972 non-null object
Belts                      1251972 non-null object
Personal Injury            1251972 non-null object
Property Damage            1251972 non-null object
Fatal                      1251972 non-null object
Commercial License         1251972 non-null object
HAZMAT                     1251972 non-null object
Commercial Vehicle         1251972 non-null object
Alcohol                    1251972 non-null object
Work Zone         

## Create a Series from selected columns and set its index

In [5]:
# Load only the two columns needed, index the rows by "Charge", then collapse
# the remaining single-column DataFrame into a Series.
# The `squeeze=True` keyword of read_csv was deprecated in pandas 1.4 and
# removed in 2.0; DataFrame.squeeze("columns") is the documented replacement.
cid_to_subagency_ds = pd.read_csv(
    traffic_csv_path, index_col=["Charge"], usecols=["Charge", "SubAgency"]
).squeeze("columns")
print(cid_to_subagency_ds.head(6))

Charge
13-401(h)                          3rd district, Silver Spring
21-201(a1)                              2nd district, Bethesda
21-403(b)      6th district, Gaithersburg / Montgomery Village
21-402(b)                          3rd district, Silver Spring
21-405(e1)     6th district, Gaithersburg / Montgomery Village
21-901.1(b)                             2nd district, Bethesda
Name: SubAgency, dtype: object


In [6]:
# Lookup table from a SubAgency string to a short label; sub-agencies that
# are not listed here have no entry.
dist_dict = dict([
    ("3rd district, Silver Spring", "DIST A"),
    ("6th district, Gaithersburg / Montgomery Village", "DIST B"),
])

In [7]:
# Series.map with a dict looks up each value of `cid_to_subagency_ds`
# as a key in `dist_dict`. When the key exists, the element is replaced
# by the dict's value for it; when it does not, the element becomes NaN.
# The index ("Charge") is left unchanged.
cid_to_desc = cid_to_subagency_ds.map(dist_dict)
print(cid_to_desc.head(6))

Charge
13-401(h)      DIST A
21-201(a1)        NaN
21-403(b)      DIST B
21-402(b)      DIST A
21-405(e1)     DIST B
21-901.1(b)       NaN
Name: SubAgency, dtype: object


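This also works when the mapping is a Series instead of a dict, but that Series must have a unique index, because the index is what each value is looked up in. The example below won't work: it is indexed by SubAgency, and each sub-agency appears many times (there are many descriptions per sub-agency), so the lookup is ambiguous.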

In [8]:
#subagency_to_desc_ds = pd.read_csv(traffic_csv_path, index_col=["SubAgency"], usecols=["Description","SubAgency"], squeeze=True)
#print(subagency_to_desc_ds.head(6))
#cid_to_desc = cid_to_subagency_ds.map(subagency_to_desc_ds)
#print(cid_to_desc.head(6))

# won't work 
# > InvalidIndexError: Reindexing only valid with uniquely valued Index objects