In [1]:
import pandas as pd
import os

## Obtain a dataset from data.gov

https://catalog.data.gov/dataset

In [2]:
from six.moves import urllib

# where the datasets will be placed
ROOT_DATA = "../../../ROOT_DATA/data_gov/"

def fetch_data_from_URL(housing_url, file_name, sub_dir="tmp", root_path=ROOT_DATA):
    """Download ``housing_url`` to ``root_path/sub_dir/file_name`` unless already cached.

    Parameters
    ----------
    housing_url : str
        URL handed to ``urllib.request.urlretrieve``.
    file_name : str
        Name of the file to create inside the placement directory.
    sub_dir : str, optional
        Sub-directory of ``root_path`` that holds the file (default ``"tmp"``).
    root_path : str, optional
        Base directory for downloads (default ``ROOT_DATA``).

    Returns
    -------
    str
        Path of the local file, whether it was just downloaded or already present.

    Raises
    ------
    Exception
        Whatever ``urlretrieve`` or the filesystem calls raise is propagated;
        in that case no file is left at the returned location.
    """
    placement_dir = os.path.join(root_path, sub_dir)
    try:
        os.makedirs(placement_dir)
    except OSError:
        # The directory already existing (possibly created concurrently) is
        # fine; anything else is a real error.
        if not os.path.isdir(placement_dir):
            raise

    placement_path = os.path.join(placement_dir, file_name)
    # only download if not already present
    if os.path.isfile(placement_path):
        return placement_path

    # Download next to the target and move into place only on success, so an
    # interrupted transfer can never be mistaken for a cached file later.
    partial_path = placement_path + ".part"
    # os.replace overwrites atomically where available; os.rename is the
    # fallback on interpreters that lack it.
    move_into_place = getattr(os, "replace", os.rename)
    try:
        urllib.request.urlretrieve(housing_url, partial_path)
        move_into_place(partial_path, placement_path)
    finally:
        # After a successful move the partial file is gone; after a failure
        # this removes the truncated leftovers.
        if os.path.exists(partial_path):
            os.remove(partial_path)
    return placement_path

In [3]:
# Traffic-violations CSV: fetched once, then reused from the "traffic" sub-directory
traffic_csv_url = "https://data.montgomerycountymd.gov/api/views/4mse-ku6q/rows.csv?accessType=DOWNLOAD"
traffic_csv_path = fetch_data_from_URL(
    traffic_csv_url,
    file_name="traffic_violations.csv",
    sub_dir="traffic",
)

In [4]:
# read entire file into a dataframe
t_df = pd.read_csv(traffic_csv_path)
# DataFrame.info() writes its summary to stdout itself and returns None, so
# wrapping it in print() only appends a stray "None" line after the summary.
# NOTE(review): the captured output for this cell starts with the tail of a
# warning; if it is pandas' mixed-type DtypeWarning, consider passing explicit
# dtype= (or low_memory=False) to read_csv -- confirm before changing.
t_df.info()

  interactivity=interactivity, compiler=compiler, result=result)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1251972 entries, 0 to 1251971
Data columns (total 35 columns):
Date Of Stop               1251972 non-null object
Time Of Stop               1251972 non-null object
Agency                     1251972 non-null object
SubAgency                  1251962 non-null object
Description                1251963 non-null object
Location                   1251970 non-null object
Latitude                   1158178 non-null float64
Longitude                  1158178 non-null float64
Accident                   1251972 non-null object
Belts                      1251972 non-null object
Personal Injury            1251972 non-null object
Property Damage            1251972 non-null object
Fatal                      1251972 non-null object
Commercial License         1251972 non-null object
HAZMAT                     1251972 non-null object
Commercial Vehicle         1251972 non-null object
Alcohol                    1251972 non-null object
Work Zone         

In [5]:
# Peek at the first few 'Year' values and their dtype
year_preview = t_df['Year'].head()
print(year_preview)

0    2008.0
1    2001.0
2    2001.0
3    1998.0
4    2015.0
Name: Year, dtype: float64


In [6]:
# Note: there cannot be any NaN values in the column before
# performing a conversion to an integer type.
print(t_df['Year'].count())  # non-null count before filling
# Assign the filled column back instead of calling fillna(inplace=True) on the
# selected column: the in-place form acts on an intermediate object, is
# deprecated, and leaves t_df unchanged under pandas Copy-on-Write.
t_df['Year'] = t_df['Year'].fillna(0)
print(t_df['Year'].count())  # non-null count after filling

1244022
1251972


In [7]:
# Preview the integer conversion without storing it on the frame
year_as_int = t_df['Year'].astype("int")
print(year_as_int.head())

0    2008
1    2001
2    2001
3    1998
4    2015
Name: Year, dtype: int64


In [8]:
# astype() returns a new Series (it has no inplace option), so the
# converted column has to be written back onto the frame explicitly.
converted_year = t_df['Year'].astype("int")
t_df['Year'] = converted_year

In [9]:
# Print the frame summary again, now that 'Year' has been reassigned with
# astype("int"), to check the column's reported dtype.
t_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1251972 entries, 0 to 1251971
Data columns (total 35 columns):
Date Of Stop               1251972 non-null object
Time Of Stop               1251972 non-null object
Agency                     1251972 non-null object
SubAgency                  1251962 non-null object
Description                1251963 non-null object
Location                   1251970 non-null object
Latitude                   1158178 non-null float64
Longitude                  1158178 non-null float64
Accident                   1251972 non-null object
Belts                      1251972 non-null object
Personal Injury            1251972 non-null object
Property Damage            1251972 non-null object
Fatal                      1251972 non-null object
Commercial License         1251972 non-null object
HAZMAT                     1251972 non-null object
Commercial Vehicle         1251972 non-null object
Alcohol                    1251972 non-null object
Work Zone         

In [10]:
# note that year is now an int

## Category

Reducing memory usage

In [11]:
# 'Color' is a string column with few distinct values
color_column = t_df['Color']
print(color_column.head())
print(color_column.nunique())  # distinct colors (26 in the captured output)

0     BLACK
1     GREEN
2    SILVER
3     WHITE
4     WHITE
Name: Color, dtype: object
26


In [12]:
# Store 'Color' as a categorical column instead of plain objects
color_as_category = t_df['Color'].astype("category")
t_df['Color'] = color_as_category

In [13]:
# Same leading rows as before, but the dtype line now reports the categories
color_preview = t_df['Color'].head()
print(color_preview)

0     BLACK
1     GREEN
2    SILVER
3     WHITE
4     WHITE
Name: Color, dtype: category
Categories (26, object): [BEIGE, BLACK, BLUE, BLUE, DARK, ..., SILVER, TAN, WHITE, YELLOW]


In [14]:
# Print the frame summary once more after converting 'Color' to the
# "category" dtype, to compare the reported memory usage with the earlier run.
t_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1251972 entries, 0 to 1251971
Data columns (total 35 columns):
Date Of Stop               1251972 non-null object
Time Of Stop               1251972 non-null object
Agency                     1251972 non-null object
SubAgency                  1251962 non-null object
Description                1251963 non-null object
Location                   1251970 non-null object
Latitude                   1158178 non-null float64
Longitude                  1158178 non-null float64
Accident                   1251972 non-null object
Belts                      1251972 non-null object
Personal Injury            1251972 non-null object
Property Damage            1251972 non-null object
Fatal                      1251972 non-null object
Commercial License         1251972 non-null object
HAZMAT                     1251972 non-null object
Commercial Vehicle         1251972 non-null object
Alcohol                    1251972 non-null object
Work Zone         

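**Our memory use is now 326.0+ MB, down from 334.3+ MB.** (The trailing `+` means pandas is reporting a lower bound, because the contents of object columns are not measured.)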

In [15]:
# List every column label of the frame
column_index = t_df.columns
print(column_index)

Index(['Date Of Stop', 'Time Of Stop', 'Agency', 'SubAgency', 'Description',
       'Location', 'Latitude', 'Longitude', 'Accident', 'Belts',
       'Personal Injury', 'Property Damage', 'Fatal', 'Commercial License',
       'HAZMAT', 'Commercial Vehicle', 'Alcohol', 'Work Zone', 'State',
       'VehicleType', 'Year', 'Make', 'Model', 'Color', 'Violation Type',
       'Charge', 'Article', 'Contributed To Accident', 'Race', 'Gender',
       'Driver City', 'Driver State', 'DL State', 'Arrest Type',
       'Geolocation'],
      dtype='object')


In [16]:
# Count the distinct values in every column: columns with only a handful
# of unique values are candidates for the category dtype, which would
# reduce memory further.
ll = []
for column_name in t_df.columns:
    distinct_count = t_df[column_name].nunique()
    ll.append((column_name, distinct_count))

for name_and_count in ll:
    print(name_and_count)

('Date Of Stop', 2249)
('Time Of Stop', 1440)
('Agency', 1)
('SubAgency', 7)
('Description', 12335)
('Location', 182472)
('Latitude', 280386)
('Longitude', 315383)
('Accident', 1)
('Belts', 2)
('Personal Injury', 2)
('Property Damage', 2)
('Fatal', 2)
('Commercial License', 2)
('HAZMAT', 2)
('Commercial Vehicle', 2)
('Alcohol', 2)
('Work Zone', 2)
('State', 69)
('VehicleType', 33)
('Year', 306)
('Make', 3481)
('Model', 17288)
('Color', 26)
('Violation Type', 4)
('Charge', 1024)
('Article', 2)
('Contributed To Accident', 2)
('Race', 6)
('Gender', 3)
('Driver City', 7168)
('Driver State', 68)
('DL State', 70)
('Arrest Type', 19)
('Geolocation', 613475)
