In [1]:
import pandas as pd
import os

## Obtain a dataset from data.gov

https://catalog.data.gov/dataset

In [2]:
from six.moves import urllib

# where the datasets will be placed
ROOT_DATA = "../../../ROOT_DATA/data_gov/"

def fetch_data_from_URL(housing_url, file_name, sub_dir="tmp", root_path=ROOT_DATA):
    """Download a file into a local cache directory and return its path.

    The file is stored at ``<root_path>/<sub_dir>/<file_name>``. The target
    directory is created when missing. If a file already exists at that
    location the download is skipped and the existing path is returned.

    Parameters
    ----------
    housing_url : str
        URL of the resource to download.
    file_name : str
        Name to give the downloaded file inside the target directory.
    sub_dir : str, optional
        Sub-directory of ``root_path`` to place the file in (default "tmp").
    root_path : str, optional
        Base directory for all downloads (default ``ROOT_DATA``).

    Returns
    -------
    str
        Path of the local copy of the file.

    Raises
    ------
    Any exception raised by ``urllib.request.urlretrieve`` propagates to the
    caller; in that case nothing is left at the returned location.
    """
    placement_dir = os.path.join(root_path, sub_dir)
    if not os.path.isdir(placement_dir):
        os.makedirs(placement_dir)
    placement_path = os.path.join(placement_dir, file_name)
    # only download if not already present
    if not os.path.isfile(placement_path):
        # Download to a side file first: writing straight to placement_path
        # would leave a truncated file behind if the transfer is interrupted,
        # and the isfile() check above would then treat it as a valid cache
        # entry on every later call.
        partial_path = placement_path + ".part"
        try:
            urllib.request.urlretrieve(housing_url, partial_path)
            # placement_path does not exist at this point, so a plain rename
            # is sufficient to publish the finished download
            os.rename(partial_path, placement_path)
        finally:
            # after a successful rename the side file is gone and this is a
            # no-op; after a failure it removes the incomplete download
            if os.path.isfile(partial_path):
                os.remove(partial_path)
    return placement_path

In [8]:
# CSV export of the traffic-violations dataset (montgomerycountymd.gov);
# the file is stored under the "traffic" sub-directory of the data root
traffic_csv_path = fetch_data_from_URL(
    housing_url="https://data.montgomerycountymd.gov/api/views/4mse-ku6q/rows.csv?accessType=DOWNLOAD",
    file_name="traffic_violations.csv",
    sub_dir="traffic",
)

In [9]:
# read the entire CSV file into a dataframe
t_df = pd.read_csv(traffic_csv_path)

# summary of the dataframe
# `info()` prints the report itself and returns None, so wrapping it in
# print() would only append a stray "None" line to the output
t_df.info()

  interactivity=interactivity, compiler=compiler, result=result)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1251972 entries, 0 to 1251971
Data columns (total 35 columns):
Date Of Stop               1251972 non-null object
Time Of Stop               1251972 non-null object
Agency                     1251972 non-null object
SubAgency                  1251962 non-null object
Description                1251963 non-null object
Location                   1251970 non-null object
Latitude                   1158178 non-null float64
Longitude                  1158178 non-null float64
Accident                   1251972 non-null object
Belts                      1251972 non-null object
Personal Injury            1251972 non-null object
Property Damage            1251972 non-null object
Fatal                      1251972 non-null object
Commercial License         1251972 non-null object
HAZMAT                     1251972 non-null object
Commercial Vehicle         1251972 non-null object
Alcohol                    1251972 non-null object
Work Zone         

## Dropping NaN values

In [11]:
# Drop every row that has a NaN in at least one column.
# The defaults (axis=0, how="any") are spelled out here for clarity.
# `dropna` returns a new dataframe; `t_df` itself is left unchanged
# unless inplace=True is passed.
t_df.dropna(axis=0, how="any").info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1083727 entries, 1 to 1251313
Data columns (total 35 columns):
Date Of Stop               1083727 non-null object
Time Of Stop               1083727 non-null object
Agency                     1083727 non-null object
SubAgency                  1083727 non-null object
Description                1083727 non-null object
Location                   1083727 non-null object
Latitude                   1083727 non-null float64
Longitude                  1083727 non-null float64
Accident                   1083727 non-null object
Belts                      1083727 non-null object
Personal Injury            1083727 non-null object
Property Damage            1083727 non-null object
Fatal                      1083727 non-null object
Commercial License         1083727 non-null object
HAZMAT                     1083727 non-null object
Commercial Vehicle         1083727 non-null object
Alcohol                    1083727 non-null object
Work Zone         

In [13]:
# `how` defaults to 'any'; with 'all' a row is dropped only when every
# one of its values is NaN (this dataset contains no such rows, so the
# row count stays the same)
t_df.dropna(axis=0, how="all").info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1251972 entries, 0 to 1251971
Data columns (total 35 columns):
Date Of Stop               1251972 non-null object
Time Of Stop               1251972 non-null object
Agency                     1251972 non-null object
SubAgency                  1251962 non-null object
Description                1251963 non-null object
Location                   1251970 non-null object
Latitude                   1158178 non-null float64
Longitude                  1158178 non-null float64
Accident                   1251972 non-null object
Belts                      1251972 non-null object
Personal Injury            1251972 non-null object
Property Damage            1251972 non-null object
Fatal                      1251972 non-null object
Commercial License         1251972 non-null object
HAZMAT                     1251972 non-null object
Commercial Vehicle         1251972 non-null object
Alcohol                    1251972 non-null object
Work Zone         

## Drop columns that contain NaN values

In [14]:
# axis=1 drops columns instead of rows: every column that contains at
# least one NaN is removed, while all rows are kept.
# (`subset=[""]` would raise a KeyError because no column is named "".)
# The `how` parameter again applies ('any' by default, or 'all').
t_df.dropna(axis=1).info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1251972 entries, 0 to 1251971
Data columns (total 20 columns):
Date Of Stop               1251972 non-null object
Time Of Stop               1251972 non-null object
Agency                     1251972 non-null object
Accident                   1251972 non-null object
Belts                      1251972 non-null object
Personal Injury            1251972 non-null object
Property Damage            1251972 non-null object
Fatal                      1251972 non-null object
Commercial License         1251972 non-null object
HAZMAT                     1251972 non-null object
Commercial Vehicle         1251972 non-null object
Alcohol                    1251972 non-null object
Work Zone                  1251972 non-null object
VehicleType                1251972 non-null object
Violation Type             1251972 non-null object
Charge                     1251972 non-null object
Contributed To Accident    1251972 non-null object
Race                

## Subset: drop rows with NaN values only in specific columns

In [15]:
# Restrict the NaN check to the `Description` column: a row is dropped
# only when its `Description` value is NaN; NaN values in any other
# column do not cause the row to be removed.
t_df.dropna(axis=0, subset=["Description"]).info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1251963 entries, 0 to 1251971
Data columns (total 35 columns):
Date Of Stop               1251963 non-null object
Time Of Stop               1251963 non-null object
Agency                     1251963 non-null object
SubAgency                  1251953 non-null object
Description                1251963 non-null object
Location                   1251961 non-null object
Latitude                   1158171 non-null float64
Longitude                  1158171 non-null float64
Accident                   1251963 non-null object
Belts                      1251963 non-null object
Personal Injury            1251963 non-null object
Property Damage            1251963 non-null object
Fatal                      1251963 non-null object
Commercial License         1251963 non-null object
HAZMAT                     1251963 non-null object
Commercial Vehicle         1251963 non-null object
Alcohol                    1251963 non-null object
Work Zone         