# Capstone 1: Data Wrangling

### Data Acquisition: Get source data from NYC Open Data API

In [1]:
# Import packages and modules
import datetime
import os

import numpy as np
import pandas as pd

In [2]:
# Import Python library for Socrata Open Data API
from sodapy import Socrata

frames = []
batch_size = 500000

# Import dataset via context manager
for i in range(18):
    with Socrata("data.cityofnewyork.us", None) as client:
        results = client.get("66ae-7zpy", limit=batch_size,offset=i*batch_size)
    print('Preparing to load Batch {} to dataframe object...'.format(i+1))
    temp_df = pd.DataFrame.from_records(results)
    print('Batch {} loaded'.format(i+1))
    frames.append(temp_df)
    print('Dataframe object (df{}) appended to list\n'.format(i+1))
print('\nAll dataframe objects have been appended to list')



Preparing to load Batch 1 to dataframe object...




Batch 1 loaded
Dataframe object (df1) appended to list

Preparing to load Batch 2 to dataframe object...




Batch 2 loaded
Dataframe object (df2) appended to list

Preparing to load Batch 3 to dataframe object...




Batch 3 loaded
Dataframe object (df3) appended to list

Preparing to load Batch 4 to dataframe object...




Batch 4 loaded
Dataframe object (df4) appended to list

Preparing to load Batch 5 to dataframe object...




Batch 5 loaded
Dataframe object (df5) appended to list

Preparing to load Batch 6 to dataframe object...




Batch 6 loaded
Dataframe object (df6) appended to list

Preparing to load Batch 7 to dataframe object...




Batch 7 loaded
Dataframe object (df7) appended to list

Preparing to load Batch 8 to dataframe object...




Batch 8 loaded
Dataframe object (df8) appended to list

Preparing to load Batch 9 to dataframe object...




Batch 9 loaded
Dataframe object (df9) appended to list

Preparing to load Batch 10 to dataframe object...




Batch 10 loaded
Dataframe object (df10) appended to list

Preparing to load Batch 11 to dataframe object...




Batch 11 loaded
Dataframe object (df11) appended to list

Preparing to load Batch 12 to dataframe object...




Batch 12 loaded
Dataframe object (df12) appended to list

Preparing to load Batch 13 to dataframe object...




Batch 13 loaded
Dataframe object (df13) appended to list

Preparing to load Batch 14 to dataframe object...




Batch 14 loaded
Dataframe object (df14) appended to list

Preparing to load Batch 15 to dataframe object...




Batch 15 loaded
Dataframe object (df15) appended to list

Preparing to load Batch 16 to dataframe object...




Batch 16 loaded
Dataframe object (df16) appended to list

Preparing to load Batch 17 to dataframe object...




Batch 17 loaded
Dataframe object (df17) appended to list

Preparing to load Batch 18 to dataframe object...




Batch 18 loaded
Dataframe object (df18) appended to list

Preparing to load Batch 19 to dataframe object...
Batch 19 loaded
Dataframe object (df19) appended to list


All dataframe objects have been appended to list


### Data Cleansing: Define necessary functions

In [3]:
# Define function to drop immaterial rows and columns from dataframe object
def drop_from_df(dfObj):
    """
    Remove columns from dataframe object 
    that are not required for analysis
    """
    # Drop all rows with missing value for 'incident_disposition_code'
    dfObj.dropna(subset=['incident_disposition_code'],inplace=True)
    
    # Identify all columns that contain incident indicator data
    list_of_indicator_cols = [name for name in list(dfObj.columns) 
                              if 'indicator' in str(name).lower() and name !='held_indicator']
    
    # Drop all rows that pertain to outlier incidents
    for name in list_of_indicator_cols:
        index_names = dfObj[dfObj[name]=='Y'].index
        dfObj.drop(index_names, inplace=True)
    
    # Remove columns that contain incident indicator data
    dfObj.drop(list_of_indicator_cols,axis=1,inplace=True)
    
    # Identify and remove all columns that contain district or precinct data (geographic zones)
    list_of_zone_cols = [name for name in list(dfObj.columns) 
                         if ('district' in str(name).lower() or name=='policeprecinct')]
    dfObj.drop(list_of_zone_cols,axis=1,inplace=True)
    
    return dfObj

In [4]:
# Define function to modify dtypes of series in dataframe object
def modify_dtype(dfObj):
    """
    Change the dtype of select columns in 
    dataframe object based on its implied value
    """
    # Create list of all columns that contain ISO8601 datetime
    list_of_datetime_cols = [name for name in list(dfObj.columns) 
                             if 'datetime' in str(name).lower()]

    # Convert dtypes for each element in list to datetime
    for name in list_of_datetime_cols:
        dfObj[name]=pd.to_datetime(dfObj[name],errors='coerce')
       
    # Create list of all columns that contain time duration
    list_of_duration_cols = [name for name in list(dfObj.columns) 
                             if 'seconds' in str(name).lower()]

    # Convert dtypes for each element in list to numeric
    for name in list_of_duration_cols:
        dfObj[name]=pd.to_numeric(dfObj[name],errors='coerce')
        
    # Convert columns to category dtypes to reduce size of dataframe object
    dfObj['borough'] = dfObj.borough.astype('category')
    dfObj['held_indicator'] = dfObj.held_indicator.astype('category')
    dfObj['valid_dispatch_rspns_time_indc'] = dfObj.valid_dispatch_rspns_time_indc.astype('category')
    dfObj['valid_incident_rspns_time_indc'] = dfObj.valid_incident_rspns_time_indc.astype('category')
    dfObj['incident_dispatch_area'] = dfObj.incident_dispatch_area.astype('category')
        
    return dfObj

In [5]:
def redesign_df(dfObj):
    """
    Restructure dataframe object
    """
    # CREATE PANDAS SERIES FOR TARGET VARIABLE: fatality
    dfObj['fatality'] = np.logical_or(dfObj.incident_disposition_code == '83',\
                                   dfObj.incident_disposition_code == '96')

    # Create separate columns for the year and month of the incident
    dfObj['incident_year'] = pd.DatetimeIndex(dfObj.incident_datetime).year
    dfObj['incident_month'] = pd.DatetimeIndex(dfObj.incident_datetime).month

    # Reorder dataframe columns
    col_order = ['incident_year','incident_month','cad_incident_id',\
                 'incident_datetime','borough','zipcode',\
                 'incident_dispatch_area','held_indicator',\
                 'initial_call_type','initial_severity_level_code',\
                 'final_call_type','final_severity_level_code',\
                 'first_assignment_datetime','valid_dispatch_rspns_time_indc',\
                 'dispatch_response_seconds_qy','first_activation_datetime',\
                 'first_on_scene_datetime','valid_incident_rspns_time_indc',\
                 'incident_response_seconds_qy','incident_travel_tm_seconds_qy',\
                 'first_to_hosp_datetime','first_hosp_arrival_datetime',\
                 'incident_close_datetime',
                 'incident_disposition_code','fatality']
    dfObj=dfObj[col_order]
    
    # Create a multindex on incident_year and incident_month
    dfObj.set_index(['incident_year','incident_month','cad_incident_id'])

    return dfObj

### Data Cleansing: Apply functions to dataset

In [14]:
# Apply drop_from_df to all dataframe objects in frames
count = 1
for frame in frames:
    frame = drop_from_df(frame)
    print('Dataframe {}: Removed rows and columns'.format(count))
    count += 1

Dataframe 1: Removed rows and columns
Dataframe 2: Removed rows and columns
Dataframe 3: Removed rows and columns
Dataframe 4: Removed rows and columns
Dataframe 5: Removed rows and columns
Dataframe 6: Removed rows and columns
Dataframe 7: Removed rows and columns
Dataframe 8: Removed rows and columns
Dataframe 9: Removed rows and columns
Dataframe 10: Removed rows and columns
Dataframe 11: Removed rows and columns
Dataframe 12: Removed rows and columns
Dataframe 13: Removed rows and columns
Dataframe 14: Removed rows and columns
Dataframe 15: Removed rows and columns
Dataframe 16: Removed rows and columns
Dataframe 17: Removed rows and columns
Dataframe 18: Removed rows and columns


In [15]:
# Apply modify_dtype to all dataframe objects in frames
count = 1
for frame in frames:
    frame = modify_dtype(frame)
    print("Dataframe {}: Modified dtypes".format(count))
    count +=1

Dataframe 1: Modified dtypes
Dataframe 2: Modified dtypes
Dataframe 3: Modified dtypes
Dataframe 4: Modified dtypes
Dataframe 5: Modified dtypes
Dataframe 6: Modified dtypes
Dataframe 7: Modified dtypes
Dataframe 8: Modified dtypes
Dataframe 9: Modified dtypes
Dataframe 10: Modified dtypes
Dataframe 11: Modified dtypes
Dataframe 12: Modified dtypes
Dataframe 13: Modified dtypes
Dataframe 14: Modified dtypes
Dataframe 15: Modified dtypes
Dataframe 16: Modified dtypes
Dataframe 17: Modified dtypes
Dataframe 18: Modified dtypes


In [16]:
# Apply redesign_df to all dataframe objects in frames
count = 1
for frame in frames:
    frame = redesign_df(frame)
    print("Dataframe {}: Redesigned df object".format(count))
    count +=1

Dataframe 1: Redesigned df object
Dataframe 2: Redesigned df object
Dataframe 3: Redesigned df object
Dataframe 4: Redesigned df object
Dataframe 5: Redesigned df object
Dataframe 6: Redesigned df object
Dataframe 7: Redesigned df object
Dataframe 8: Redesigned df object
Dataframe 9: Redesigned df object
Dataframe 10: Redesigned df object
Dataframe 11: Redesigned df object
Dataframe 12: Redesigned df object
Dataframe 13: Redesigned df object
Dataframe 14: Redesigned df object
Dataframe 15: Redesigned df object
Dataframe 16: Redesigned df object
Dataframe 17: Redesigned df object
Dataframe 18: Redesigned df object


### Merge all dataframe objects

In [17]:
# Concatenate all dataframe objects
df = pd.concat(frames,ignore_index=True)
print('Concatenated all dataframe objects in frames')

Concatenated all dataframe objects in frames


In [21]:
# Preview the first rows of the merged dataframe
df.head()

Unnamed: 0,incident_year,incident_month,cad_incident_id,incident_datetime,borough,zipcode,incident_dispatch_area,held_indicator,initial_call_type,initial_severity_level_code,...,first_activation_datetime,first_on_scene_datetime,valid_incident_rspns_time_indc,incident_response_seconds_qy,incident_travel_tm_seconds_qy,first_to_hosp_datetime,first_hosp_arrival_datetime,incident_close_datetime,incident_disposition_code,fatality
0,2013,1,130010001,2013-01-01 00:00:04,BRONX,10472,B3,N,RESPIR,4,...,2013-01-01 00:01:51,2013-01-01 00:13:21,Y,797.0,696.0,2013-01-01 00:28:49,2013-01-01 00:38:15,2013-01-01 01:04:56,82,False
1,2013,1,130010002,2013-01-01 00:00:19,BRONX,10454,B1,N,CARD,3,...,2013-01-01 00:02:08,2013-01-01 00:14:30,Y,851.0,792.0,NaT,NaT,2013-01-01 00:55:34,93,False
2,2013,1,130010004,2013-01-01 00:01:04,QUEENS,11418,Q3,N,ARREST,1,...,2013-01-01 00:01:58,2013-01-01 00:08:13,Y,429.0,400.0,NaT,NaT,2013-01-01 00:38:05,83,True
3,2013,1,130010005,2013-01-01 00:01:16,BRONX,10453,B2,N,SICK,6,...,2013-01-01 00:02:55,2013-01-01 00:15:04,Y,828.0,772.0,2013-01-01 00:34:54,2013-01-01 00:53:02,2013-01-01 01:20:28,82,False
4,2013,1,130010006,2013-01-01 00:01:26,BRONX,10457,B2,N,INJURY,5,...,2013-01-01 00:02:55,2013-01-01 00:15:42,Y,856.0,824.0,2013-01-01 00:27:42,2013-01-01 00:31:13,2013-01-01 00:53:12,82,False


In [22]:
# (number of rows, number of columns) of the merged dataframe
df.shape

(8431649, 25)

In [23]:
# Column dtypes plus memory usage; 'deep' also measures the contents of
# object columns instead of estimating them
df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8431649 entries, 0 to 8431648
Data columns (total 25 columns):
incident_year                     int64
incident_month                    int64
cad_incident_id                   object
incident_datetime                 datetime64[ns]
borough                           object
zipcode                           object
incident_dispatch_area            object
held_indicator                    category
initial_call_type                 object
initial_severity_level_code       object
final_call_type                   object
final_severity_level_code         object
first_assignment_datetime         datetime64[ns]
valid_dispatch_rspns_time_indc    object
dispatch_response_seconds_qy      int64
first_activation_datetime         datetime64[ns]
first_on_scene_datetime           datetime64[ns]
valid_incident_rspns_time_indc    category
incident_response_seconds_qy      float64
incident_travel_tm_seconds_qy     float64
first_to_hosp_datetime          

In [25]:
# Export dataframe to CSV
print('Exporting dataframe to CSV...')
df.to_csv('data/clean_comp_df.csv',index=False,compression='gzip')
print('Dataframe successfully exported to CSV using \'gzip\' compression.')

Exporting dataframe to CSV...
Dataframe successfully exported to CSV using 'gzip' compression.
