# Capstone 1: Data Wrangling (from API)

In [1]:
# Import packages and modules
import pandas as pd
import numpy as np
import datetime

from sodapy import Socrata # library for Socrata Open Data API (SODA)

### Data Cleansing: Define functions

REMOVE UNNECESSARY VALUES (**drop_errors** and **drop_immaterial**)

For the purpose of this analysis, any observation with a missing value for 'incident_disposition_code' must be omitted since the target variable is derived from this feature. In addition, observations that contain the following outliers, errors, or immaterial information must also be removed from the dataset:
+ incidents created to transport a patient from one facility to another
+ incidents where units were assigned to stand by in case they were needed
+ incidents that pertain to special events
+ incidents that were once closed but later reopened
+ incidents with calculation errors for duration metrics
+ features (columns) that contain redundant geographic information about the incident

In [2]:
def drop_errors(dfObj):
    """Remove observations that are unusable for the analysis.

    Rows are dropped, in this order, when they
      1. have no value for 'incident_disposition_code',
      2. hold 'N' in any column whose name contains 'valid', or
      3. have a 'dispatch_response_seconds_qy' equal to zero.

    Parameters
    ----------
    dfObj : pandas.DataFrame
        Incident records. Must contain 'incident_disposition_code' and
        'dispatch_response_seconds_qy'. The frame is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, with the offending rows removed.
    """
    # Drop all rows with missing value for 'incident_disposition_code'
    dfObj.dropna(subset=['incident_disposition_code'], inplace=True)

    # Identify all columns that validate duration metrics
    list_of_validation_cols = [name for name in dfObj.columns
                               if 'valid' in str(name)]

    # Drop all rows with invalid duration metrics
    for name in list_of_validation_cols:
        invalid_idx = dfObj.loc[dfObj[name] == 'N'].index
        dfObj.drop(invalid_idx, inplace=True)

    # Drop all rows where EMS were not dispatched.
    # This function runs on the raw API frame, whose columns are all text
    # (object dtype), so a plain `== 0` never matches the string '0'.
    # Compare on a numeric view instead; the stored column is left untouched
    # and values that are not numbers become NaN, which never equals zero.
    dispatch_seconds = pd.to_numeric(dfObj['dispatch_response_seconds_qy'],
                                     errors='coerce')
    no_disp_idx = dfObj.loc[dispatch_seconds == 0].index
    dfObj.drop(no_disp_idx, inplace=True)

    return dfObj

In [3]:
def drop_immaterial(dfObj):
    """Strip outlier incidents and columns that add nothing to the analysis.

    Every column whose name contains 'indicator' (except 'held_indicator')
    is treated as an outlier flag: rows holding 'Y' in any of them are
    removed, and then the flag columns themselves are removed. Finally, all
    columns whose name contains 'district', plus 'policeprecinct', are removed.

    Parameters
    ----------
    dfObj : pandas.DataFrame
        Incident records. The frame is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    # Collect the outlier flag columns; 'held_indicator' is deliberately kept
    flag_columns = []
    for column in dfObj.columns:
        if column != 'held_indicator' and 'indicator' in str(column):
            flag_columns.append(column)

    # Discard every row flagged 'Y' in any of those columns
    for column in flag_columns:
        flagged_rows = dfObj.loc[dfObj[column] == 'Y'].index
        dfObj.drop(index=flagged_rows, inplace=True)

    # The flag columns are dropped once the flagged rows are gone
    dfObj.drop(columns=flag_columns, inplace=True)

    # Drop the district columns and the police precinct (redundant geography)
    zone_columns = [column for column in dfObj.columns
                    if column == 'policeprecinct' or 'district' in str(column)]
    dfObj.drop(columns=zone_columns, inplace=True)

    return dfObj

REDUCE SIZE OF DATAFRAME (**reduce_memory**)

Modifying the data types for values contained within select columns will drastically reduce the memory usage of the dataframe object.

In [4]:
def reduce_memory(dfObj):
    """Cast columns to more compact dtypes to shrink the dataframe.

    Steps, in order:
      * the borough label 'RICHMOND / STATEN ISLAND' becomes 'STATEN ISLAND';
      * columns whose name contains 'datetime' are parsed with
        ``pd.to_datetime`` (unparseable values become NaT);
      * columns whose name contains 'seconds', 'severity' or 'cad' are parsed
        with ``pd.to_numeric`` (unparseable values become NaN);
      * a fixed set of label columns is cast to the 'category' dtype.

    Parameters
    ----------
    dfObj : pandas.DataFrame
        Incident records containing the columns named in the body. The
        frame's columns are overwritten in place.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    # Shorten the verbose borough label
    dfObj['borough'] = dfObj['borough'].replace('RICHMOND / STATEN ISLAND',
                                                'STATEN ISLAND')

    # Parse every timestamp column
    datetime_columns = [column for column in dfObj.columns
                        if 'datetime' in str(column)]
    for column in datetime_columns:
        dfObj[column] = pd.to_datetime(dfObj[column], errors='coerce')

    # Parse every column holding durations, severity codes or the CAD id
    numeric_markers = ('seconds', 'severity', 'cad')
    numeric_columns = [column for column in dfObj.columns
                       if any(marker in str(column)
                              for marker in numeric_markers)]
    for column in numeric_columns:
        dfObj[column] = pd.to_numeric(dfObj[column], errors='coerce')

    # Store the label columns as categoricals to reduce the frame's size
    category_columns = ('borough',
                        'zipcode',
                        'held_indicator',
                        'valid_dispatch_rspns_time_indc',
                        'valid_incident_rspns_time_indc',
                        'incident_dispatch_area',
                        'incident_disposition_code')
    for column in category_columns:
        dfObj[column] = dfObj[column].astype('category')

    return dfObj

REDESIGN THE DATAFRAME (**format_df**)

Construct a boolean series that represents the target variable (fatality) using the corresponding values in 'incident_disposition_code'. Also, apply aesthetic changes to help improve the readability of the dataframe object.

In [5]:
def format_df(dfObj):
    """Add the target and date-part columns, then put columns in a fixed order.

    Three columns are added to the frame that was passed in:
      * 'fatality' -- True exactly where 'incident_disposition_code', read as
        an integer, equals 83 or 96;
      * 'incident_year' and 'incident_month' -- taken from 'incident_datetime'.

    Parameters
    ----------
    dfObj : pandas.DataFrame
        Incident records containing every column listed in the ordering below
        (other than the three columns created here).

    Returns
    -------
    pandas.DataFrame
        A frame restricted to, and ordered by, the fixed column list. Columns
        not in the list are left out.
    """
    # Target variable: disposition code 83 or 96
    disposition = dfObj['incident_disposition_code'].astype('int64')
    dfObj['fatality'] = (disposition == 83) | (disposition == 96)

    # Year and month of the incident, as separate columns
    incident_dates = pd.DatetimeIndex(dfObj['incident_datetime'])
    dfObj['incident_year'] = incident_dates.year
    dfObj['incident_month'] = incident_dates.month

    # Final column layout
    ordered_columns = [
        'incident_year',
        'incident_month',
        'cad_incident_id',
        'incident_datetime',
        'borough',
        'zipcode',
        'initial_call_type',
        'initial_severity_level_code',
        'final_call_type',
        'final_severity_level_code',
        'held_indicator',
        'first_assignment_datetime',
        'incident_dispatch_area',
        'valid_dispatch_rspns_time_indc',
        'dispatch_response_seconds_qy',
        'first_activation_datetime',
        'first_on_scene_datetime',
        'incident_travel_tm_seconds_qy',
        'valid_incident_rspns_time_indc',
        'incident_response_seconds_qy',
        'first_to_hosp_datetime',
        'first_hosp_arrival_datetime',
        'incident_close_datetime',
        'incident_disposition_code',
        'fatality',
    ]
    return dfObj[ordered_columns]

### Data Acquisition: Inspect sample of source data

In [6]:
# Fetch a 1,000-record sample of the dataset for inspection; the context
# manager releases the API client as soon as the request has returned
with Socrata("data.cityofnewyork.us", None) as client:
    results = client.get("66ae-7zpy", limit=1000)

# Build the preview frame from the returned records
preview_df = pd.DataFrame.from_records(results)



In [7]:
# Dimensions of the sample as (rows, columns)
preview_df.shape

(1000, 31)

In [8]:
# Show the first five records of the sample
preview_df.head()

Unnamed: 0,borough,cad_incident_id,citycouncildistrict,communitydistrict,communityschooldistrict,congressionaldistrict,dispatch_response_seconds_qy,final_call_type,final_severity_level_code,first_activation_datetime,...,initial_call_type,initial_severity_level_code,policeprecinct,reopen_indicator,special_event_indicator,standby_indicator,transfer_indicator,valid_dispatch_rspns_time_indc,valid_incident_rspns_time_indc,zipcode
0,BRONX,130010001,18,209,12,15,101,RESPIR,4,2013-01-01T00:01:51.000,...,RESPIR,4,43,N,N,N,N,Y,Y,10472
1,BRONX,130010002,8,201,7,15,59,CARD,3,2013-01-01T00:02:08.000,...,CARD,3,40,N,N,N,N,Y,Y,10454
2,QUEENS,130010004,29,409,27,5,29,ARREST,1,2013-01-01T00:01:58.000,...,ARREST,1,102,N,N,N,N,Y,Y,11418
3,BRONX,130010005,14,205,10,15,56,SICK,6,2013-01-01T00:02:55.000,...,SICK,6,46,N,N,N,N,Y,Y,10453
4,BRONX,130010006,15,206,10,15,32,INJURY,5,2013-01-01T00:02:55.000,...,INJURY,5,48,N,N,N,N,Y,Y,10457


In [9]:
# List each column's non-null count and dtype in the raw sample
preview_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 31 columns):
borough                           1000 non-null object
cad_incident_id                   1000 non-null object
citycouncildistrict               985 non-null object
communitydistrict                 985 non-null object
communityschooldistrict           985 non-null object
congressionaldistrict             985 non-null object
dispatch_response_seconds_qy      1000 non-null object
final_call_type                   1000 non-null object
final_severity_level_code         1000 non-null object
first_activation_datetime         932 non-null object
first_assignment_datetime         935 non-null object
first_hosp_arrival_datetime       513 non-null object
first_on_scene_datetime           890 non-null object
first_to_hosp_datetime            522 non-null object
held_indicator                    1000 non-null object
incident_close_datetime           1000 non-null object
incident_datetime      

### Data Acquisition: Obtain complete dataset from NYC Open Data API

In [10]:
# Import Python library for Socrata Open Data API
from sodapy import Socrata

# Download the complete dataset page by page. Each page is cleaned before it
# is stored, so only the reduced frames are kept in memory.
frames = []
batch_size = 500000

# Import dataset via context manager
with Socrata("data.cityofnewyork.us", None) as client:
    i = 0
    while True:
        # Offset paging is only reliable with a stable sort order, so order
        # by the dataset's internal row id.
        results = client.get("66ae-7zpy", limit=batch_size,
                             offset=i*batch_size, order=':id')

        # An empty page means every record has been retrieved. Stopping here
        # (instead of after a fixed number of pages) keeps the download
        # complete whatever the current size of the dataset is.
        if not results:
            break

        temp_df = pd.DataFrame.from_records(results)
        print('Batch {} loaded to dataframe object...'.format(i+1))

        # Apply all data cleansing functions
        temp_df = drop_errors(temp_df)
        temp_df = drop_immaterial(temp_df)
        temp_df = reduce_memory(temp_df)
        temp_df = format_df(temp_df)
        print('Dataframe object (df{}) cleaned'.format(i+1))

        # Append cleaned temp_df to list
        frames.append(temp_df)
        print('Dataframe object (df{}) appended to list\n'.format(i+1))

        i += 1

print('\nAll dataframe objects have been appended to list')



Batch 1 loaded to dataframe object...
Dataframe object (df1) cleaned
Dataframe object (df1) appended to list

Batch 2 loaded to dataframe object...
Dataframe object (df2) cleaned
Dataframe object (df2) appended to list

Batch 3 loaded to dataframe object...
Dataframe object (df3) cleaned
Dataframe object (df3) appended to list

Batch 4 loaded to dataframe object...
Dataframe object (df4) cleaned
Dataframe object (df4) appended to list

Batch 5 loaded to dataframe object...
Dataframe object (df5) cleaned
Dataframe object (df5) appended to list

Batch 6 loaded to dataframe object...
Dataframe object (df6) cleaned
Dataframe object (df6) appended to list

Batch 7 loaded to dataframe object...
Dataframe object (df7) cleaned
Dataframe object (df7) appended to list

Batch 8 loaded to dataframe object...
Dataframe object (df8) cleaned
Dataframe object (df8) appended to list

Batch 9 loaded to dataframe object...
Dataframe object (df9) cleaned
Dataframe object (df9) appended to list

Batch 10 l

A downloadable description of each dataset field is available at https://data.cityofnewyork.us/Public-Safety/EMS-Incident-Dispatch-Data/76xm-jjuj in the _Attachments_ section under the file name **EMS_incident_dispatch_data_description.xlsx**. 

### Merge all dataframe objects

In [11]:
# Concatenate all dataframe objects
# ignore_index=True discards the row labels of the individual batches and
# numbers the rows of the combined frame from 0.
df = pd.concat(frames,ignore_index=True)
print('Concatenated all dataframe objects in frames')

Concatenated all dataframe objects in frames


In [12]:
# Re-apply the category dtype to the label columns of the combined frame
# (memory reduction).
# NOTE(review): presumably needed because concatenating per-batch
# categoricals whose category sets differ yields object columns -- confirm.
for label_col in ('borough', 'zipcode', 'incident_dispatch_area'):
    df[label_col] = df[label_col].astype('category')

In [13]:
# Create a MultiIndex on 'incident_year', 'incident_month' and 'cad_incident_id'
# The three columns are moved out of the column set and into the index, so
# this cell cannot be re-run on the same `df` (the columns no longer exist).
df.set_index(['incident_year','incident_month','cad_incident_id'],inplace=True)

### Inspect clean dataframe

In [14]:
# Dimensions of the clean frame as (rows, columns); index levels are not counted
df.shape

(8228125, 22)

In [15]:
# Show the first five records of the clean frame
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,incident_datetime,borough,zipcode,initial_call_type,initial_severity_level_code,final_call_type,final_severity_level_code,held_indicator,first_assignment_datetime,incident_dispatch_area,...,first_activation_datetime,first_on_scene_datetime,incident_travel_tm_seconds_qy,valid_incident_rspns_time_indc,incident_response_seconds_qy,first_to_hosp_datetime,first_hosp_arrival_datetime,incident_close_datetime,incident_disposition_code,fatality
incident_year,incident_month,cad_incident_id,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
2013,1,130010001,2013-01-01 00:00:04,BRONX,10472,RESPIR,4,RESPIR,4,N,2013-01-01 00:01:45,B3,...,2013-01-01 00:01:51,2013-01-01 00:13:21,696.0,Y,797,2013-01-01 00:28:49,2013-01-01 00:38:15,2013-01-01 01:04:56,82,False
2013,1,130010002,2013-01-01 00:00:19,BRONX,10454,CARD,3,CARD,3,N,2013-01-01 00:01:18,B1,...,2013-01-01 00:02:08,2013-01-01 00:14:30,792.0,Y,851,NaT,NaT,2013-01-01 00:55:34,93,False
2013,1,130010004,2013-01-01 00:01:04,QUEENS,11418,ARREST,1,ARREST,1,N,2013-01-01 00:01:33,Q3,...,2013-01-01 00:01:58,2013-01-01 00:08:13,400.0,Y,429,NaT,NaT,2013-01-01 00:38:05,83,True
2013,1,130010005,2013-01-01 00:01:16,BRONX,10453,SICK,6,SICK,6,N,2013-01-01 00:02:12,B2,...,2013-01-01 00:02:55,2013-01-01 00:15:04,772.0,Y,828,2013-01-01 00:34:54,2013-01-01 00:53:02,2013-01-01 01:20:28,82,False
2013,1,130010006,2013-01-01 00:01:26,BRONX,10457,INJURY,5,INJURY,5,N,2013-01-01 00:01:58,B2,...,2013-01-01 00:02:55,2013-01-01 00:15:42,824.0,Y,856,2013-01-01 00:27:42,2013-01-01 00:31:13,2013-01-01 00:53:12,82,False


In [16]:
# Full per-column summary: non-null counts, dtypes and deep memory usage
# NOTE(review): the `null_counts` keyword was deprecated in pandas 1.2 in
# favour of `show_counts` and removed in pandas 2.0 -- rename it when running
# on a newer pandas.
df.info(verbose=True,null_counts=True,memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 8228125 entries, (2013, 1, 130010001) to (2018, 12, 183654386)
Data columns (total 22 columns):
incident_datetime                 8228125 non-null datetime64[ns]
borough                           8228125 non-null category
zipcode                           8078200 non-null category
initial_call_type                 8228125 non-null object
initial_severity_level_code       8228125 non-null int64
final_call_type                   8228125 non-null object
final_severity_level_code         8228125 non-null int64
held_indicator                    8228125 non-null category
first_assignment_datetime         8228125 non-null datetime64[ns]
incident_dispatch_area            8228125 non-null category
valid_dispatch_rspns_time_indc    8228125 non-null category
dispatch_response_seconds_qy      8228125 non-null int64
first_activation_datetime         8219527 non-null datetime64[ns]
first_on_scene_datetime           8228125 non-null datetime64[ns]
inc

In [17]:
# Memory used by the index and by each column, in bytes
df.memory_usage(deep=True)

Index                             115194263
incident_datetime                  65825000
borough                             8228675
zipcode                            16481184
initial_call_type                 509515230
initial_severity_level_code        65825000
final_call_type                   509611354
final_severity_level_code          65825000
held_indicator                      8228337
first_assignment_datetime          65825000
incident_dispatch_area              8231529
valid_dispatch_rspns_time_indc      8228271
dispatch_response_seconds_qy       65825000
first_activation_datetime          65825000
first_on_scene_datetime            65825000
incident_travel_tm_seconds_qy      65825000
valid_incident_rspns_time_indc      8228271
incident_response_seconds_qy       65825000
first_to_hosp_datetime             65825000
first_hosp_arrival_datetime        65825000
incident_close_datetime            65825000
incident_disposition_code           8229035
fatality                        

### Export dataframe to CSV

In [18]:
# Export dataframe to CSV
# NOTE(review): `df` was re-indexed on 'incident_year', 'incident_month' and
# 'cad_incident_id' in an earlier cell, so index=False leaves those three
# fields out of the exported file -- confirm this is intended.
# NOTE(review): the file is gzip-compressed but its name ends in '.csv', so
# readers have to pass compression='gzip' explicitly; pandas cannot infer the
# compression from this extension.
output_path = '../data/clean_EMS_data_from_api.csv'
print('Exporting dataframe to CSV...')
df.to_csv(output_path,index=False,compression='gzip')
print('Dataframe successfully exported to CSV using \'gzip\' compression.')

Exporting dataframe to CSV...
Dataframe successfully exported to CSV using 'gzip' compression.


###### Summary
The original dataset comprised 8,557,848 observations of mixed data types. The target variable ("fatality") was created by applying a boolean filter to the "incident_disposition_code" column of the dataframe, which indicates the outcome of each EMS incident. 

After all data pre-processing was complete, the resulting clean dataset comprised 8,228,125 observations of mixed data types, with a clear target variable and 21 predictor variables. The gzip-compressed output file occupied 292 MB of hard disk space, and the dataframe occupied 1.9 GB of system memory.