# Capstone 1: Data Wrangling

### Data Acquisition: Get source data from NYC Open Data API

In [1]:
# Import packages and modules
import pandas as pd
import numpy as np
import datetime

In [2]:
# Import Python library for Socrata Open Data API
from sodapy import Socrata

# Import dataset via context manager 
with Socrata("data.cityofnewyork.us", None) as client:
    results = client.get("66ae-7zpy", limit=2000000,offset=0)

df = pd.DataFrame.from_records(results)



### Data Cleansing

In [3]:
# Define function to modify dtypes of series in dataframe object
def modify_dtype(df):
    """
    Change the dtype of select columns in 
    dataframe object based on its implied value
    """
    
    # Create list of all columns that contain time duration
    list_of_duration_cols = [name for name in list(df.columns) 
                             if 'seconds' in str(name).lower()]

    # Convert dtypes for each element in list to numeric
    for name in list_of_duration_cols:
        df[name]=pd.to_numeric(df[name])
        
    # Create list of all columns that contain ISO8601 datetime
    list_of_datetime_cols = [name for name in list(df.columns) 
                             if 'datetime' in str(name).lower()]

    # Convert dtypes for each element in list to datetime
    for name in list_of_datetime_cols:
        df[name]=pd.to_datetime(df[name])
    
    return df

In [4]:
# Define function to remove rows and columns from dataframe object
def drop_from_df(df):
    """
    Remove rows and columns from dataframe object 
    that are not required for analysis
    """
    
    # Drop all rows with missing value for 'incident_disposition_code'
    df.dropna(subset=['incident_disposition_code','incident_datetime'])
    
    # Drop all columns that contain district or precinct data (geographic zones)
    list_of_zone_cols = [name for name in list(df.columns) 
                         if ('district' in str(name).lower() or name=='policeprecinct')]
    df.drop(list_of_zone_cols,axis=1,inplace=True)
    
    # Drop all columns that contain incident indicator data
    list_of_indicator_cols = [name for name in list(df.columns) 
                              if 'indicator' in str(name).lower() and name !='held_indicator']
    df.drop(list_of_indicator_cols,axis=1,inplace=True)
    
    return df

In [5]:
# Define function to redesign dataframe object
def redesign_df(df):
    """
    Redesign dataframe object
    """
    # CREATE PANDAS SERIES FOR TARGET VARIABLE: patient_died
    df['patient_died'] = np.logical_or(df['incident_disposition_code'] == '83',\
                                   df['incident_disposition_code'] == '96')

    # Create separate columns for the year and month of the incident
    df['incident_year'] = pd.DatetimeIndex(df['incident_datetime']).year
    df['incident_month'] = pd.DatetimeIndex(df['incident_datetime']).month

    # Reorder dataframe columns
    col_order = ['incident_year','incident_month','cad_incident_id',\
                 'incident_datetime','borough','zipcode',\
                 'incident_dispatch_area','held_indicator',\
                 'initial_call_type','initial_severity_level_code',\
                 'final_call_type','final_severity_level_code',\
                 'first_assignment_datetime','valid_dispatch_rspns_time_indc',\
                 'dispatch_response_seconds_qy','first_activation_datetime',\
                 'first_on_scene_datetime','valid_incident_rspns_time_indc',\
                 'incident_response_seconds_qy','incident_travel_tm_seconds_qy',\
                 'first_to_hosp_datetime','first_hosp_arrival_datetime',\
                 'incident_close_datetime',
                 'incident_disposition_code','patient_died']
    df=df[col_order]
    
    return df

### Joining Data

In [6]:
# Clean dataframe object for analysis
df = modify_dtype(df)
print('Modified dtypes of series in dataframe object...')
df = drop_from_df(df)
print('Removed erroneous rows and columns from dataframe...')
df = redesign_df(df)
print('Redesigned dataframe...')

Modified dtypes of series in dataframe object...
Removed erroneous rows and columns from dataframe...
Dataframe redesigned...


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000000 entries, 0 to 1999999
Data columns (total 25 columns):
incident_year                     int64
incident_month                    int64
cad_incident_id                   object
incident_datetime                 datetime64[ns]
borough                           object
zipcode                           object
incident_dispatch_area            object
held_indicator                    object
initial_call_type                 object
initial_severity_level_code       object
final_call_type                   object
final_severity_level_code         object
first_assignment_datetime         datetime64[ns]
valid_dispatch_rspns_time_indc    object
dispatch_response_seconds_qy      int64
first_activation_datetime         datetime64[ns]
first_on_scene_datetime           datetime64[ns]
valid_incident_rspns_time_indc    object
incident_response_seconds_qy      float64
incident_travel_tm_seconds_qy     float64
first_to_hosp_datetime            da

In [8]:
# Create a multindex on incident_year and incident_month
df.set_index(['incident_year','incident_month'])

Unnamed: 0_level_0,Unnamed: 1_level_0,cad_incident_id,incident_datetime,borough,zipcode,incident_dispatch_area,held_indicator,initial_call_type,initial_severity_level_code,final_call_type,final_severity_level_code,...,first_activation_datetime,first_on_scene_datetime,valid_incident_rspns_time_indc,incident_response_seconds_qy,incident_travel_tm_seconds_qy,first_to_hosp_datetime,first_hosp_arrival_datetime,incident_close_datetime,incident_disposition_code,patient_died
incident_year,incident_month,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
2013,1,130010001,2013-01-01 00:00:04,BRONX,10472,B3,N,RESPIR,4,RESPIR,4,...,2013-01-01 00:01:51,2013-01-01 00:13:21,Y,797.0,696.0,2013-01-01 00:28:49,2013-01-01 00:38:15,2013-01-01 01:04:56,82,False
2013,1,130010002,2013-01-01 00:00:19,BRONX,10454,B1,N,CARD,3,CARD,3,...,2013-01-01 00:02:08,2013-01-01 00:14:30,Y,851.0,792.0,NaT,NaT,2013-01-01 00:55:34,93,False
2013,1,130010004,2013-01-01 00:01:04,QUEENS,11418,Q3,N,ARREST,1,ARREST,1,...,2013-01-01 00:01:58,2013-01-01 00:08:13,Y,429.0,400.0,NaT,NaT,2013-01-01 00:38:05,83,True
2013,1,130010005,2013-01-01 00:01:16,BRONX,10453,B2,N,SICK,6,SICK,6,...,2013-01-01 00:02:55,2013-01-01 00:15:04,Y,828.0,772.0,2013-01-01 00:34:54,2013-01-01 00:53:02,2013-01-01 01:20:28,82,False
2013,1,130010006,2013-01-01 00:01:26,BRONX,10457,B2,N,INJURY,5,INJURY,5,...,2013-01-01 00:02:55,2013-01-01 00:15:42,Y,856.0,824.0,2013-01-01 00:27:42,2013-01-01 00:31:13,2013-01-01 00:53:12,82,False
2013,1,130010007,2013-01-01 00:01:35,MANHATTAN,10017,M3,N,ASTHMB,2,ASTHMB,2,...,2013-01-01 00:01:59,2013-01-01 00:05:39,Y,244.0,228.0,NaT,NaT,2013-01-01 00:47:48,93,False
2013,1,130010008,2013-01-01 00:01:52,RICHMOND / STATEN ISLAND,10306,S2,N,INJURY,5,INJURY,5,...,2013-01-01 00:02:11,2013-01-01 00:08:07,Y,375.0,362.0,NaT,NaT,2013-01-01 00:41:52,93,False
2013,1,130010009,2013-01-01 00:02:11,RICHMOND / STATEN ISLAND,10301,S1,N,EDP,7,DRUG,4,...,2013-01-01 00:02:36,2013-01-01 00:09:19,Y,428.0,416.0,2013-01-01 00:31:05,2013-01-01 00:38:14,2013-01-01 01:19:11,82,False
2013,1,130010010,2013-01-01 00:02:56,MANHATTAN,10013,M1,N,UNC,2,UNC,2,...,2013-01-01 00:03:27,2013-01-01 00:08:19,Y,323.0,304.0,2013-01-01 00:40:43,2013-01-01 00:49:13,2013-01-01 01:27:51,82,False
2013,1,130010011,2013-01-01 00:03:12,BRONX,10467,B4,N,DIFFBR,2,DIFFBR,2,...,2013-01-01 00:03:45,2013-01-01 00:06:04,Y,172.0,147.0,2013-01-01 00:25:19,2013-01-01 00:33:26,2013-01-01 01:29:46,82,False


In [9]:
df['incident_disposition_code'].value_counts()

82    1430491
93     207216
90     163448
87      77672
96      64358
91      20769
83      14199
94       1723
95        183
92         12
Name: incident_disposition_code, dtype: int64

In [10]:
df['patient_died'].value_counts()

False    1921443
True       78557
Name: patient_died, dtype: int64

In [12]:
# Export dataframe to CSV
print('Exporting dataframe to CSV...')
df.to_csv('data/clean_comp_df.csv',index=False,compression='gzip')
print('Dataframe successfully exported to CSV.')