# Capstone 1: Data Wrangling

### Get source data (from CSV)

In [66]:
# Import packages and modules
import pandas as pd
import numpy as np
import datetime
import sqlite3
from sqlalchemy import create_engine

import dask.dataframe as dd

In [67]:
# Assign file name
file_path = 'data/EMS_Incident_Dispatch_Data.csv'

# Read CSV data into a Pandas dataframe
chunk = 100000
for i in range(10):
    temp_df = pd.read_csv(file_path, header=0, nrows=chunk, skiprows=chunk*i)
#df = dd.read_csv(file_path)

cols = [name.lower() for name in list(df.columns)]
df.columns = cols

  interactivity=interactivity, compiler=compiler, result=result)


In [69]:
df.dtypes

cad_incident_id                     int64
incident_datetime                  object
initial_call_type                  object
initial_severity_level_code         int64
final_call_type                    object
final_severity_level_code           int64
first_assignment_datetime          object
valid_dispatch_rspns_time_indc     object
dispatch_response_seconds_qy        int64
first_activation_datetime          object
first_on_scene_datetime            object
valid_incident_rspns_time_indc     object
incident_response_seconds_qy      float64
incident_travel_tm_seconds_qy     float64
first_to_hosp_datetime             object
first_hosp_arrival_datetime        object
incident_close_datetime            object
held_indicator                     object
incident_disposition_code         float64
borough                            object
atom                               object
incident_dispatch_area             object
zipcode                           float64
policeprecinct                    

### Clean data

In [70]:
# Define function to modify dtypes of series in dataframe object
def modify_dtype(df):
    """
    Change the dtype of select columns in 
    dataframe object based on its implied value
    """
    
    # Create list of all columns that contain time duration
    list_of_duration_cols = [name for name in list(df.columns) \
                             if 'seconds' in str(name).lower()]

    # Convert dtypes for each element in list to numeric
    for name in list_of_duration_cols:
        df[name]=pd.to_numeric(df[name])
        
    # Create list of all columns that contain ISO8601 datetime
    list_of_datetime_cols = [name for name in list(df.columns) \
                             if 'datetime' in str(name).lower()]

    # Convert dtypes for each element in list to datetime
    for name in list_of_datetime_cols:
        df[name]=pd.to_datetime(df[name])
    
    return df

In [71]:
# Define function to drop columns from dataframe object
def drop_from_df(df):
    """
    Remove columns from dataframe object 
    that are not required for analysis
    """
    
    # Drop all rows with missing value for 'incident_disposition_code'
    df = df.dropna(subset=['incident_disposition_code'])
    
    # Drop all columns that contain district or precinct data (geographic zones)
    list_of_zone_cols = [name for name in list(df.columns) \
                         if ('district' in str(name).lower() or name=='policeprecinct')]
    df.drop(list_of_zone_cols,axis=1)
    
    # Drop all columns that contain incident indicator data
    list_of_indicator_cols = [name for name in list(df.columns) \
                              if 'indicator' in str(name).lower() and name !='held_indicator']
    df.drop(list_of_indicator_cols,axis=1)
    
    return df

In [72]:
# Clean dataframe object for analysis
df = modify_dtype(df)
df = drop_from_df(df)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 98782 entries, 0 to 99999
Data columns (total 32 columns):
cad_incident_id                   98782 non-null int64
incident_datetime                 98782 non-null datetime64[ns]
initial_call_type                 98782 non-null object
initial_severity_level_code       98782 non-null int64
final_call_type                   98782 non-null object
final_severity_level_code         98782 non-null int64
first_assignment_datetime         98782 non-null datetime64[ns]
valid_dispatch_rspns_time_indc    98782 non-null object
dispatch_response_seconds_qy      98782 non-null int64
first_activation_datetime         98571 non-null datetime64[ns]
first_on_scene_datetime           96683 non-null datetime64[ns]
valid_incident_rspns_time_indc    98782 non-null object
incident_response_seconds_qy      96679 non-null float64
incident_travel_tm_seconds_qy     96683 non-null float64
first_to_hosp_datetime            73498 non-null datetime64[ns]
first_hosp_ar

In [73]:
df.dtypes

cad_incident_id                            int64
incident_datetime                 datetime64[ns]
initial_call_type                         object
initial_severity_level_code                int64
final_call_type                           object
final_severity_level_code                  int64
first_assignment_datetime         datetime64[ns]
valid_dispatch_rspns_time_indc            object
dispatch_response_seconds_qy               int64
first_activation_datetime         datetime64[ns]
first_on_scene_datetime           datetime64[ns]
valid_incident_rspns_time_indc            object
incident_response_seconds_qy             float64
incident_travel_tm_seconds_qy            float64
first_to_hosp_datetime            datetime64[ns]
first_hosp_arrival_datetime       datetime64[ns]
incident_close_datetime           datetime64[ns]
held_indicator                            object
incident_disposition_code                float64
borough                                   object
atom                

### Restructure core dataframe object

In [76]:
# CREATE PANDAS SERIES FOR TARGET VARIABLE: patient_died
df['patient_died'] = np.logical_or(df['incident_disposition_code'] == 83,df['incident_disposition_code'] == 96)

# Create separate columns for the year and month of the incident
df['incident_year'] = pd.DatetimeIndex(df['incident_datetime']).year
df['incident_month'] = pd.DatetimeIndex(df['incident_datetime']).month

# Reorder dataframe columns
col_order = ['incident_year','incident_month','cad_incident_id',\
             'incident_datetime','borough','zipcode',\
             'incident_dispatch_area','held_indicator',\
             'initial_call_type','initial_severity_level_code',\
             'final_call_type','final_severity_level_code',\
             'first_assignment_datetime','valid_dispatch_rspns_time_indc',\
             'dispatch_response_seconds_qy','first_activation_datetime',\
             'first_on_scene_datetime','valid_incident_rspns_time_indc',\
             'incident_response_seconds_qy','incident_travel_tm_seconds_qy',\
             'first_to_hosp_datetime','first_hosp_arrival_datetime',\
             'incident_close_datetime',
             'incident_disposition_code','patient_died']
df=df[col_order]

# Create a multindex on incident_year and incident_month
df.set_index(['incident_year','incident_month'])

Unnamed: 0_level_0,Unnamed: 1_level_0,cad_incident_id,incident_datetime,borough,zipcode,incident_dispatch_area,held_indicator,initial_call_type,initial_severity_level_code,final_call_type,final_severity_level_code,...,first_activation_datetime,first_on_scene_datetime,valid_incident_rspns_time_indc,incident_response_seconds_qy,incident_travel_tm_seconds_qy,first_to_hosp_datetime,first_hosp_arrival_datetime,incident_close_datetime,incident_disposition_code,patient_died
incident_year,incident_month,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
2013,1,130010001,2013-01-01 00:00:04,BRONX,10472.0,B3,N,RESPIR,4,RESPIR,4,...,2013-01-01 00:01:51,2013-01-01 00:13:21,Y,797.0,696.0,2013-01-01 00:28:49,2013-01-01 00:38:15,2013-01-01 01:04:56,82.0,False
2013,1,130010002,2013-01-01 00:00:19,BRONX,10454.0,B1,N,CARD,3,CARD,3,...,2013-01-01 00:02:08,2013-01-01 00:14:30,Y,851.0,792.0,NaT,NaT,2013-01-01 00:55:34,93.0,False
2013,1,130010004,2013-01-01 00:01:04,QUEENS,11418.0,Q3,N,ARREST,1,ARREST,1,...,2013-01-01 00:01:58,2013-01-01 00:08:13,Y,429.0,400.0,NaT,NaT,2013-01-01 00:38:05,83.0,True
2013,1,130010005,2013-01-01 00:01:16,BRONX,10453.0,B2,N,SICK,6,SICK,6,...,2013-01-01 00:02:55,2013-01-01 00:15:04,Y,828.0,772.0,2013-01-01 00:34:54,2013-01-01 00:53:02,2013-01-01 01:20:28,82.0,False
2013,1,130010006,2013-01-01 00:01:26,BRONX,10457.0,B2,N,INJURY,5,INJURY,5,...,2013-01-01 00:02:55,2013-01-01 00:15:42,Y,856.0,824.0,2013-01-01 00:27:42,2013-01-01 00:31:13,2013-01-01 00:53:12,82.0,False
2013,1,130010007,2013-01-01 00:01:35,MANHATTAN,10017.0,M3,N,ASTHMB,2,ASTHMB,2,...,2013-01-01 00:01:59,2013-01-01 00:05:39,Y,244.0,228.0,NaT,NaT,2013-01-01 00:47:48,93.0,False
2013,1,130010008,2013-01-01 00:01:52,RICHMOND / STATEN ISLAND,10306.0,S2,N,INJURY,5,INJURY,5,...,2013-01-01 00:02:11,2013-01-01 00:08:07,Y,375.0,362.0,NaT,NaT,2013-01-01 00:41:52,93.0,False
2013,1,130010009,2013-01-01 00:02:11,RICHMOND / STATEN ISLAND,10301.0,S1,N,EDP,7,DRUG,4,...,2013-01-01 00:02:36,2013-01-01 00:09:19,Y,428.0,416.0,2013-01-01 00:31:05,2013-01-01 00:38:14,2013-01-01 01:19:11,82.0,False
2013,1,130010010,2013-01-01 00:02:56,MANHATTAN,10013.0,M1,N,UNC,2,UNC,2,...,2013-01-01 00:03:27,2013-01-01 00:08:19,Y,323.0,304.0,2013-01-01 00:40:43,2013-01-01 00:49:13,2013-01-01 01:27:51,82.0,False
2013,1,130010011,2013-01-01 00:03:12,BRONX,10467.0,B4,N,DIFFBR,2,DIFFBR,2,...,2013-01-01 00:03:45,2013-01-01 00:06:04,Y,172.0,147.0,2013-01-01 00:25:19,2013-01-01 00:33:26,2013-01-01 01:29:46,82.0,False


In [77]:
df['incident_disposition_code'].value_counts()

82.0    73428
93.0    10067
90.0     7282
87.0     3310
96.0     2769
91.0      964
83.0      860
94.0       93
95.0        8
92.0        1
Name: incident_disposition_code, dtype: int64

In [78]:
df['patient_died'].value_counts()

False    95153
True      3629
Name: patient_died, dtype: int64

In [None]:
# Export dataframe to CSV
df.to_csv('data/clean_df.csv',compression='gzip')

### Create dataframes for each month

In [82]:
# Define function to create month-specific dataframe
def getIncidentsByMonth(month,df):
    """
    Create a new dataframe from a core dataframe 
    sliced by the month where an incident occured
    """
    month_df = df.loc[(1,1),:]    # Select all records for specified month
    return month_df

In [84]:
jan_df = df.iloc[(slice(None),1):]
jan_df.tail()

TypeError: cannot do slice indexing on <class 'pandas.core.indexes.numeric.Int64Index'> with these indexers [(slice(None, None, None), 1)] of <class 'tuple'>