# Capstone 1: Data Wrangling

### Data Acquisition: Get source data from NYC Open Data API

In [None]:
# Import packages and modules
import pandas as pd
import numpy as np
import datetime

In [None]:
# Import Python library for Socrata Open Data API
from sodapy import Socrata

frames = []
batch_size = 1000000
max_batches = 9  # upper bound on the number of pages requested

# Download the dataset page by page.
# A single client (and its HTTP session) is opened once and reused for every
# page; the context manager closes it when the download is finished.
with Socrata("data.cityofnewyork.us", None) as client:
    for i in range(max_batches):
        # An explicit sort order keeps limit/offset paging stable; without it
        # the API does not guarantee that consecutive pages line up.
        results = client.get("66ae-7zpy", limit=batch_size,
                             offset=i*batch_size, order=":id")

        # An empty page means the dataset is exhausted. Skip it so that no
        # column-less dataframe ends up in frames.
        if not results:
            print('Batch {} returned no records; stopping download'.format(i+1))
            break

        print('Preparing to load Batch {} to dataframe object...'.format(i+1))
        temp_df = pd.DataFrame.from_records(results)
        print('Batch {} loaded'.format(i+1))
        frames.append(temp_df)
        print('Dataframe object (df{}) appended to list\n'.format(i+1))
print('\nAll dataframe objects have been appended to list')

### Data Cleansing: Define necessary functions

In [None]:
# Define function to drop immaterial rows and columns from dataframe object
def drop_from_df(dfObj):
    """
    Prune rows and columns that are not required for analysis.

    The dataframe object is modified in place and also returned.

    Steps, in order:
      1. drop rows with no 'incident_disposition_code'
      2. drop rows flagged 'Y' in any '*indicator*' column
         (except 'held_indicator')
      3. drop those indicator columns
      4. drop every '*district*' column and 'policeprecinct'
    """
    # Rows without a disposition code are discarded first
    dfObj.dropna(subset=['incident_disposition_code'], inplace=True)

    # Indicator columns, leaving 'held_indicator' untouched
    indicator_cols = [col for col in dfObj.columns
                      if col != 'held_indicator'
                      and 'indicator' in str(col).lower()]

    # Remove the flagged (outlier) incidents one indicator at a time
    for col in indicator_cols:
        flagged_rows = dfObj.loc[dfObj[col] == 'Y'].index
        dfObj.drop(index=flagged_rows, inplace=True)

    # The indicator columns themselves are no longer needed
    dfObj.drop(columns=indicator_cols, inplace=True)

    # Geographic zone columns (districts and police precinct) go as well
    zone_cols = [col for col in dfObj.columns
                 if col == 'policeprecinct' or 'district' in str(col).lower()]
    dfObj.drop(columns=zone_cols, inplace=True)

    return dfObj

In [None]:
# Define function to modify dtypes of series in dataframe object
def modify_dtype(dfObj):
    """
    Cast selected columns of the dataframe object to the dtype
    implied by their content.

    The dataframe object is modified in place and also returned:
      - columns with 'datetime' in their name -> datetime
      - columns with 'seconds' in their name  -> numeric
      - five fixed columns                    -> category
    """
    column_names = list(dfObj.columns)

    # Columns holding ISO8601 datetimes and columns holding durations
    datetime_cols = [col for col in column_names if 'datetime' in str(col).lower()]
    duration_cols = [col for col in column_names if 'seconds' in str(col).lower()]

    for col in datetime_cols:
        dfObj[col] = pd.to_datetime(dfObj[col])

    for col in duration_cols:
        dfObj[col] = pd.to_numeric(dfObj[col])

    # Category dtype reduces the size of the dataframe object
    category_cols = (
        'borough',
        'held_indicator',
        'valid_dispatch_rspns_time_indc',
        'valid_incident_rspns_time_indc',
        'incident_dispatch_area',
    )
    for col in category_cols:
        # attribute-style lookup of the column, then write back by key
        dfObj[col] = getattr(dfObj, col).astype('category')

    return dfObj

In [None]:
def redesign_df(dfObj):
    """
    Restructure dataframe object.

    Adds three derived columns to dfObj in place:
      - 'fatality': True where 'incident_disposition_code' is '83' or '96'
      - 'incident_year' / 'incident_month': taken from 'incident_datetime'

    Returns a NEW dataframe holding only the columns listed in col_order,
    in that order. The caller must keep the return value: the reordering
    and column selection are not applied to dfObj itself.

    No index is set here. The previous set_index call discarded its result,
    so it had no effect other than copying the frame; the returned frame
    keeps the index of dfObj.
    """
    # CREATE PANDAS SERIES FOR TARGET VARIABLE: fatality
    dfObj['fatality'] = dfObj['incident_disposition_code'].isin(['83', '96'])

    # Create separate columns for the year and month of the incident
    # (build the DatetimeIndex once and reuse it for both fields)
    incident_dt = pd.DatetimeIndex(dfObj['incident_datetime'])
    dfObj['incident_year'] = incident_dt.year
    dfObj['incident_month'] = incident_dt.month

    # Reorder dataframe columns; columns not listed here are left out
    col_order = [
        'incident_year', 'incident_month', 'cad_incident_id',
        'incident_datetime', 'borough', 'zipcode',
        'incident_dispatch_area', 'held_indicator',
        'initial_call_type', 'initial_severity_level_code',
        'final_call_type', 'final_severity_level_code',
        'first_assignment_datetime', 'valid_dispatch_rspns_time_indc',
        'dispatch_response_seconds_qy', 'first_activation_datetime',
        'first_on_scene_datetime', 'valid_incident_rspns_time_indc',
        'incident_response_seconds_qy', 'incident_travel_tm_seconds_qy',
        'first_to_hosp_datetime', 'first_hosp_arrival_datetime',
        'incident_close_datetime',
        'incident_disposition_code', 'fatality',
    ]
    return dfObj[col_order]

### Data Cleansing: Apply functions to dataset

In [None]:
# Apply drop_from_df to all dataframe objects in frames
for idx, frame in enumerate(frames):
    # Store the result back in the list: rebinding the loop variable alone
    # would not update frames if the function ever returned a new object.
    frames[idx] = drop_from_df(frame)
    print('Dataframe {}: Removed rows and columns'.format(idx + 1))

In [None]:
# Apply modify_dtype to all dataframe objects in frames
for idx, frame in enumerate(frames):
    # Store the result back in the list: rebinding the loop variable alone
    # would not update frames if the function ever returned a new object.
    frames[idx] = modify_dtype(frame)
    print("Dataframe {}: Modified dtypes".format(idx + 1))

In [None]:
# Apply redesign_df to all dataframe objects in frames
for idx, frame in enumerate(frames):
    # redesign_df returns a NEW, column-ordered dataframe, so the result has
    # to be written back into the list. Rebinding the loop variable would
    # discard it and leave the unordered frame in frames.
    frames[idx] = redesign_df(frame)
    print("Dataframe {}: Redesigned df object".format(idx + 1))

### Merge all dataframe objects

In [None]:
# Stack every dataframe object in frames into a single dataframe,
# renumbering the rows with a fresh 0..n-1 index
df = pd.concat(objs=frames, axis=0, ignore_index=True)
print('Concatenated all dataframe objects in frames')

In [None]:
# Show df with a MultiIndex on incident_year, incident_month, and cad_incident_id.
# NOTE: the result of set_index is not assigned (and inplace is not used), so
# this only produces the indexed frame as the cell's value; df itself keeps
# its existing index and the three fields remain ordinary columns.
df.set_index(['incident_year','incident_month','cad_incident_id'])

In [None]:
# Evaluate the (number of rows, number of columns) of the concatenated dataframe
df.shape

In [None]:
# Export dataframe to CSV
import os

output_path = 'data/clean_comp_df.csv'

# Make sure the target directory exists; to_csv does not create it
os.makedirs(os.path.dirname(output_path), exist_ok=True)

print('Exporting dataframe to CSV...')
# The file is gzip-compressed despite the .csv extension, so it must be read
# back with compression='gzip'. The index is not written.
df.to_csv(output_path,index=False,compression='gzip')
print('Dataframe successfully exported to CSV using \'gzip\' compression.')