In [1]:
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from IPython.core.display import display, HTML
%matplotlib inline

In [2]:
import re
import yaml
import geopandas as gpd
import requests
import shutil
import pprint
from urllib.request import urlretrieve
from datetime import datetime
from zipfile import ZipFile

# Notebook presentation: seaborn defaults for plots, no column truncation when
# displaying DataFrames, and a full-width notebook container.
sns.set()
pd.set_option('display.max_columns', None)
display(HTML("<style>.container { width:100% !important; }</style>"))

# Directory (relative to this notebook) where raw CSV downloads are stored.
DATA_DIR = os.path.join('..', 'data_raw')

In [51]:
from typing import (
    Dict,
#     List,
#     Tuple,
#     Set,
#     Deque,
#     NamedTuple,
#     IO,
#     Pattern,
#     Match,
#     Text,
#     Optional,
#     Sequence,
#     Iterable,
#     Mapping,
#     MutableMapping,
#     Any,
)

In [3]:
def ensure_dir_exists(DIR_PATH):
    """Create the directory ``DIR_PATH`` (and any missing parents) if needed.

    Args:
        DIR_PATH: path of the directory that must exist after the call.

    Raises:
        FileExistsError: if ``DIR_PATH`` exists but is not a directory.
    """
    # exist_ok=True removes the check-then-create race: a directory created by
    # someone else between an isdir() check and makedirs() no longer raises.
    os.makedirs(DIR_PATH, exist_ok=True)
        
def freq_label_selector(freq):
    """Translate a frequency code into a human-readable period label.

    The comparison is case-insensitive: 'm' gives 'Month', 'q' gives
    'Quarter', and every other string falls back to 'Year'.
    """
    labels_by_code = {'m': 'Month', 'q': 'Quarter'}
    return labels_by_code.get(freq.lower(), 'Year')

In [4]:
def get_nypd_data_table_urls():
    """Return the mapping of NYPD data table names to CSV download URLs.

    The mapping is persisted in ``../yaml_files/nypd_data_urls.yaml``. When
    that file does not exist yet it is created from the defaults below. In
    both cases the returned value is whatever is read back from the YAML
    file, so edits made to the file are picked up on the next call.

    Returns:
        The object loaded from the YAML file (with the defaults below, a
        dict of data table name -> download URL).
    """
    YAML_DIR = os.path.join('..', 'yaml_files')
    ensure_dir_exists(YAML_DIR)
    yaml_path = os.path.join(YAML_DIR, 'nypd_data_urls.yaml')

    if not os.path.isfile(yaml_path):
        print('Making first yaml data file')
        # Initial data load
        nyc_data_urls = {
            'nypd_shooting_historic':'https://data.cityofnewyork.us/api/views/833y-fsy8/rows.csv?accessType=DOWNLOAD',
            'nypd_shooting_ytd':'https://data.cityofnewyork.us/api/views/5ucz-vwe8/rows.csv?accessType=DOWNLOAD',
            'nypd_arrest_historic':'https://data.cityofnewyork.us/api/views/8h9b-rp9u/rows.csv?accessType=DOWNLOAD',
            'nypd_arrest_ytd':'https://data.cityofnewyork.us/api/views/uip8-fykc/rows.csv?accessType=DOWNLOAD',
            'nypd_complaint_historic':'https://data.cityofnewyork.us/api/views/qgea-i56i/rows.csv?accessType=DOWNLOAD',
            'nypd_complaint_ytd':'https://data.cityofnewyork.us/api/views/5uac-w243/rows.csv?accessType=DOWNLOAD',
        }
        # The context manager guarantees the file is flushed and closed
        # before it is read back below (no reliance on garbage collection).
        with open(yaml_path, 'w') as yaml_file:
            yaml.dump(data=nyc_data_urls, stream=yaml_file)
    with open(yaml_path, 'r') as yaml_file:
        nyc_data_urls = yaml.load(stream=yaml_file, Loader=yaml.FullLoader)
    return nyc_data_urls

In [5]:
nyc_data_urls = get_nypd_data_table_urls()

In [6]:
nyc_data_urls

{'nypd_arrest_historic': 'https://data.cityofnewyork.us/api/views/8h9b-rp9u/rows.csv?accessType=DOWNLOAD',
 'nypd_arrest_ytd': 'https://data.cityofnewyork.us/api/views/uip8-fykc/rows.csv?accessType=DOWNLOAD',
 'nypd_complaint_historic': 'https://data.cityofnewyork.us/api/views/qgea-i56i/rows.csv?accessType=DOWNLOAD',
 'nypd_complaint_ytd': 'https://data.cityofnewyork.us/api/views/5uac-w243/rows.csv?accessType=DOWNLOAD',
 'nypd_shooting_historic': 'https://data.cityofnewyork.us/api/views/833y-fsy8/rows.csv?accessType=DOWNLOAD',
 'nypd_shooting_ytd': 'https://data.cityofnewyork.us/api/views/5ucz-vwe8/rows.csv?accessType=DOWNLOAD'}

In [7]:
def get_age_of_data_pull(data_table_name: str, DATA_DIR: str = DATA_DIR, VERBOSE: bool = False) -> float:
    """Return the age, in days, of the most recent raw-data pull of a table.

    The age is derived from the modification time of
    ``<DATA_DIR>/<data_table_name>.csv``. ``DATA_DIR`` is created if it does
    not exist yet.

    Args:
        data_table_name: name of the data table (file name without ``.csv``).
        DATA_DIR: directory holding the raw CSV files.
        VERBOSE: when True, print the age in days and in years.

    Returns:
        The file's age in days, or 36500 (100 years) if the file does not
        exist, so that a missing file always counts as stale.
    """
    dir_path = os.path.join(DATA_DIR)
    file_path = os.path.join(dir_path, f"{data_table_name}.csv")
    ensure_dir_exists(dir_path)
    if os.path.isfile(file_path):
        file_datetime = datetime.fromtimestamp(os.path.getmtime(file_path))
        # total_seconds() covers the whole interval. timedelta.seconds is only
        # the sub-day remainder (0..86399), which made every file look less
        # than one day old no matter how long ago it was downloaded.
        file_age = (datetime.now() - file_datetime).total_seconds() / (24 * 60 * 60)
    else:
        file_age = 100 * 365
    if VERBOSE:
        print(f"{file_age} (days)")
        print(f"{file_age / 365} (years)")
    return file_age

In [8]:
get_age_of_data_pull(data_table_name = 'nypd_shooting_historic')

0.9603472222222222

In [16]:
def pull_raw_data_table(data_table_name: str, max_days_before_update: int = 7, 
                        DATA_DIR: str = DATA_DIR) -> None:
    """Download a data table's CSV unless a recent enough copy already exists.

    Args:
        data_table_name: key of the table in the URL mapping returned by
            ``get_nypd_data_table_urls()``.
        max_days_before_update: the file is re-downloaded only when the local
            copy is older than this many days (or is missing).
        DATA_DIR: directory where ``<data_table_name>.csv`` is stored.

    Returns:
        None. The outcome is reported through printed messages.
    """
    data_urls = get_nypd_data_table_urls()
    if data_table_name not in data_urls:
        print(f'No known URL for the (alleged) data table with the name {data_table_name}.')
        return

    file_path = os.path.join(DATA_DIR, f'{data_table_name}.csv')
    # Forward DATA_DIR so the age is measured on the same file that would be
    # overwritten; previously the default directory was always inspected.
    file_age = get_age_of_data_pull(data_table_name=data_table_name, DATA_DIR=DATA_DIR)
    if file_age > max_days_before_update:
        urlretrieve(data_urls[data_table_name], file_path)
        print(f"{data_table_name} data successfully downloaded")
    else:
        print(f"{data_table_name} was pulled {round(file_age, 2)} days ago.")
        print(f"Reduce the max_days_before_update parameter (currently {max_days_before_update} days) to pull now.")

In [17]:
pull_raw_data_table(data_table_name = 'nypd_shooting_historic')

nypd_shooting_historic was pulled 0.96 days ago.
Reduce the max_days_before_update parameter (currently 7 days) to pull now.


In [18]:
pull_raw_data_table(data_table_name = 'nypd_shooting_ytd')

nypd_shooting_ytd was pulled 0.96 days ago.
Reduce the max_days_before_update parameter (currently 7 days) to pull now.


In [19]:
pull_raw_data_table(data_table_name = 'nypd_arrest_historic')

nypd_arrest_historic was pulled 0.96 days ago.
Reduce the max_days_before_update parameter (currently 7 days) to pull now.


In [20]:
pull_raw_data_table(data_table_name = 'nypd_arrest_ytd')

nypd_arrest_ytd was pulled 0.96 days ago.
Reduce the max_days_before_update parameter (currently 7 days) to pull now.


In [21]:
pull_raw_data_table(data_table_name = 'nypd_complaint_historic')

nypd_complaint_historic was pulled 0.94 days ago.
Reduce the max_days_before_update parameter (currently 7 days) to pull now.


In [22]:
pull_raw_data_table(data_table_name = 'nypd_complaint_ytd')

nypd_complaint_ytd data successfully downloaded


In [23]:
def load_data_table(data_table_name: str, DATA_DIR: str = DATA_DIR) -> pd.DataFrame:
    """Read a previously downloaded data table from ``DATA_DIR`` into a DataFrame.

    The table name must be a key of the URL mapping returned by
    ``get_nypd_data_table_urls()`` and its CSV must already be on disk.
    If either condition fails, a message is printed and None is returned.
    """
    known_tables = get_nypd_data_table_urls()
    if data_table_name not in known_tables:
        print(f'No known URL or file for the (alleged) data table with the name {data_table_name}.')
        return None

    csv_path = os.path.join(DATA_DIR, f'{data_table_name}.csv')
    if not os.path.isfile(csv_path):
        print(f"{data_table_name} hasn't been pulled. Call pull_raw_data_table() then retry.")
        return None

    return pd.read_csv(csv_path)

In [None]:
# Arrest-table columns mapped to the Python `str` type.
# NOTE(review): presumably meant as a `dtype=` mapping for pd.read_csv so that
# keys and codes are not parsed as numbers; confirm against the intended caller.
arrest_dtypes = dict.fromkeys(
    (
        'ARREST_KEY',
        'PD_CD',
        'PD_DESC',
        'KY_CD',
        'ARREST_PRECINCT',
        'JURISDICTION_CODE',
    ),
    str,
)

In [63]:
def get_nypd_data_table_dtypes(data_table_name: str) -> Dict:
    """Return the column -> dtype mapping registered for a data table.

    Dtype groups are persisted in
    ``../yaml_files/nypd_data_table_dtypes.yaml`` (created from the defaults
    below on first use). The group that applies to ``data_table_name`` is
    looked up through ``get_nypd_data_table_dtype_map()``.

    Args:
        data_table_name: name of the data table, e.g. 'nypd_arrest_historic'.

    Returns:
        The dtype group stored in the YAML file for that table.

    Raises:
        KeyError: if the table has no entry in the dtype map, or if its dtype
            group is not defined in the dtypes YAML file (only
            'arrest_dtypes' is part of the initial defaults).
    """
    YAML_DIR = os.path.join('..', 'yaml_files')
    ensure_dir_exists(YAML_DIR)
    yaml_path = os.path.join(YAML_DIR, 'nypd_data_table_dtypes.yaml')

    if not os.path.isfile(yaml_path):
        print('Making first yaml data file')
        # Initial data load
        # NOTE(review): the values are Python type objects, so the YAML file
        # relies on PyYAML's Python-specific tags rather than plain data;
        # confirm this is intended and loadable with the installed PyYAML.
        nyc_data_dtypes = {
            'arrest_dtypes':{
                'ARREST_KEY':str,
                'PD_CD':str,
                'PD_DESC':str,
                'KY_CD':str,
                'ARREST_PRECINCT':str,
                'JURISDICTION_CODE':str
            }
        }
        # Context managers make sure the file is flushed and closed before it
        # is read back, instead of leaving that to garbage collection.
        with open(yaml_path, 'w') as yaml_file:
            yaml.dump(data=nyc_data_dtypes, stream=yaml_file)
    with open(yaml_path, 'r') as yaml_file:
        nyc_data_dtypes = yaml.load(stream=yaml_file, Loader=yaml.FullLoader)

    dtype_map = get_nypd_data_table_dtype_map(data_table_name=data_table_name)
    if data_table_name not in dtype_map:
        raise KeyError(f"No dtype group is mapped to the data table '{data_table_name}'.")
    dtype_group = dtype_map[data_table_name]
    if dtype_group not in nyc_data_dtypes:
        raise KeyError(
            f"Dtype group '{dtype_group}' (for data table '{data_table_name}') "
            f"is not defined in {yaml_path}."
        )
    return nyc_data_dtypes[dtype_group]

In [58]:
def get_nypd_data_table_dtype_map(data_table_name: str) -> Dict:
    """Return the mapping of data table names to dtype-group names.

    The mapping is persisted in
    ``../yaml_files/nypd_data_table_dtype_map.yaml`` (created from the
    defaults below on first use) and is always read back from that file.

    Args:
        data_table_name: currently unused; the complete mapping is returned
            regardless of its value. Kept so existing callers keep working.

    Returns:
        The object loaded from the YAML file (with the defaults below, a
        dict of data table name -> dtype-group name).
    """
    YAML_DIR = os.path.join('..', 'yaml_files')
    ensure_dir_exists(YAML_DIR)
    yaml_path = os.path.join(YAML_DIR, 'nypd_data_table_dtype_map.yaml')

    if not os.path.isfile(yaml_path):
        print('Making first yaml data file')
        # Initial data load
        nyc_table_dtype_map = {
            'nypd_arrest_historic':'arrest_dtypes',
            'nypd_arrest_ytd':'arrest_dtypes',
            'nypd_complaint_historic':'complaint_dtypes',
            'nypd_complaint_ytd':'complaint_dtypes',
            'nypd_shooting_historic':'shooting_dtypes',
            'nypd_shooting_ytd':'shooting_dtypes',
        }
        # Context managers make sure the file is flushed and closed before it
        # is read back, instead of leaving that to garbage collection.
        with open(yaml_path, 'w') as yaml_file:
            yaml.dump(data=nyc_table_dtype_map, stream=yaml_file)
    with open(yaml_path, 'r') as yaml_file:
        nyc_table_dtype_map = yaml.load(stream=yaml_file, Loader=yaml.FullLoader)
    return nyc_table_dtype_map

In [6]:
nyc_data_urls

{'nypd_arrest_historic': 'https://data.cityofnewyork.us/api/views/8h9b-rp9u/rows.csv?accessType=DOWNLOAD',
 'nypd_arrest_ytd': 'https://data.cityofnewyork.us/api/views/uip8-fykc/rows.csv?accessType=DOWNLOAD',
 'nypd_complaint_historic': 'https://data.cityofnewyork.us/api/views/qgea-i56i/rows.csv?accessType=DOWNLOAD',
 'nypd_complaint_ytd': 'https://data.cityofnewyork.us/api/views/5uac-w243/rows.csv?accessType=DOWNLOAD',
 'nypd_shooting_historic': 'https://data.cityofnewyork.us/api/views/833y-fsy8/rows.csv?accessType=DOWNLOAD',
 'nypd_shooting_ytd': 'https://data.cityofnewyork.us/api/views/5ucz-vwe8/rows.csv?accessType=DOWNLOAD'}

In [64]:
nyc_data_dtypes = get_nypd_data_table_dtypes(data_table_name='nypd_arrest_historic')
nyc_data_dtypes

{'nypd_arrest_historic': 'arrest_dtypes', 'nypd_arrest_ytd': 'arrest_dtypes', 'nypd_complaint_historic': 'complaint_dtypes', 'nypd_complaint_ytd': 'complaint_dtypes', 'nypd_shooting_historic': 'shooting_dtypes', 'nypd_shooting_ytd': 'shooting_dtypes'}


{'ARREST_KEY': str,
 'ARREST_PRECINCT': str,
 'JURISDICTION_CODE': str,
 'KY_CD': str,
 'PD_CD': str,
 'PD_DESC': str}

In [59]:
get_nypd_data_table_dtype_map(data_table_name = 'nypd_arrest_historic')

Making first yaml data file


{'nypd_arrest_historic': 'arrest_dtypes',
 'nypd_arrest_ytd': 'arrest_dtypes',
 'nypd_complaint_historic': 'complaint_dtypes',
 'nypd_complaint_ytd': 'complaint_dtypes',
 'nypd_shooting_historic': 'shooting_dtypes',
 'nypd_shooting_ytd': 'shooting_dtypes'}

In [27]:
nypd_arrest_historic_df = load_data_table(data_table_name = 'nypd_arrest_historic')

In [28]:
nypd_arrest_ytd_df = load_data_table(data_table_name = 'nypd_arrest_ytd')

In [29]:
nypd_arrest_historic_df.head(2)

Unnamed: 0,ARREST_KEY,ARREST_DATE,PD_CD,PD_DESC,KY_CD,OFNS_DESC,LAW_CODE,LAW_CAT_CD,ARREST_BORO,ARREST_PRECINCT,JURISDICTION_CODE,AGE_GROUP,PERP_SEX,PERP_RACE,X_COORD_CD,Y_COORD_CD,Latitude,Longitude,Lon_Lat
0,144026181,06/26/2015,639.0,AGGRAVATED HARASSMENT 2,361.0,OFF. AGNST PUB ORD SENSBLTY & RGHTS TO PRIV,PL 2403002,M,Q,102,0.0,45-64,M,WHITE HISPANIC,1031076.0,193779.0,40.69844,-73.83113,POINT (-73.83112953899997 40.69843969400005)
1,144507595,07/14/2015,969.0,"TRAFFIC,UNCLASSIFIED INFRACTION",881.0,OTHER TRAFFIC INFRACTION,VTL051101A,M,M,10,3.0,25-44,M,WHITE HISPANIC,984791.0,209846.0,40.742664,-73.998049,POINT (-73.99804910799998 40.74266360800004)


In [30]:
nypd_arrest_ytd_df.head(2)

Unnamed: 0,ARREST_KEY,ARREST_DATE,PD_CD,PD_DESC,KY_CD,OFNS_DESC,LAW_CODE,LAW_CAT_CD,ARREST_BORO,ARREST_PRECINCT,JURISDICTION_CODE,AGE_GROUP,PERP_SEX,PERP_RACE,X_COORD_CD,Y_COORD_CD,Latitude,Longitude,New Georeferenced Column
0,210514390,03/04/2020,168.0,SODOMY 1,116.0,SEX CRIMES,PL 1305001,F,M,10,0,25-44,M,WHITE,984623,209859,40.742699,-73.998655,POINT (-73.99865537999995 40.74269929900004)
1,210932725,03/11/2020,177.0,SEXUAL ABUSE,116.0,SEX CRIMES,PL 1306501,F,Q,113,0,25-44,F,WHITE,1046367,186986,40.6797,-73.776047,POINT (-73.77604736799998 40.67970040800003)


In [35]:
# Position-by-position comparison of the column names of the two arrest tables.
[
    historic_name == ytd_name
    for historic_name, ytd_name in zip(nypd_arrest_historic_df.columns, nypd_arrest_ytd_df.columns)
]

[True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 False]

In [42]:
# Give the year-to-date table the historic table's column names (assigned by
# position), then stack both tables under a fresh 0..n-1 index.
nypd_arrest_ytd_df.columns = nypd_arrest_historic_df.columns
nypd_arrest_df = pd.concat(
    [nypd_arrest_historic_df, nypd_arrest_ytd_df],
    ignore_index=True,
)

In [43]:
nypd_arrest_df.head(2)

Unnamed: 0,ARREST_KEY,ARREST_DATE,PD_CD,PD_DESC,KY_CD,OFNS_DESC,LAW_CODE,LAW_CAT_CD,ARREST_BORO,ARREST_PRECINCT,JURISDICTION_CODE,AGE_GROUP,PERP_SEX,PERP_RACE,X_COORD_CD,Y_COORD_CD,Latitude,Longitude,Lon_Lat
0,144026181,06/26/2015,639.0,AGGRAVATED HARASSMENT 2,361.0,OFF. AGNST PUB ORD SENSBLTY & RGHTS TO PRIV,PL 2403002,M,Q,102,0.0,45-64,M,WHITE HISPANIC,1031076.0,193779.0,40.69844,-73.83113,POINT (-73.83112953899997 40.69843969400005)
1,144507595,07/14/2015,969.0,"TRAFFIC,UNCLASSIFIED INFRACTION",881.0,OTHER TRAFFIC INFRACTION,VTL051101A,M,M,10,3.0,25-44,M,WHITE HISPANIC,984791.0,209846.0,40.742664,-73.998049,POINT (-73.99804910799998 40.74266360800004)


In [44]:
# Parse ARREST_DATE using the MM/DD/YYYY format and overwrite the column with
# the resulting datetime values.
nypd_arrest_df['ARREST_DATE'] = pd.to_datetime(nypd_arrest_df['ARREST_DATE'], format='%m/%d/%Y')

In [50]:
nypd_arrest_df.columns

Index(['ARREST_KEY', 'ARREST_DATE', 'PD_CD', 'PD_DESC', 'KY_CD', 'OFNS_DESC',
       'LAW_CODE', 'LAW_CAT_CD', 'ARREST_BORO', 'ARREST_PRECINCT',
       'JURISDICTION_CODE', 'AGE_GROUP', 'PERP_SEX', 'PERP_RACE', 'X_COORD_CD',
       'Y_COORD_CD', 'Latitude', 'Longitude', 'Lon_Lat'],
      dtype='object')

In [45]:
nypd_arrest_df.head(2)

Unnamed: 0,ARREST_KEY,ARREST_DATE,PD_CD,PD_DESC,KY_CD,OFNS_DESC,LAW_CODE,LAW_CAT_CD,ARREST_BORO,ARREST_PRECINCT,JURISDICTION_CODE,AGE_GROUP,PERP_SEX,PERP_RACE,X_COORD_CD,Y_COORD_CD,Latitude,Longitude,Lon_Lat
0,144026181,2015-06-26,639.0,AGGRAVATED HARASSMENT 2,361.0,OFF. AGNST PUB ORD SENSBLTY & RGHTS TO PRIV,PL 2403002,M,Q,102,0.0,45-64,M,WHITE HISPANIC,1031076.0,193779.0,40.69844,-73.83113,POINT (-73.83112953899997 40.69843969400005)
1,144507595,2015-07-14,969.0,"TRAFFIC,UNCLASSIFIED INFRACTION",881.0,OTHER TRAFFIC INFRACTION,VTL051101A,M,M,10,3.0,25-44,M,WHITE HISPANIC,984791.0,209846.0,40.742664,-73.998049,POINT (-73.99804910799998 40.74266360800004)


In [46]:
nypd_arrest_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5087740 entries, 0 to 5087739
Data columns (total 19 columns):
 #   Column             Dtype         
---  ------             -----         
 0   ARREST_KEY         int64         
 1   ARREST_DATE        datetime64[ns]
 2   PD_CD              float64       
 3   PD_DESC            object        
 4   KY_CD              float64       
 5   OFNS_DESC          object        
 6   LAW_CODE           object        
 7   LAW_CAT_CD         object        
 8   ARREST_BORO        object        
 9   ARREST_PRECINCT    int64         
 10  JURISDICTION_CODE  float64       
 11  AGE_GROUP          object        
 12  PERP_SEX           object        
 13  PERP_RACE          object        
 14  X_COORD_CD         float64       
 15  Y_COORD_CD         float64       
 16  Latitude           float64       
 17  Longitude          float64       
 18  Lon_Lat            object        
dtypes: datetime64[ns](1), float64(7), int64(2), object(9)
memory us

In [47]:
nypd_arrest_df['PD_CD'].isnull().sum()

269

In [49]:
nypd_arrest_df['PD_CD'].value_counts(dropna=False)

567.0    423472
101.0    409627
478.0    308145
511.0    289208
849.0    226941
          ...  
713.0         1
790.0         1
631.0         1
600.0         1
178.0         1
Name: PD_CD, Length: 335, dtype: int64

In [48]:
nypd_arrest_df.loc[nypd_arrest_df['PD_CD'].isnull()]

Unnamed: 0,ARREST_KEY,ARREST_DATE,PD_CD,PD_DESC,KY_CD,OFNS_DESC,LAW_CODE,LAW_CAT_CD,ARREST_BORO,ARREST_PRECINCT,JURISDICTION_CODE,AGE_GROUP,PERP_SEX,PERP_RACE,X_COORD_CD,Y_COORD_CD,Latitude,Longitude,Lon_Lat
991,10837169,2006-04-02,,,,,,,Q,101,0.0,,M,BLACK,1051775.0,159727.0,40.604841,-73.756823,POINT (-73.75682250899997 40.604840985000074)
7932,189850062,2018-11-11,,,,,,,K,73,97.0,25-44,M,BLACK,1010482.0,185602.0,40.676081,-73.905431,POINT (-73.90543088799996 40.676080719000026)
14848,189476256,2018-11-02,,,,,,,M,7,0.0,25-44,M,WHITE HISPANIC,988702.0,200012.0,40.715671,-73.983942,POINT (-73.98394174399994 40.71567057300007)
22259,188601148,2018-10-10,,,,,,,B,43,0.0,25-44,M,BLACK,1017998.0,239446.0,40.823843,-73.878064,POINT (-73.87806433299994 40.82384329800004)
23700,188405535,2018-10-05,,,,,,,B,41,0.0,25-44,M,WHITE HISPANIC,1013086.0,236614.0,40.816088,-73.895824,POINT (-73.89582435399994 40.81608766100004)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5025197,212182277,2020-04-20,,,,,PL 2650022,M,Q,101,0.0,45-64,M,BLACK,1053650.0,158969.0,40.602746,-73.750078,POINT (-73.75007786499998 40.60274595100003)
5027025,214332259,2020-06-18,,,,,PL 2410203,M,Q,106,0.0,45-64,F,ASIAN / PACIFIC ISLANDER,1031629.0,177604.0,40.654040,-73.829249,POINT (-73.82924892499994 40.65404010100008)
5031235,214565344,2020-06-25,,,,,PL 2650022,M,B,42,0.0,25-44,F,BLACK,1008691.0,238872.0,40.822298,-73.911694,POINT (-73.91169413099993 40.82229848500003)
5046153,211476100,2020-03-25,,,,,PL 2650022,M,M,33,0.0,25-44,M,BLACK HISPANIC,1001992.0,247191.0,40.845148,-73.935876,POINT (-73.93587615099995 40.84514773600006)
