In [1]:
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from IPython.core.display import display, HTML
%matplotlib inline

In [15]:
import re
import yaml
import geopandas as gpd
import requests
import shutil
import pprint
from urllib.request import urlretrieve
from datetime import datetime
from zipfile import ZipFile

# Notebook styling: apply seaborn's default theme, remove the cap on how many
# DataFrame columns pandas displays, and stretch the notebook container to the
# full browser width.
sns.set()
pd.options.display.max_columns = None
display(HTML("<style>.container { width:100% !important; }</style>"))

# Directory (relative to this notebook) where the raw CSV downloads are kept.
DATA_DIR = os.path.join('..', 'data_raw')

In [None]:
# from typing import (
#     Dict,
#     List,
#     Tuple,
#     Set,
#     Deque,
#     NamedTuple,
#     IO,
#     Pattern,
#     Match,
#     Text,
#     Optional,
#     Sequence,
#     Iterable,
#     Mapping,
#     MutableMapping,
#     Any,
# )

In [3]:
def ensure_dir_exists(DIR_PATH):
    """Create the directory ``DIR_PATH`` (including missing parents) if absent.

    Calling this on a directory that already exists is a no-op.

    Args:
        DIR_PATH: Path of the directory that must exist after the call.

    Raises:
        FileExistsError: If ``DIR_PATH`` exists but is not a directory.
    """
    # exist_ok=True avoids the race between a separate isdir() check and the
    # makedirs() call when another process creates the directory in between.
    os.makedirs(DIR_PATH, exist_ok=True)
        
def freq_label_selector(freq):
    """Translate a frequency code into a human-readable period label.

    The comparison is case-insensitive: ``'m'`` maps to ``'Month'``, ``'q'``
    maps to ``'Quarter'``, and every other value falls back to ``'Year'``.
    """
    labels_by_code = {'m': 'Month', 'q': 'Quarter'}
    return labels_by_code.get(freq.lower(), 'Year')

In [20]:
def get_nypd_data_table_urls():
    """Return the mapping of NYPD data table names to their CSV download URLs.

    The mapping is stored in ``../yaml_files/nypd_data_urls.yaml``. If that
    file does not exist yet, it is first created from the built-in defaults
    below; the mapping is then always read back from the file, so entries
    added to the YAML file by hand are picked up.

    Returns:
        dict: ``{table_name: download_url}`` as loaded from the YAML file.
    """
    YAML_DIR = os.path.join('..', 'yaml_files')
    ensure_dir_exists(YAML_DIR)
    yaml_path = os.path.join(YAML_DIR, 'nypd_data_urls.yaml')

    if not os.path.isfile(yaml_path):
        print('Making first yaml data file')
        # Initial data load
        nyc_data_urls = {
            'nypd_shooting_historic':'https://data.cityofnewyork.us/api/views/833y-fsy8/rows.csv?accessType=DOWNLOAD',
            'nypd_shooting_ytd':'https://data.cityofnewyork.us/api/views/5ucz-vwe8/rows.csv?accessType=DOWNLOAD',
            'nypd_arrest_historic':'https://data.cityofnewyork.us/api/views/8h9b-rp9u/rows.csv?accessType=DOWNLOAD',
            'nypd_arrest_ytd':'https://data.cityofnewyork.us/api/views/uip8-fykc/rows.csv?accessType=DOWNLOAD',
            'nypd_complaint_historic':'https://data.cityofnewyork.us/api/views/qgea-i56i/rows.csv?accessType=DOWNLOAD',
            'nypd_complaint_ytd':'https://data.cityofnewyork.us/api/views/5uac-w243/rows.csv?accessType=DOWNLOAD',
        }
        # Close the handle before reading the file back so the content is
        # guaranteed to be flushed to disk.
        with open(yaml_path, 'w') as yaml_file:
            yaml.dump(data=nyc_data_urls, stream=yaml_file)

    with open(yaml_path, 'r') as yaml_file:
        nyc_data_urls = yaml.load(stream=yaml_file, Loader=yaml.FullLoader)
    return nyc_data_urls

In [22]:
# Load the table-name -> download-URL mapping (the YAML file is created on first use).
nyc_data_urls = get_nypd_data_table_urls()

In [23]:
# Display the mapping to confirm which tables are available.
nyc_data_urls

{'nypd_arrest_historic': 'https://data.cityofnewyork.us/api/views/8h9b-rp9u/rows.csv?accessType=DOWNLOAD',
 'nypd_arrest_ytd': 'https://data.cityofnewyork.us/api/views/uip8-fykc/rows.csv?accessType=DOWNLOAD',
 'nypd_complaint_historic': 'https://data.cityofnewyork.us/api/views/qgea-i56i/rows.csv?accessType=DOWNLOAD',
 'nypd_complaint_ytd': 'https://data.cityofnewyork.us/api/views/5uac-w243/rows.csv?accessType=DOWNLOAD',
 'nypd_shooting_historic': 'https://data.cityofnewyork.us/api/views/833y-fsy8/rows.csv?accessType=DOWNLOAD',
 'nypd_shooting_ytd': 'https://data.cityofnewyork.us/api/views/5ucz-vwe8/rows.csv?accessType=DOWNLOAD'}

In [43]:
def get_age_of_data_pull(data_table_name: str, DATA_DIR: str = DATA_DIR, VERBOSE: bool = False) -> float:
    """Return the age, in days, of the most recent raw pull of a data table.

    The age is derived from the modification time of
    ``<DATA_DIR>/<data_table_name>.csv``. ``DATA_DIR`` is created if it does
    not exist yet.

    Args:
        data_table_name: Name of the table; the file checked is
            ``<data_table_name>.csv``.
        DATA_DIR: Directory holding the raw CSV files.
        VERBOSE: If True, print the age in days and in years.

    Returns:
        The file's age in (fractional) days, or 36500 (100 years) when the
        file does not exist, so that any freshness threshold is exceeded.
    """
    file_path = os.path.join(DATA_DIR, f"{data_table_name}.csv")
    ensure_dir_exists(DATA_DIR)
    if os.path.isfile(file_path):
        file_datetime = datetime.fromtimestamp(os.path.getmtime(file_path))
        # total_seconds() covers the whole interval. timedelta.seconds is only
        # the sub-day remainder (0..86399), which capped the age below 1 day.
        file_age = (datetime.now() - file_datetime).total_seconds() / (24 * 60 * 60)
    else:
        file_age = 100 * 365
    if VERBOSE:
        print(f"{file_age} (days)")
        print(f"{file_age / 365} (years)")
    return file_age

In [45]:
# Check how old (in days) the local 'nypd_shooting_historic' CSV is.
get_age_of_data_pull(data_table_name = 'nypd_shooting_historic')

0.01974537037037037

In [48]:
def pull_raw_data_table(data_table_name: str, max_days_before_update: int = 7, 
                        DATA_DIR: str = DATA_DIR) -> None:
    """Download the raw CSV for a data table when the local copy is stale.

    The table's URL is looked up via ``get_nypd_data_table_urls()``. The file
    is downloaded to ``<DATA_DIR>/<data_table_name>.csv`` only if the age
    reported by ``get_age_of_data_pull`` exceeds ``max_days_before_update``
    (a missing file is reported as 36500 days old, so it is always fetched).

    Args:
        data_table_name: Key of the table in the URL mapping.
        max_days_before_update: Maximum tolerated age, in days, of the local
            copy before it is downloaded again.
        DATA_DIR: Directory the CSV is stored in.
    """
    data_urls = get_nypd_data_table_urls()
    if data_table_name not in data_urls:
        print(f'No known URL for the (alleged) data table with the name {data_table_name}.')
        return

    file_path = os.path.join(DATA_DIR, f'{data_table_name}.csv')
    # Forward DATA_DIR so the freshness check inspects (and creates) the same
    # directory the download is written to, not the module-level default.
    file_age = get_age_of_data_pull(data_table_name=data_table_name, DATA_DIR=DATA_DIR)
    if file_age > max_days_before_update:
        urlretrieve(data_urls[data_table_name], file_path)
        print(f"{data_table_name} data successfully downloaded")

In [32]:
# Fetch the 'nypd_shooting_ytd' CSV unless pull_raw_data_table deems the local copy fresh.
pull_raw_data_table(data_table_name = 'nypd_shooting_ytd')

nypd_shooting_ytd data successfully downloaded


In [34]:
# Fetch the 'nypd_arrest_historic' CSV unless pull_raw_data_table deems the local copy fresh.
pull_raw_data_table(data_table_name = 'nypd_arrest_historic')

nypd_arrest_historic data successfully downloaded


In [36]:
# Fetch the 'nypd_arrest_ytd' CSV unless pull_raw_data_table deems the local copy fresh.
pull_raw_data_table(data_table_name = 'nypd_arrest_ytd')

nypd_arrest_ytd data successfully downloaded


In [None]:
# Fetch the 'nypd_complaint_historic' CSV unless pull_raw_data_table deems the local copy fresh.
pull_raw_data_table(data_table_name = 'nypd_complaint_historic')

In [47]:
def load_data_table(data_table_name: str, DATA_DIR: str = DATA_DIR) -> pd.DataFrame:
    """Read a previously downloaded raw data table into a DataFrame.

    The table name must be a key of the mapping returned by
    ``get_nypd_data_table_urls()`` and its CSV must already exist at
    ``<DATA_DIR>/<data_table_name>.csv``. If either condition fails, a message
    is printed and ``None`` is returned instead of a DataFrame.
    """
    known_tables = get_nypd_data_table_urls()
    if data_table_name not in known_tables.keys():
        print(f'No known URL or file for the (alleged) data table with the name {data_table_name}.')
        return None

    csv_path = os.path.join(DATA_DIR, f'{data_table_name}.csv')
    if not os.path.isfile(csv_path):
        print(f"{data_table_name} hasn't been pulled. Call pull_raw_data_table() then retry.")
        return None

    return pd.read_csv(csv_path)

In [None]:
# Read the downloaded 'nypd_arrest_historic' CSV (None is returned if it hasn't been pulled).
nypd_arrest_historic_df = load_data_table(data_table_name = 'nypd_arrest_historic')