In [1]:
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from IPython.core.display import display, HTML
%matplotlib inline

In [15]:
import re
import yaml
import geopandas as gpd
import requests
import shutil
import pprint
from urllib.request import urlretrieve
from datetime import datetime
from zipfile import ZipFile

# Notebook styling: seaborn defaults, no cap on displayed DataFrame columns,
# and notebook container stretched to the full browser width.
sns.set()
pd.set_option('display.max_columns', None)
display(HTML("<style>.container { width:100% !important; }</style>"))

# Relative path of the folder that holds the raw downloaded files.
DATA_DIR = os.path.join('..', 'data_raw')

In [None]:
from typing import (
    Dict,
    List,
    Tuple,
    Set,
    Deque,
    NamedTuple,
    IO,
    Pattern,
    Match,
    Text,
    Optional,
    Sequence,
    Iterable,
    Mapping,
    MutableMapping,
    Any,
)

In [3]:
def ensure_dir_exists(DIR_PATH):
    """Create the directory ``DIR_PATH`` (and any missing parents) if needed.

    Args:
        DIR_PATH: Path of the directory that must exist after the call.

    Raises:
        FileExistsError: If ``DIR_PATH`` exists but is not a directory.
    """
    # exist_ok=True removes the window between "check" and "create" in which
    # another process could create the directory and make makedirs() fail.
    os.makedirs(DIR_PATH, exist_ok=True)
        
def freq_label_selector(freq):
    """Map a frequency code to its display label.

    ``'m'`` gives ``'Month'`` and ``'q'`` gives ``'Quarter'`` (both matched
    case-insensitively); every other code gives ``'Year'``.
    """
    labels_by_code = {'m': 'Month', 'q': 'Quarter'}
    return labels_by_code.get(freq.lower(), 'Year')

In [4]:
# Download metadata for the NYPD shooting tables, keyed by table name.
# Each entry holds the CSV export 'url' and the local 'filename' to save it as.
# NOTE(review): get_nypd_data_table_urls() returns a flat name -> URL mapping
# instead; the stored output of the lookup cell further down is a bare URL
# string, so it was not produced from this nested dict.
nyc_data_urls = {
    'nypd_shooting_historic': {
        'url': 'https://data.cityofnewyork.us/api/views/833y-fsy8/rows.csv?accessType=DOWNLOAD',
        'filename': 'nypd_shooting_historic.csv',
    },
    'nypd_shooting_ytd': {
        'url': 'https://data.cityofnewyork.us/api/views/5ucz-vwe8/rows.csv?accessType=DOWNLOAD',
        'filename': 'nypd_shooting_ytd.csv',
    },
}

In [20]:
def get_nypd_data_table_urls():
    """Return the mapping of NYPD data-table names to CSV download URLs.

    The mapping is persisted in ``../yaml_files/nypd_data_urls.yaml``. If that
    file does not exist yet, it is created with the initial shooting, arrest
    and complaint URLs. The mapping is then always read back from the file, so
    the YAML file is the source of truth.

    Returns:
        dict: Table name -> download URL, as loaded from the YAML file.
    """
    YAML_DIR = os.path.join('..', 'yaml_files')
    ensure_dir_exists(YAML_DIR)
    yaml_path = os.path.join(YAML_DIR, 'nypd_data_urls.yaml')

    if not os.path.isfile(yaml_path):
        print('Making first yaml data file')
        # Initial data load
        initial_urls = {
            'nypd_shooting_historic':'https://data.cityofnewyork.us/api/views/833y-fsy8/rows.csv?accessType=DOWNLOAD',
            'nypd_shooting_ytd':'https://data.cityofnewyork.us/api/views/5ucz-vwe8/rows.csv?accessType=DOWNLOAD',
            'nypd_arrest_historic':'https://data.cityofnewyork.us/api/views/8h9b-rp9u/rows.csv?accessType=DOWNLOAD',
            'nypd_arrest_ytd':'https://data.cityofnewyork.us/api/views/uip8-fykc/rows.csv?accessType=DOWNLOAD',
            'nypd_complaint_historic':'https://data.cityofnewyork.us/api/views/qgea-i56i/rows.csv?accessType=DOWNLOAD',
            'nypd_complaint_ytd':'https://data.cityofnewyork.us/api/views/5uac-w243/rows.csv?accessType=DOWNLOAD',
        }
        # Context manager guarantees the file is flushed and closed before it
        # is read back below.
        with open(yaml_path, 'w') as yaml_file:
            yaml.dump(data=initial_urls, stream=yaml_file)

    # NOTE(review): FullLoader is kept to preserve behavior; the file holds
    # only plain strings, so yaml.safe_load would be the safer choice if this
    # file could ever come from an untrusted source.
    with open(yaml_path, 'r') as yaml_file:
        data_table_urls = yaml.load(stream=yaml_file, Loader=yaml.FullLoader)
    return data_table_urls

In [7]:
nyc_data_urls['nypd_shooting_ytd']

'https://data.cityofnewyork.us/api/views/5ucz-vwe8/rows.csv?accessType=DOWNLOAD'

In [None]:
def download_from_data_portal(data_url, file_name, max_days_before_update = 7, 
                                      DEBUG = False):
    """Download ``data_url`` into ``DATA_DIR/file_name`` unless a fresh copy exists.

    A local copy counts as fresh when its modification time is less than
    ``max_days_before_update`` days old; in that case nothing is downloaded.

    Args:
        data_url: URL of the file to download.
        file_name: Name of the file to write inside ``DATA_DIR``.
        max_days_before_update: Maximum age, in days, of an existing local
            copy before it is downloaded again.
        DEBUG: When true, print the resolved paths and the local variables.
    """
    dir_path = os.path.join(DATA_DIR)
    file_path = os.path.join(dir_path, file_name)
    if DEBUG:
        print(f"dir_path:  {dir_path}")
        print(f"file_path: {file_path}")
    ensure_dir_exists(dir_path)

    # None means "no local copy yet", which always triggers a download.
    file_age = None
    if os.path.isfile(file_path):
        file_datetime = datetime.fromtimestamp(os.path.getmtime(file_path))
        # total_seconds() is the full elapsed time. timedelta.seconds is only
        # the 0-86399 remainder after whole days, which made every existing
        # file look less than a day old and therefore never refreshed.
        file_age = (datetime.now() - file_datetime).total_seconds()

    max_age_seconds = max_days_before_update * 24 * 60 * 60
    if file_age is not None and file_age < max_age_seconds:
        print(f'Data already downloaded and extracted within the past {max_days_before_update} days.')
    else:
        urlretrieve(data_url, file_path)
        print(f"{file_name} data successfully downloaded")
    if DEBUG:
        pp = pprint.PrettyPrinter(indent=4)
        print('Local variables:')
        pp.pprint(locals())

In [9]:
data_table_name = 'nypd_shooting_historic'

# Age, in seconds, of the locally stored CSV for this table.
dir_path = os.path.join(DATA_DIR)
file_path = os.path.join(dir_path, f"{data_table_name}.csv")
ensure_dir_exists(dir_path)
if (os.path.isfile(file_path)):
    file_datetime = datetime.fromtimestamp(os.path.getmtime(file_path))
    # total_seconds() covers the whole interval; timedelta.seconds drops the
    # days component and would never report more than 86399 seconds.
    file_age = (datetime.now() - file_datetime).total_seconds()
else:
    # No local copy: report a sentinel age of 100 years.
    file_age = 100 * 365 * 24 * 60 * 60
print(f"File age: {file_age} (seconds), {file_age / (24 * 60 * 60)} (days), {file_age / (365 * 24 * 60 * 60)} (years)")

File age: 3153600000 (seconds), 36500.0 (days), 100.0 (years)


In [13]:
def get_age_of_data_pull(data_table_name: str, VERBOSE = False) -> float:
    """Return the age, in days, of the most recent raw-data pull for a table.

    Looks for ``DATA_DIR/<data_table_name>.csv`` and measures the time since
    its last modification.

    Args:
        data_table_name: Name of the data table; the file checked is
            ``<data_table_name>.csv`` inside ``DATA_DIR``.
        VERBOSE: When true, print the age in days and in years.

    Returns:
        The file's age in days, or 36500 (100 years) if the file does not exist.
    """
    dir_path = os.path.join(DATA_DIR)
    file_path = os.path.join(dir_path, f"{data_table_name}.csv")
    ensure_dir_exists(dir_path)
    if os.path.isfile(file_path):
        file_datetime = datetime.fromtimestamp(os.path.getmtime(file_path))
        # total_seconds() is the full elapsed time. timedelta.seconds excludes
        # whole days, so the result in days could never reach 1.0.
        elapsed_seconds = (datetime.now() - file_datetime).total_seconds()
        file_age = elapsed_seconds / (24 * 60 * 60)
    else:
        # Sentinel: treat a missing file as 100 years old.
        file_age = 100 * 365
    if VERBOSE:
        print(f"{file_age} (days)")
        print(f"{file_age / 365} (years)")
    return file_age

In [14]:
get_age_of_data_pull(data_table_name = 'nypd_shooting_historic')

36500

In [None]:
data_table_name = 'nypd_shooting_historic'

# Pass the variable rather than repeating the literal so the two cannot drift.
# Note: get_age_of_data_pull() returns the age in days.
file_age = get_age_of_data_pull(data_table_name = data_table_name)

In [None]:
# Download the raw CSV for one table using the helper defined above.
# This cell used to hold a pasted copy of download_from_data_portal()'s body:
# it referenced data_url / file_name without defining them and was indented
# as if still inside a function, so it could not run. Calling the function
# keeps a single implementation of the freshness check.
data_table_name = 'nypd_shooting_historic'
data_url = get_nypd_data_table_urls()[data_table_name]
file_name = f"{data_table_name}.csv"
max_days_before_update = 7
DEBUG = False

download_from_data_portal(
    data_url,
    file_name,
    max_days_before_update=max_days_before_update,
    DEBUG=DEBUG,
)