In [2]:
# Import the dependencies.

# ETL
# ---
import pandas as pd
import requests
import io
from pathlib import Path
import datetime as dt

# DB Backend
# -----------
from sqlalchemy import create_engine, Column, Integer, Float, String, DateTime
from sqlalchemy.orm import sessionmaker, declarative_base
from sqlalchemy.ext.automap import automap_base

# Flask API Hosting
# -----------------
from flask import Flask, jsonify, request, abort

In [3]:
# ETL
# ---

# EXTRACTION (LIGHT TRANSFORMATION PERFORMED DURING EXTRACTION)
# --------------------------------------------------------------
# Populates `df` with the raw inspection records. A previously saved raw CSV
# is reused when it exists; otherwise the records are requested from the
# Socrata (SODA) endpoint below and the result is saved for later runs.

# API Call/Avoiding API if file already exists --> DF creation

rawFile = Path('../csv_files/raw/dohmh_original.csv')
if rawFile.exists():
    # Cache hit: skip the API call entirely
    df = pd.read_csv(rawFile)
else:
    # Build select statement with aliases
    q_select = (
        'camis AS id,'
        'dba AS name,'
        'boro AS borough,'
        'cuisine_description AS cuisine,'
        'inspection_date,'
        'latitude AS lat,'
        'longitude AS lng'
    )

    # Build filters for date (within 2 years) and no nulls for cuisine, lat, or lng
    dateLimit = (dt.datetime.now() - dt.timedelta(days = 2 * 365)).isoformat()
    filter_dt = f'inspection_date > "{dateLimit}"'
    notNull = 'IS NOT NULL'
    filter_NA = \
        f'cuisine {notNull} AND lat {notNull} AND lng {notNull}'

    # Init full filters for API call with limit
    # NOTE(review): if more than q_limit rows match, the response is presumably
    # truncated without any error -- confirm the limit is high enough.
    q_where = f'{filter_dt} AND {filter_NA}'
    q_limit = 200000

    # Base URL
    url = 'https://data.cityofnewyork.us/resource/43nn-pn8j.csv'

    # Parameters to send with API Call
    params = {
        '$select': q_select,
        '$where': q_where,
        '$limit': q_limit
    }

    # API Call itself using socrata (SODA) querying.
    # The timeout keeps a stalled connection from hanging the cell forever.
    response = requests.get(url, params = params, timeout = 120)

    # Stop here on an HTTP error (bad query, throttling, outage). Without this
    # check the error body would be parsed as CSV and written to rawFile, and
    # every later run would silently reuse that bad cache.
    response.raise_for_status()

    # Using io.StringIO to create pseudo CSV file for export and reading
    csv = io.StringIO(response.content.decode('utf-8'))
    df = pd.read_csv(csv)

    # MINOR LOADING OF ORIGINAL DATASET RETRIEVED
    # Create the target folder first so a fresh checkout does not fail on write
    rawFile.parent.mkdir(parents = True, exist_ok = True)
    df.to_csv(rawFile, header = True, index = False)


# TRANSFORMATION
# --------------
# Reduces `df` to a single row per `id` (the row from its most recent
# inspection date), fixes the column order, and saves the cleaned table.

# Correcting date type --> datetime (doesn't need times or tz info)
df['inspection_date'] = pd.to_datetime(df['inspection_date'])

# Group by id to resolve outdated records (grab most recent ones only)
uniqueLocs = df.groupby('id')['inspection_date'].max().reset_index(drop = False)

# Keep only the rows whose (id, inspection_date) pair is the latest for that id
df = uniqueLocs.merge(df, how = 'left', on = ['id', 'inspection_date'])

# Several rows can share the most recent date for an id. De-duplicate on `id`
# itself rather than on whole rows: the result is the same when the tied rows
# are exact duplicates, and it still yields one row per id if they differ in
# any other column.
df = df.drop_duplicates(subset = 'id', keep = 'last')

# Reorder to correct columns
df = df[
    ['id', 'name', 'borough', 'cuisine', 'inspection_date', 'lat', 'lng']
].reset_index(drop = True)

cleanFile = Path('../csv_files/clean/dohmh_clean.csv')
# Create the target folder first so a fresh checkout does not fail on write
cleanFile.parent.mkdir(parents = True, exist_ok = True)
df.to_csv(cleanFile, header = True, index = False)

In [None]:
Base = declarative_base()

class NYC_DOHMH(Base):
    """ORM model for the `nyc_dohmh` table.

    Columns mirror the cleaned ETL output: id, name, borough, cuisine,
    inspection_date, lat, lng. Every column is required (non-nullable).
    """
    __tablename__ = 'nyc_dohmh'

    # One row per id; this is the only uniqueness the cleaned data provides
    id = Column(Integer, primary_key = True)
    # Deliberately NOT unique: the ETL keeps one row per id, not per name, so
    # two ids sharing a name would violate a unique constraint on insert
    name = Column(String, nullable = False)
    borough = Column(String, nullable = False)
    cuisine = Column(String, nullable = False)
    inspection_date = \
        Column(DateTime, nullable = False)
    lat = Column(Float, nullable = False)
    lng = Column(Float, nullable = False)