In [48]:
import pandas as pd
import requests
import io
from pathlib import Path
import datetime as dt

In [49]:
# Help with querying API: https://dev.socrata.com/docs/queries/

# Load the raw inspection records. A local CSV acts as a cache: when it is
# present it is read directly, otherwise the data.cityofnewyork.us Socrata
# endpoint is queried once and the result is written to that CSV.
dirtyFile = Path('dohmh_dirty.csv')
if dirtyFile.exists():
    df = pd.read_csv(dirtyFile)
else:
    # Build select statement with aliases
    q_select = (
        'camis AS id,'
        'dba AS name,'
        'boro AS borough,'
        'cuisine_description AS cuisine,'
        'inspection_date,'
        'latitude AS lat,'
        'longitude AS lng'
    )

    # Build filters
    # Keep only inspections newer than 2 * 365 days before "now"
    dateLimit = (dt.datetime.now() - dt.timedelta(days = 2 * 365)).isoformat()
    filter_dt = f'inspection_date > "{dateLimit}"'
    # Require cuisine and both coordinates to be present
    notNull = 'IS NOT NULL'
    filter_NA = \
        f'cuisine {notNull} AND lat {notNull} AND lng {notNull}'

    # Init full filters and limit
    q_where = f'{filter_dt} AND {filter_NA}'
    q_limit = 200000

    url = 'https://data.cityofnewyork.us/resource/43nn-pn8j.csv'
    params = {
        '$select': q_select,
        '$where': q_where,
        '$limit': q_limit
    }
    # The timeout keeps a stalled connection from hanging the kernel forever
    response = requests.get(url, params = params, timeout = 60)
    # Fail loudly on an HTTP error. Without this check the error payload would
    # be parsed as CSV and written to the cache file, and every later run
    # would silently load that bad cache instead of re-querying.
    response.raise_for_status()
    # Using io.StringIO to create pseudo CSV file for export and reading
    csv = io.StringIO(response.content.decode('utf-8'))
    df = pd.read_csv(csv)
    df.to_csv(dirtyFile, header = True, index = False)

df.head(10)

Unnamed: 0,id,name,borough,cuisine,inspection_date,lat,lng
0,50149099,LIV KTV & PARTY,Queens,American,2024-04-04T00:00:00.000,40.762268,-73.831539
1,50119603,MALALA CHINESE,Queens,Chinese,2023-04-24T00:00:00.000,40.763064,-73.913269
2,41248866,BARAKA BUFFET,Manhattan,African,2024-09-12T00:00:00.000,0.0,0.0
3,50137610,MARI:NE,Manhattan,Korean,2024-09-09T00:00:00.000,40.754684,-73.985797
4,50048817,BARN JOO 35,Manhattan,Korean,2023-10-02T00:00:00.000,40.749641,-73.985405
5,50147088,UTSHOB RESTAURANT,Manhattan,Indian,2024-09-04T00:00:00.000,40.727428,-73.985395
6,50143746,MR ZHANG NOODLE,Queens,Chinese,2024-09-14T00:00:00.000,40.758502,-73.833242
7,40537963,NEW RUAN'S RESTAURANT,Brooklyn,Chinese,2023-09-27T00:00:00.000,40.605141,-73.999132
8,41395419,CITI FIELD STAND 402,Queens,American,2024-09-20T00:00:00.000,0.0,0.0
9,50103595,FRAME,Manhattan,American,2024-06-06T00:00:00.000,40.754508,-73.988071


In [50]:
# Parse the timestamp strings into pandas datetimes (time-of-day and tz info are not needed)
df['inspection_date'] = pd.to_datetime(df['inspection_date'])

# Group by id to find each location's latest inspection date, then join that
# back onto the full frame so only rows from the most recent inspection remain
uniqueLocs = df.groupby('id')['inspection_date'].max().reset_index()
df = pd.merge(uniqueLocs, df, how = 'left', on = ['id', 'inspection_date'])

# An id can still have several identical rows for its latest date; keep one of each
df = df.drop_duplicates(keep = 'last')

In [52]:
# Write the de-duplicated frame to disk with a header row and without the index column
cleanFile = Path('dohmh_clean.csv')
df.to_csv(path_or_buf = cleanFile, index = False)