In [None]:
from pathlib import Path

# This is the name of your environment
venv = 'test'

if not Path(venv).is_dir():
    !python3 bootstrap_env.py {venv}

In [45]:
from bootstrap_env import main

In [None]:
from sodapy import Socrata
import datetime as dt
import pandas as pd
import os

from dotenv import load_dotenv
load_dotenv()

True

## Cleaning

In [None]:
def get_df():
    domain_src = 'data.cityofnewyork.us'
    resource_id = '43nn-pn8j'

    years = 7
    dateLimit = (dt.datetime.now() - dt.timedelta(days = years * 365)).date().isoformat()
    client = Socrata(domain_src, os.environ.get('nyc_open_key'))
    
    select_clause = (
        'camis,'
        'boro,'
        'zipcode,'
        'cuisine_description AS cuisine,'
        'inspection_date,'
        'inspection_type,'
        'action,'
        'violation_code,'
        'critical_flag,'
        'score,'
        'census_tract,'
        'nta,'
        'latitude,'
        'longitude'
    )

    where_clause = f'inspection_date > "{dateLimit}" AND cuisine IS NOT NULL'

    return pd.DataFrame.from_records(
        client.get(
            resource_id,
            select = select_clause,
            where = where_clause,
            limit = int(1e6)
        )
    )

In [52]:
raw_file = Path('resources/spatial_raw_open_nyc.csv')

if raw_file.exists():
    df = pd.read_csv(raw_file)
else:
    df = get_df()
    df.to_csv(raw_file, header = True, index = False)

In [53]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 277236 entries, 0 to 277235
Data columns (total 14 columns):
 #   Column           Non-Null Count   Dtype 
---  ------           --------------   ----- 
 0   camis            277236 non-null  object
 1   boro             277236 non-null  object
 2   zipcode          274529 non-null  object
 3   cuisine          277236 non-null  object
 4   inspection_date  277236 non-null  object
 5   inspection_type  277236 non-null  object
 6   action           277236 non-null  object
 7   critical_flag    277236 non-null  object
 8   score            265723 non-null  object
 9   census_tract     274009 non-null  object
 10  nta              273986 non-null  object
 11  latitude         276953 non-null  object
 12  longitude        276953 non-null  object
 13  violation_code   275485 non-null  object
dtypes: object(14)
memory usage: 29.6+ MB


In [54]:
action_map = {
    'Violations were cited in the following area(s).': 'cited_violation',
    'Establishment Closed by DOHMH. Violations were cited in the following area(s) and those requiring immediate action were addressed.': 'cited_violations_and_closed',
    'No violations were recorded at the time of this inspection.': 'no_violations',
    'Establishment re-opened by DOHMH.': 'reopened',
    'Establishment re-closed by DOHMH.': 'reclosed'
}

df['action'] = df['action'].map(action_map)

In [57]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 277236 entries, 0 to 277235
Data columns (total 14 columns):
 #   Column           Non-Null Count   Dtype 
---  ------           --------------   ----- 
 0   camis            277236 non-null  object
 1   boro             277236 non-null  object
 2   zipcode          274529 non-null  object
 3   cuisine          277236 non-null  object
 4   inspection_date  277236 non-null  object
 5   inspection_type  277236 non-null  object
 6   action           277236 non-null  object
 7   critical_flag    277236 non-null  object
 8   score            265723 non-null  object
 9   census_tract     274009 non-null  object
 10  nta              273986 non-null  object
 11  latitude         276953 non-null  object
 12  longitude        276953 non-null  object
 13  violation_code   275485 non-null  object
dtypes: object(14)
memory usage: 29.6+ MB


In [58]:
new_cols = ['inspection_type', 'inspection_subtype']

df[new_cols] = (
    df['inspection_type']
    .str
        .split('/', n = 1, expand = True)
    .rename(
        columns = {0: new_cols[0], 1: new_cols[1]}
    )
)

for col in new_cols:
    df[col] = df[col].str.strip()

In [126]:
cols = list(df.columns)
new_order = cols[:6]
new_order.extend(reversed(cols[-2:]))
new_order.extend(cols[6:-2])

df = df[new_order]

In [130]:
df = df.convert_dtypes()

In [133]:
mask = (df['critical_flag'] == 'Not Applicable', 'score')
df.loc[mask] = df.loc[mask].fillna('0')

mask = (df['action'] == 'no_violations', 'score')
df.loc[mask] = df.loc[mask].fillna('0')

mask = (df['violation_code'].isna())
df.loc[mask, 'violation_code'] = df.loc[mask, 'violation_code'].fillna('None')
df.loc[mask, 'score'] = df.loc[mask, 'score'].fillna('0')

In [135]:
df.dropna(how = 'any', inplace = True)

int_cols = ['camis', 'zipcode', 'score', 'census_tract']
df[int_cols] = df[int_cols].astype(int)

flt_cols = ['latitude', 'longitude']
df[flt_cols] = df[flt_cols].astype(float).round(5)

date_cols = ['inspection_date']
df[date_cols] = df[date_cols].apply(pd.to_datetime)

In [136]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 265049 entries, 0 to 277235
Data columns (total 15 columns):
 #   Column              Non-Null Count   Dtype         
---  ------              --------------   -----         
 0   camis               265049 non-null  int64         
 1   boro                265049 non-null  string        
 2   zipcode             265049 non-null  int64         
 3   cuisine             265049 non-null  string        
 4   inspection_date     265049 non-null  datetime64[ns]
 5   inspection_type     265049 non-null  string        
 6   inspection_subtype  265049 non-null  string        
 7   violation_code      265049 non-null  string        
 8   action              265049 non-null  string        
 9   critical_flag       265049 non-null  string        
 10  score               265049 non-null  int64         
 11  census_tract        265049 non-null  int64         
 12  nta                 265049 non-null  string        
 13  latitude            265049 non-nul

In [137]:
df.to_csv('spatial_cleaned_inspections.csv', header = True, index = False)

In [1]:
import pandas as pd
df = pd.read_csv('spatial_cleaned_inspections.csv')

In [4]:
getattr(df, 'info')()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 265049 entries, 0 to 265048
Data columns (total 15 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   camis               265049 non-null  int64  
 1   boro                265049 non-null  object 
 2   zipcode             265049 non-null  int64  
 3   cuisine             265049 non-null  object 
 4   inspection_date     265049 non-null  object 
 5   inspection_type     265049 non-null  object 
 6   inspection_subtype  265049 non-null  object 
 7   violation_code      263356 non-null  object 
 8   action              265049 non-null  object 
 9   critical_flag       265049 non-null  object 
 10  score               265049 non-null  int64  
 11  census_tract        265049 non-null  int64  
 12  nta                 265049 non-null  object 
 13  latitude            265049 non-null  float64
 14  longitude           265049 non-null  float64
dtypes: float64(2), int64(4), object(9)

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 265049 entries, 0 to 265048
Data columns (total 15 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   camis               265049 non-null  int64  
 1   boro                265049 non-null  object 
 2   zipcode             265049 non-null  int64  
 3   cuisine             265049 non-null  object 
 4   inspection_date     265049 non-null  object 
 5   inspection_type     265049 non-null  object 
 6   inspection_subtype  265049 non-null  object 
 7   violation_code      263356 non-null  object 
 8   action              265049 non-null  object 
 9   critical_flag       265049 non-null  object 
 10  score               265049 non-null  int64  
 11  census_tract        265049 non-null  int64  
 12  nta                 265049 non-null  object 
 13  latitude            265049 non-null  float64
 14  longitude           265049 non-null  float64
dtypes: float64(2), int64(4), object(9)

<!-- ## Preprocessing -->

### Postgres Connection Stuff

In [3]:
from core import Database, get_session_factory, get_settings, get_engine
from schemas import Inspection

from sqlalchemy.dialects.postgresql import insert

In [4]:
db = Database(get_session_factory())

In [5]:
rows = df.to_dict('records')
with db.get_session() as session:
    stmt = insert(Inspection).values(rows)
    stmt = stmt.on_conflict_do_nothing(
        constraint = 'uq_inspection_natural'
    )
    session.execute(stmt)

In [None]:
from ETL.extractors.sql_db import Postgres_Features
df = Postgres_Features().extract()

In [2]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 264765 entries, 0 to 264764
Data columns (total 16 columns):
 #   Column              Non-Null Count   Dtype         
---  ------              --------------   -----         
 0   id                  264765 non-null  int64         
 1   camis               264765 non-null  int64         
 2   boro                264765 non-null  object        
 3   zipcode             264765 non-null  int64         
 4   cuisine             264765 non-null  object        
 5   inspection_date     264765 non-null  datetime64[ns]
 6   inspection_type     264765 non-null  object        
 7   inspection_subtype  264765 non-null  object        
 8   action              264765 non-null  object        
 9   violation_code      264765 non-null  object        
 10  critical_flag       264765 non-null  object        
 11  score               264765 non-null  int64         
 12  census_tract        264765 non-null  int64         
 13  nta                 264765 no