In [99]:
from sodapy import Socrata
from dotenv import load_dotenv
from os import environ
import datetime as dt
import pandas as pd

In [100]:
# Socrata domain and dataset (resource) identifier used by the query cells below.
domain_src = 'data.cityofnewyork.us'
resource_id = '43nn-pn8j'

# Presumably loads variables from a local .env file into the process
# environment so the key lookup below can find it — confirm a .env is expected.
load_dotenv()
# Look-back window in years; a year is approximated as 365 days (no leap days).
years = 2
# ISO-8601 string for "now minus the look-back window", based on the local clock.
dateLimit = (dt.datetime.now() - dt.timedelta(days = years * 365)).isoformat()
# environ.get returns None when 'nyc_open_key' is not set; the client is then
# constructed with None as its second argument.
client = Socrata(domain_src, environ.get('nyc_open_key'))

In [101]:
# Comma-separated column list for the query's select parameter.
# 'camis' and 'cuisine_description' are aliased to the shorter names
# 'id' and 'cuisine'.
select_clause = ','.join([
    'camis AS id',
    'boro',
    'zipcode',
    'cuisine_description AS cuisine',
    'inspection_date',
    'action',
    'violation_code',
    'critical_flag',
    'score',
    'record_date',
    'inspection_type',
    'census_tract',
    'nta',
])

# Row filter: inspections after the cutoff date that have a cuisine value.
where_clause = f'inspection_date > "{dateLimit}" AND cuisine IS NOT NULL'

# Run the query and load the returned records into a DataFrame.
# NOTE: the request is capped at 200000 rows; a result set larger than that
# would come back truncated without any error.
df = pd.DataFrame.from_records(
    client.get(resource_id, select=select_clause, where=where_clause, limit=200000)
)

In [102]:
# Frequency of each distinct value in the 'action' column.
df['action'].value_counts()

action
Violations were cited in the following area(s).                                                                                       157154
Establishment Closed by DOHMH. Violations were cited in the following area(s) and those requiring immediate action were addressed.      6003
No violations were recorded at the time of this inspection.                                                                             1198
Establishment re-opened by DOHMH.                                                                                                        996
Establishment re-closed by DOHMH.                                                                                                        814
Name: count, dtype: int64

In [103]:
# Lookup from the full action text found in the 'action' column to a short
# snake_case label. Pairs are listed in the same order as the original mapping.
action_map = dict([
    (
        'Violations were cited in the following area(s).',
        'cited_violation',
    ),
    (
        'Establishment Closed by DOHMH. Violations were cited in the following area(s) and those requiring immediate action were addressed.',
        'cited_violations_and_closed',
    ),
    (
        'No violations were recorded at the time of this inspection.',
        'no_violations',
    ),
    (
        'Establishment re-opened by DOHMH.',
        'reopened',
    ),
    (
        'Establishment re-closed by DOHMH.',
        'reclosed',
    ),
])

In [104]:
# Collapse the verbose action text into the short labels from action_map.
# replace() leaves any value that is not a key of action_map untouched, so
# re-running this cell (when the column already holds the short labels) is a
# no-op. A second .map(action_map) would instead turn every value into NaN,
# and a first run would silently null out any action text missing from the map.
df['action'] = df['action'].replace(action_map)

- TODO: drop rows where `violation_code` is null

In [105]:
# Number of distinct violation codes that occur fewer than 20 times.
# Vectorized comparison instead of a Python loop, so no loop variables are
# left behind in the kernel namespace; int() keeps `count` a plain Python int.
# value_counts() excludes nulls by default, so missing codes are not counted.
count = int((df['violation_code'].value_counts() < 20).sum())

In [106]:
# Summary statistics over the per-code occurrence counts, i.e. how unevenly
# the violation codes are distributed.
df['violation_code'].value_counts().describe()

count      109.000000
mean      1513.605505
std       3446.401553
min          1.000000
25%         21.000000
50%        134.000000
75%       1308.000000
max      23247.000000
Name: count, dtype: float64

In [107]:
# Preview the 'score' column.
# NOTE(review): the column displays with dtype object — presumably the scores
# arrive as strings; confirm and convert before any numeric work.
df['score']

0         13
1          8
2         32
3         10
4         40
          ..
166160    26
166161    34
166162     0
166163     0
166164     0
Name: score, Length: 166165, dtype: object

In [108]:
# Summary statistics over how many times each distinct score value occurs.
df['score'].value_counts().describe()

count      138.000000
mean      1148.449275
std       2037.987067
min          5.000000
25%         49.500000
50%        357.500000
75%       1499.250000
max      13748.000000
Name: count, dtype: float64

In [109]:
# Column dtypes, non-null counts and memory footprint of the fetched frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 166165 entries, 0 to 166164
Data columns (total 13 columns):
 #   Column           Non-Null Count   Dtype 
---  ------           --------------   ----- 
 0   id               166165 non-null  object
 1   boro             166165 non-null  object
 2   zipcode          164679 non-null  object
 3   cuisine          166165 non-null  object
 4   inspection_date  166165 non-null  object
 5   action           166165 non-null  object
 6   violation_code   164983 non-null  object
 7   critical_flag    166165 non-null  object
 8   score            158486 non-null  object
 9   record_date      166165 non-null  object
 10  inspection_type  166165 non-null  object
 11  census_tract     164394 non-null  object
 12  nta              164378 non-null  object
dtypes: object(13)
memory usage: 16.5+ MB


In [116]:
# Re-check column dtypes and non-null counts of the frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 166165 entries, 0 to 166164
Data columns (total 13 columns):
 #   Column           Non-Null Count   Dtype 
---  ------           --------------   ----- 
 0   id               166165 non-null  object
 1   boro             166165 non-null  object
 2   zipcode          164679 non-null  object
 3   cuisine          166165 non-null  object
 4   inspection_date  166165 non-null  object
 5   action           166165 non-null  object
 6   violation_code   164983 non-null  object
 7   critical_flag    166165 non-null  object
 8   score            158486 non-null  object
 9   record_date      166165 non-null  object
 10  inspection_type  166165 non-null  object
 11  census_tract     164394 non-null  object
 12  nta              164378 non-null  object
dtypes: object(13)
memory usage: 16.5+ MB


In [None]:
# df.dropna(inplace = True)