## Project 1: NYC Resteraunt Violations Cleaning Up
### Jacob Minkin

In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import geopandas as gpd
sns.set_style('darkgrid')

%matplotlib inline

## Set-Up

In [8]:
#Import Data
df = pd.read_csv('../data/Inspection.csv')

#Change Name of Column for easier searching
df.rename(columns = {'VIOLATION CODE':'CODE'}, inplace = True)

# Convert date into pandas date/time format
df['date'] = pd.to_datetime(df['INSPECTION DATE'])

# Create Dictionaries for Violation Codes and Resteraunt Names
code_list = df.CODE[df.CODE.isna() == False].unique()
violation_dicts = {}
for name in code_list:
    violation_dicts[name] = df.VIOLATION[df.CODE == name].unique()

Name_list = df.CAMIS[df.CAMIS.isna() == False].unique()
Name_dicts = {}
for name in Name_list:
    Name_dicts[name] = df.DBA[df.CAMIS == name].unique()

#Drop Columns: most of these are empty cols to beging with
df = df.drop(['Zip Codes', 'City Council Districts', 'Police Precincts', 'Location Point', 'Community Districts', 
   'Borough Boundaries',  'GRADE DATE', 'PHONE', 'INSPECTION DATE', 'DBA', 'VIOLATION', 'RECORD DATE'], axis = 1)

# List of CONVERSION to categorical functions. 
to_convert = ['ZIPCODE', 'Community Board']
df[to_convert] = df[to_convert].astype('category')

## Examine Missing Data

In [9]:
df.isnull().sum(axis = 0).sort_values(ascending = False)

GRADE               106150
SCORE                10915
CODE                  5681
BIN                   4561
CUISINE               4358
ACTION                4358
INSPECTION TYPE       4358
NTA                   3468
Community Board       3468
Council District      3468
Census Tract          3468
ZIPCODE               2933
BBL                    561
BUILDING               340
Longitude              298
Latitude               298
STREET                  26
BORO                     0
date                     0
CRITICAL FLAG            0
RECORD DATE              0
CAMIS                    0
dtype: int64

In [34]:
finalVariables = ['CAMIS','CUISINE','STREET', 'ZIPCODE', 'BBL', 'BORO', 'date', 'CRITICAL FLAG']
finalDataframe = df[finalVariables]

In [99]:
gk = df.sort_values(by=['date', 'GRADE'], ascending=False).groupby(by = ['CAMIS'], sort=False).nth(0)
gk.ACTION

CAMIS
50114345                  Establishment re-opened by DOHMH.
41603616                  Establishment re-opened by DOHMH.
50110842    Violations were cited in the following area(s).
50006741    Violations were cited in the following area(s).
50085148    Violations were cited in the following area(s).
                                 ...                       
50125351                                                NaN
50116200                                                NaN
50123266                                                NaN
50119602                                                NaN
50131767                                                NaN
Name: ACTION, Length: 28239, dtype: object

In [84]:
len(df.CAMIS.unique())

28239

In [54]:
df['RECORD DATE'].value_counts()

2/13/2023    209579
Name: RECORD DATE, dtype: int64

In [13]:
df.ACTION.value_counts()

Violations were cited in the following area(s).                                                                                       193989
Establishment Closed by DOHMH. Violations were cited in the following area(s) and those requiring immediate action were addressed.      7936
Establishment re-opened by DOHMH.                                                                                                       1985
No violations were recorded at the time of this inspection.                                                                             1299
Establishment re-closed by DOHMH.                                                                                                         12
Name: ACTION, dtype: int64

In [40]:
Name_dicts

{50107551: array([nan], dtype=object),
 50131697: array([nan], dtype=object),
 50123820: array([nan], dtype=object),
 50119514: array([nan], dtype=object),
 50127198: array([nan], dtype=object),
 50131632: array([nan], dtype=object),
 50121462: array([nan], dtype=object),
 50112539: array([nan], dtype=object),
 50109522: array([nan], dtype=object),
 50129927: array([nan], dtype=object),
 41564948: array(['American'], dtype=object),
 50113426: array([nan], dtype=object),
 50128218: array([nan], dtype=object),
 50104825: array([nan], dtype=object),
 50112870: array([nan], dtype=object),
 50120379: array([nan], dtype=object),
 50124520: array([nan], dtype=object),
 50124855: array([nan], dtype=object),
 50132225: array([nan], dtype=object),
 50116143: array([nan], dtype=object),
 50129141: array([nan], dtype=object),
 50121299: array([nan], dtype=object),
 50118414: array([nan], dtype=object),
 50008108: array(['Bakery Products/Desserts'], dtype=object),
 50108007: array([nan], dtype=obje

In [37]:
def search(values, searchFor):
    saving = {}
    for k in values:
        for v in values[k]:
            if searchFor in v:
                saving[k] = k
    return saving

{}