In [2]:
# Required Packages
import pandas as pd
from datetime import datetime
import numpy as np

import warnings
warnings.filterwarnings("ignore")

#### Table of Contents

* [Los Angeles County Restaurant and Market Violations and Inspections](#Los-Angeles-County-Restaurant-and-Market-Violations-and-Inspections)
    * [Violations Dataset](#Violations-Dataset)
    * [Inspection Result Dataset](#Inspection-Result-Dataset)
* [Preprocessing](#Preprocessing)
    * [Inspection Dataset](#Inspection-Dataset)
    * [Creating a new Dataset](#Creating-a-new-Dataset)
        * [Reindexing](#Reindexing)
    * [Creating a Dictionary](#Creating-a-Dictionary)

# Los Angeles County Restaurant and Market Violations and Inspections


In this article, we use a dataset contains Environmental Health Violations for Restaurants and Markets in Los Angeles County. This dataset can be found [here](https://data.lacounty.gov/Health/LOS-ANGELES-COUNTY-RESTAURANT-AND-MARKET-VIOLATION/8jyd-4pv9) and [here](https://data.lacounty.gov/Health/LOS-ANGELES-COUNTY-RESTAURANT-AND-MARKET-INSPECTIO/6ni6-h5kp).

Los Angeles County Environmental Health is responsible for checking food violations for all unincorporated areas and 85 of the 88 cities in the County. This dataset does not include Pasadena, Long Beach or Vernon (each has its own city health department).

## Violations Dataset

Each row represents one health code violation. All rows with the same Serial Number are part of the same inspection. The Serial Number is the primary key to review the inspection grade (Inspection Result dataset)

In [3]:
Violations_df = pd.read_csv('Data/LOS_ANGELES_COUNTY_RESTAURANT_AND_MARKET_VIOLATIONS.csv')
Violations_df.head().style.hide_index()

SERIAL NUMBER,VIOLATION STATUS,VIOLATION CODE,VIOLATION DESCRIPTION,POINTS
DAN0UK2OK,OUT OF COMPLIANCE,F043,# 43. Premises; personal/cleaning items; vermin-proofing,1
DAN0UK2OK,OUT OF COMPLIANCE,F053,# 21a. Hot Water Available,2
DAN0UK2OK,OUT OF COMPLIANCE,F044,"# 44. Floors, walls and ceilings: properly built, maintained in good repair and clean",1
DAN0UK2OK,OUT OF COMPLIANCE,F038,# 38. Thermometers provided and accurate,1
DA0817361,OUT OF COMPLIANCE,F038,# 38. Thermometers provided and accurate,1


## Inspection Result Dataset

Each row represents one inspection result. The Serial Number is the primary key to access/review the violations (Violations Dataset) associated with a particular inspection.

In [4]:
Inspections_df = pd.read_csv('Data/LOS_ANGELES_COUNTY_RESTAURANT_AND_MARKET_INSPECTIONS.csv')
Inspections_df.head().style.hide_index()

ACTIVITY DATE,OWNER ID,OWNER NAME,FACILITY ID,FACILITY NAME,RECORD ID,PROGRAM NAME,PROGRAM STATUS,PROGRAM ELEMENT (PE),PE DESCRIPTION,FACILITY ADDRESS,FACILITY CITY,FACILITY STATE,FACILITY ZIP,SERVICE CODE,SERVICE DESCRIPTION,SCORE,GRADE,SERIAL NUMBER,EMPLOYEE ID,Location
04/12/2017,OW0003351,AUDREY KIM BRIDAL COLLECTION,FA0010275,BOBA YA,PR0029878,BOBA YA,ACTIVE,1631,RESTAURANT (0-30) SEATS MODERATE RISK,14748 S BEACH BLVD,LA MIRADA,CA,90638,1,ROUTINE INSPECTION,95,A,DAH2Z9080,EE0000744,
04/12/2017,OW0181132,THAI CITY DUARTE INC.,FA0174471,THAI CITY RESTAURANT,PR0167479,THAI CITY RESTAURANT,ACTIVE,1635,RESTAURANT (31-60) SEATS HIGH RISK,2414 E HUNTINGTON DR,DUARTE,CA,91010,1,ROUTINE INSPECTION,90,A,DACH7Y18F,EE0000120,
04/13/2017,OW0125502,SWEE KOK CHER,FA0160544,SUSHI FIRE,PR0148634,SUSHI FIRE,ACTIVE,1635,RESTAURANT (31-60) SEATS HIGH RISK,1723 E HUNTINGTON DR,DUARTE,CA,91010,1,ROUTINE INSPECTION,90,A,DAEOLNF0Z,EE0000120,
04/13/2017,OW0010326,DOS MONTANAS LLC,FA0012454,TIPSY WINGS,PR0018288,TIPSY WINGS,INACTIVE,1638,RESTAURANT (61-150) SEATS HIGH RISK,11672 RAMONA BLVD,EL MONTE,CA,91732,1,ROUTINE INSPECTION,95,A,DABTUPVHJ,EE0000188,POINT (-118.020082 34.07441)
04/19/2017,OW0107342,MARIELENA RODRIGUEZ,FA0144532,ARI'S SHOP,PR0130618,ARI'S SHOP,ACTIVE,1610,"FOOD MKT RETAIL (1-1,999 SF) LOW RISK",OLVERA ST E3,LOS ANGELES,CA,90012,1,ROUTINE INSPECTION,100,A,DAL4G6OJH,EE0001058,


# Preprocessing

## Inspection Dataset

Converting the activty date into datetime format.

In [5]:
Inspections_df['ACTIVITY DATE'] = pd.to_datetime(Inspections_df['ACTIVITY DATE'], format='%m/%d/%Y')
Inspections_df.head().sort_values('ACTIVITY DATE', ascending=False)

Unnamed: 0,ACTIVITY DATE,OWNER ID,OWNER NAME,FACILITY ID,FACILITY NAME,RECORD ID,PROGRAM NAME,PROGRAM STATUS,PROGRAM ELEMENT (PE),PE DESCRIPTION,...,FACILITY CITY,FACILITY STATE,FACILITY ZIP,SERVICE CODE,SERVICE DESCRIPTION,SCORE,GRADE,SERIAL NUMBER,EMPLOYEE ID,Location
4,2017-04-19,OW0107342,MARIELENA RODRIGUEZ,FA0144532,ARI'S SHOP,PR0130618,ARI'S SHOP,ACTIVE,1610,"FOOD MKT RETAIL (1-1,999 SF) LOW RISK",...,LOS ANGELES,CA,90012,1,ROUTINE INSPECTION,100,A,DAL4G6OJH,EE0001058,
2,2017-04-13,OW0125502,SWEE KOK CHER,FA0160544,SUSHI FIRE,PR0148634,SUSHI FIRE,ACTIVE,1635,RESTAURANT (31-60) SEATS HIGH RISK,...,DUARTE,CA,91010,1,ROUTINE INSPECTION,90,A,DAEOLNF0Z,EE0000120,
3,2017-04-13,OW0010326,DOS MONTANAS LLC,FA0012454,TIPSY WINGS,PR0018288,TIPSY WINGS,INACTIVE,1638,RESTAURANT (61-150) SEATS HIGH RISK,...,EL MONTE,CA,91732,1,ROUTINE INSPECTION,95,A,DABTUPVHJ,EE0000188,POINT (-118.020082 34.07441)
0,2017-04-12,OW0003351,AUDREY KIM BRIDAL COLLECTION,FA0010275,BOBA YA,PR0029878,BOBA YA,ACTIVE,1631,RESTAURANT (0-30) SEATS MODERATE RISK,...,LA MIRADA,CA,90638,1,ROUTINE INSPECTION,95,A,DAH2Z9080,EE0000744,
1,2017-04-12,OW0181132,THAI CITY DUARTE INC.,FA0174471,THAI CITY RESTAURANT,PR0167479,THAI CITY RESTAURANT,ACTIVE,1635,RESTAURANT (31-60) SEATS HIGH RISK,...,DUARTE,CA,91010,1,ROUTINE INSPECTION,90,A,DACH7Y18F,EE0000120,


Moreover, note that

In [6]:
Inspections_df['FACILITY ZIP'].unique()

array(['90638', '91010', '91732', ..., '91601-2437', '90012-2314',
       '90650-3506'], dtype=object)

We would like to keep only the first five digits of each zip code (right before the dashed line). Thus,

In [7]:
Inspections_df['FACILITY ZIP'] = Inspections_df['FACILITY ZIP'].apply(lambda x: x[:5])

## Creating a new Dataset

Moreover, note that the two databases only have a mutual column

In [8]:
list(Inspections_df.columns & Violations_df.columns)

['SERIAL NUMBER']

In [9]:
Data=pd.merge(Inspections_df, Violations_df, on='SERIAL NUMBER', how='right')
Data.head().style.hide_index()

ACTIVITY DATE,OWNER ID,OWNER NAME,FACILITY ID,FACILITY NAME,RECORD ID,PROGRAM NAME,PROGRAM STATUS,PROGRAM ELEMENT (PE),PE DESCRIPTION,FACILITY ADDRESS,FACILITY CITY,FACILITY STATE,FACILITY ZIP,SERVICE CODE,SERVICE DESCRIPTION,SCORE,GRADE,SERIAL NUMBER,EMPLOYEE ID,Location,VIOLATION STATUS,VIOLATION CODE,VIOLATION DESCRIPTION,POINTS
2017-04-12 00:00:00,OW0003351,AUDREY KIM BRIDAL COLLECTION,FA0010275,BOBA YA,PR0029878,BOBA YA,ACTIVE,1631,RESTAURANT (0-30) SEATS MODERATE RISK,14748 S BEACH BLVD,LA MIRADA,CA,90638,1,ROUTINE INSPECTION,95,A,DAH2Z9080,EE0000744,,OUT OF COMPLIANCE,F033,# 33. Nonfood-contact surfaces clean and in good repair,1
2017-04-12 00:00:00,OW0003351,AUDREY KIM BRIDAL COLLECTION,FA0010275,BOBA YA,PR0029878,BOBA YA,ACTIVE,1631,RESTAURANT (0-30) SEATS MODERATE RISK,14748 S BEACH BLVD,LA MIRADA,CA,90638,1,ROUTINE INSPECTION,95,A,DAH2Z9080,EE0000744,,OUT OF COMPLIANCE,F044,"# 44. Floors, walls and ceilings: properly built, maintained in good repair and clean",1
2017-04-12 00:00:00,OW0003351,AUDREY KIM BRIDAL COLLECTION,FA0010275,BOBA YA,PR0029878,BOBA YA,ACTIVE,1631,RESTAURANT (0-30) SEATS MODERATE RISK,14748 S BEACH BLVD,LA MIRADA,CA,90638,1,ROUTINE INSPECTION,95,A,DAH2Z9080,EE0000744,,OUT OF COMPLIANCE,F036,"# 36. Equipment, utensils and linens: storage and use",1
2017-04-12 00:00:00,OW0003351,AUDREY KIM BRIDAL COLLECTION,FA0010275,BOBA YA,PR0029878,BOBA YA,ACTIVE,1631,RESTAURANT (0-30) SEATS MODERATE RISK,14748 S BEACH BLVD,LA MIRADA,CA,90638,1,ROUTINE INSPECTION,95,A,DAH2Z9080,EE0000744,,OUT OF COMPLIANCE,F040,"# 40. Plumbing: Plumbing in good repair, proper backflow devices",1
2017-04-12 00:00:00,OW0003351,AUDREY KIM BRIDAL COLLECTION,FA0010275,BOBA YA,PR0029878,BOBA YA,ACTIVE,1631,RESTAURANT (0-30) SEATS MODERATE RISK,14748 S BEACH BLVD,LA MIRADA,CA,90638,1,ROUTINE INSPECTION,95,A,DAH2Z9080,EE0000744,,OUT OF COMPLIANCE,F043,# 43. Premises; personal/cleaning items; vermin-proofing,1


Moreover, note that

In [10]:
Data['PE DESCRIPTION'].unique()

array(['RESTAURANT (0-30) SEATS MODERATE RISK',
       'RESTAURANT (31-60) SEATS HIGH RISK',
       'RESTAURANT (61-150) SEATS HIGH RISK',
       'RESTAURANT (151 + ) SEATS HIGH RISK',
       'FOOD MKT RETAIL (1-1,999 SF) LOW RISK',
       'FOOD MKT RETAIL (2,000+ SF) LOW RISK',
       'RESTAURANT (61-150) SEATS MODERATE RISK',
       'RESTAURANT (0-30) SEATS LOW RISK',
       'RESTAURANT (0-30) SEATS HIGH RISK',
       'RESTAURANT (151 + ) SEATS MODERATE RISK',
       'RESTAURANT (31-60) SEATS MODERATE RISK',
       'FOOD MKT RETAIL (2,000+ SF) HIGH RISK',
       'FOOD MKT RETAIL (1-1,999 SF) HIGH RISK',
       'FOOD MKT RETAIL (2,000+ SF) MODERATE RISK',
       'FOOD MKT RETAIL (1-1,999 SF) MODERATE RISK',
       'RESTAURANT (31-60) SEATS LOW RISK',
       'INTERIM HOUSING FF (2,000-4,999) SQ. FT.',
       'LIC HTH CARE FOOD FAC (10,000+ SF) HIGH RISK',
       'RESTAURANT (151 + ) SEATS LOW RISK',
       'RESTAURANT (61-150) SEATS LOW RISK',
       'INTERIM HOUSING FF (1-1,999) SQ. F

This column can be split into three columns.

In [11]:
temp = Data['PE DESCRIPTION'].str.split("(", n = 2, expand = True)
temp1 = temp.loc[:,1].str.split(")", n = 2, expand = True)

In [12]:
Data['PE_Type']=temp.loc[:,0]
del temp

As for types, we have the following types:

In [13]:
Data['PE_Type'].unique()

array(['RESTAURANT ', 'FOOD MKT RETAIL ', 'INTERIM HOUSING FF ',
       'LIC HTH CARE FOOD FAC ', 'CATERER ',
       'SWAP MEET PREPACKAGED FOOD STAND', 'PRIVATE SCHOOL CAFETERIA',
       'FOOD PROCESSING WHOLESALE ', 'FOOD VEHICLE COMMISSARY ',
       'FOOD WAREHOUSE ', 'FOOD MARKET WHOLESALE', nan], dtype=object)

In [14]:
Map = {'RESTAURANT ':'Restaurant',
       'FOOD MKT RETAIL ':'Food Market Retail',
       'INTERIM HOUSING FF ':'Interim Housing FF',
       'LIC HTH CARE FOOD FAC ':'LIC HTH Care Food FAC',
       'CATERER ':'Caterer',
       'SWAP MEET PREPACKAGED FOOD STAND':'Swap Meet Prepackaged Food Stand',
       'PRIVATE SCHOOL CAFETERIA':'Private School Cafeteria',
       'FOOD PROCESSING WHOLESALE ':'Food Processing Wholesale',
       'FOOD VEHICLE COMMISSARY ':'Food Vehicle Commissary',
       'FOOD WAREHOUSE ':'Food Warehouse',
       'FOOD MARKET WHOLESALE':'Food Market Wholesale'} 
Data['PE_Type'] = Data['PE_Type'].map(Map)

In [15]:
Data['PE_Seats']=temp1.loc[:,0]
Data['PE_Risk']=temp1.loc[:,1]
del temp1

Risk level:

In [16]:
Data['PE_Risk'].unique()

array([' SEATS MODERATE RISK', ' SEATS HIGH RISK', ' LOW RISK',
       ' SEATS LOW RISK', ' HIGH RISK', ' MODERATE RISK', ' SQ. FT.',
       'HIGH RISK', '', None, ' MOD RISK', nan], dtype=object)

In [17]:
Map = {' SEATS MODERATE RISK':'Moderate Risk',
       ' SEATS HIGH RISK':'High Risk',
       ' LOW RISK':'Low Risk',
       ' SEATS LOW RISK':'Low Risk',
       ' HIGH RISK':'High Risk',
       ' MODERATE RISK':'Moderate Risk', 
       ' SQ. FT.':None,
       'HIGH RISK':'High Risk',
       '':None,
       None:None,
       ' MOD RISK':'Moderate Risk'}
Data['PE_Risk'] = Data['PE_Risk'].map(Map)

In [18]:
Data['PE_Seats'].unique()

array(['0-30', '31-60', '61-150', '151 + ', '1-1,999 SF', '2,000+ SF',
       '2,000-4,999', '10,000+ SF', '1-1,999', '4000-9999 SF',
       '1,000-1,999 SQ. FT.', None, '2,000-5,999 SF', '0-999 SQ. FT.',
       '1-1,999 SQ. FT.', '2000-3999 SF', '0-1999 SF', '0-10',
       '6,000 + SQ. FT.', '1-4,999', nan], dtype=object)

In [19]:
Map = {'0-30':'0-30 SQ. FT.', '31-60':'31-60 SQ. FT.', '61-150 SQ. FT.':'61-150 SQ. FT.',
       '151 + ':'151+ SQ. FT.', '1-1,999 SF':'1-1999 SQ. FT.', '2,000+ SF':'2000+ SQ. FT.',
       '2,000-4,999':'2000-4999 SQ. FT.', '10,000+ SF':'10000+ SQ. FT.', '1-1,999':'1-1999 SQ. FT.',
       '4000-9999 SF':'4000-9999 SQ. FT.','1,000-1,999 SQ. FT.':'1000-1999 SQ. FT.',np.nan:'Other',
       None:'Other', '2,000-5,999 SF':'2000-5999 SQ. FT.', '0-999 SQ. FT.':'0-999 SQ. FT.',
       '1-1,999 SQ. FT.':'1-1999 SQ. FT.','2000-3999 SF':'2000-3999 SQ. FT.', '0-1999 SF':'0-1999 SQ. FT.',
       '0-10':'0-10 SQ. FT.','6,000 + SQ. FT.':'6000+ SQ. FT.', '1-4,999':'1-4999 SQ. FT.'}
Data['PE_Seats'] = Data['PE_Seats'].map(Map)

Droping **PE DESCRIPTION** column.

In [20]:
Data = Data.drop(columns='PE DESCRIPTION')

In [21]:
Data.head()[['PE_Type','PE_Seats','PE_Risk']].style.hide_index()

PE_Type,PE_Seats,PE_Risk
Restaurant,0-30 SQ. FT.,Moderate Risk
Restaurant,0-30 SQ. FT.,Moderate Risk
Restaurant,0-30 SQ. FT.,Moderate Risk
Restaurant,0-30 SQ. FT.,Moderate Risk
Restaurant,0-30 SQ. FT.,Moderate Risk


### Reindexing

In [22]:
mylist = Data.columns.tolist()
mylist = [x.title() for x in mylist]
mylist = [x.replace('Id','ID') for x in mylist]
mylist = [x.replace('Pe','PE') for x in mylist]
mylist = [x.replace('  ',' ') for x in mylist]
temp = pd.DataFrame(Data.values, columns=mylist)
Data=temp.copy()
del temp, mylist

In [23]:
Data.head().style.hide_index()

Activity Date,Owner ID,Owner Name,Facility ID,Facility Name,Record ID,Program Name,Program Status,Program Element (PE),Facility Address,Facility City,Facility State,Facility Zip,Service Code,Service Description,Score,Grade,Serial Number,Employee ID,Location,Violation Status,Violation Code,Violation Description,Points,PE_Type,PE_Seats,PE_Risk
2017-04-12 00:00:00,OW0003351,AUDREY KIM BRIDAL COLLECTION,FA0010275,BOBA YA,PR0029878,BOBA YA,ACTIVE,1631,14748 S BEACH BLVD,LA MIRADA,CA,90638,1,ROUTINE INSPECTION,95,A,DAH2Z9080,EE0000744,,OUT OF COMPLIANCE,F033,# 33. Nonfood-contact surfaces clean and in good repair,1,Restaurant,0-30 SQ. FT.,Moderate Risk
2017-04-12 00:00:00,OW0003351,AUDREY KIM BRIDAL COLLECTION,FA0010275,BOBA YA,PR0029878,BOBA YA,ACTIVE,1631,14748 S BEACH BLVD,LA MIRADA,CA,90638,1,ROUTINE INSPECTION,95,A,DAH2Z9080,EE0000744,,OUT OF COMPLIANCE,F044,"# 44. Floors, walls and ceilings: properly built, maintained in good repair and clean",1,Restaurant,0-30 SQ. FT.,Moderate Risk
2017-04-12 00:00:00,OW0003351,AUDREY KIM BRIDAL COLLECTION,FA0010275,BOBA YA,PR0029878,BOBA YA,ACTIVE,1631,14748 S BEACH BLVD,LA MIRADA,CA,90638,1,ROUTINE INSPECTION,95,A,DAH2Z9080,EE0000744,,OUT OF COMPLIANCE,F036,"# 36. Equipment, utensils and linens: storage and use",1,Restaurant,0-30 SQ. FT.,Moderate Risk
2017-04-12 00:00:00,OW0003351,AUDREY KIM BRIDAL COLLECTION,FA0010275,BOBA YA,PR0029878,BOBA YA,ACTIVE,1631,14748 S BEACH BLVD,LA MIRADA,CA,90638,1,ROUTINE INSPECTION,95,A,DAH2Z9080,EE0000744,,OUT OF COMPLIANCE,F040,"# 40. Plumbing: Plumbing in good repair, proper backflow devices",1,Restaurant,0-30 SQ. FT.,Moderate Risk
2017-04-12 00:00:00,OW0003351,AUDREY KIM BRIDAL COLLECTION,FA0010275,BOBA YA,PR0029878,BOBA YA,ACTIVE,1631,14748 S BEACH BLVD,LA MIRADA,CA,90638,1,ROUTINE INSPECTION,95,A,DAH2Z9080,EE0000744,,OUT OF COMPLIANCE,F043,# 43. Premises; personal/cleaning items; vermin-proofing,1,Restaurant,0-30 SQ. FT.,Moderate Risk


In [24]:
Data.dtypes

Activity Date            datetime64[ns]
Owner ID                         object
Owner Name                       object
Facility ID                      object
Facility Name                    object
Record ID                        object
Program Name                     object
Program Status                   object
Program Element (PE)             object
Facility Address                 object
Facility City                    object
Facility State                   object
Facility Zip                     object
Service Code                     object
Service Description              object
Score                            object
Grade                            object
Serial Number                    object
Employee ID                      object
Location                         object
Violation Status                 object
Violation Code                   object
Violation Description            object
Points                           object
PE_Type                          object


### Saving

In [25]:
Violations_df.to_csv('Data/Clean_Violations_df.csv',index=False)
Inspections_df.to_csv('Data/Clean_Inspections_df.csv',index=False)
Data.to_csv('Data/Data_Inspections_Violations_df.csv',index=False)

## Creating a Dictionary

In [40]:
Violation_Dictionary = dict.fromkeys(Data['Violation Code'].unique().tolist()) 
temp = Data['Violation Description'].unique().tolist()
for i in Violation_Dictionary:
    for j in temp:
        # Comparing the two lists
        if i[-2:] == j[2:4]:
            Violation_Dictionary[i] = j
del temp

Adding a few more codes:

In [41]:
Violation_Dictionary['F001'] = '# 01a. Demonstration of knowledge'
Violation_Dictionary['F052'] = '# 01b. Food safety certification'
Violation_Dictionary['F053'] = '# 21a. Hot Water Available'
Violation_Dictionary['F054'] = '# 52. Multiple Major Critical Violations / Increased Risk to Public Health'
Violation_Dictionary['F055'] = '# 01a. Demonstration of knowledge'
Violation_Dictionary['F057'] = '# 18. Compliance with variance, specialized process, & HACCP Plan'
Violation_Dictionary['F058'] = '# 19. Consumer advisory provided for raw or undercooked foods'

In [42]:
Violation_Dictionary_df = pd.DataFrame.from_dict(Violation_Dictionary, orient='index', columns=['Violation Description'])
Violation_Dictionary_df = Violation_Dictionary_df.sort_values('Violation Description', ascending=False)
Violation_Dictionary_df = Violation_Dictionary_df.reset_index().rename(columns={'index':'Violation Code'})
Violation_Dictionary_df.head(10)

Unnamed: 0,Violation Code,Violation Description
0,F054,# 52. Multiple Major Critical Violations / Inc...
1,W052,# 52. Multiple Major Critical Violations / Inc...
2,W051,# 51. Permit Suspension
3,F051,# 51. Permit Suspension
4,W050,# 50. Impoundment of unsanitary equipment or food
5,F050,# 50. Impoundment of unsanitary equipment or food
6,F049,# 49. Samples Collected
7,F048,# 48. Plan Review required for new or remodel ...
8,W048,# 48. Plan Review required for new or remodel ...
9,F047,# 47. Permits Available


### Saving

In [43]:
Violation_Dictionary_df.to_csv('Data/Violation_Dictionary_df.csv',index=False)

***