# NYC Restaurant Health Inspection Data Cleaning

## 1. Import Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import requests
import os
import warnings

warnings.filterwarnings('ignore')
plt.style.use('seaborn-v0_8-darkgrid')

# Set display options
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)
pd.set_option('display.width', None)

## 2. Load Dataset

In [2]:
# Make directory for data if it doesn't exist
if not os.path.exists('../data'):
    os.makedirs('../data')

URL = r"https://data.cityofnewyork.us/resource/43nn-pn8j.csv"

# Download the dataset

response = requests.get(URL)
open('../data/DOHMH_New_York_City_Restaurant_Inspection_Results.csv', 'wb').write(response.content)

# Load the dataset
file_path = '../data/DOHMH_New_York_City_Restaurant_Inspection_Results.csv'
df = pd.read_csv(file_path, low_memory=False)

print(df.info())
print(df.describe())
print(df.isnull().sum())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 27 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   camis                  1000 non-null   int64  
 1   dba                    999 non-null    object 
 2   boro                   1000 non-null   object 
 3   building               972 non-null    object 
 4   street                 997 non-null    object 
 5   zipcode                908 non-null    float64
 6   phone                  1000 non-null   object 
 7   cuisine_description    203 non-null    object 
 8   inspection_date        1000 non-null   object 
 9   action                 203 non-null    object 
 10  violation_code         145 non-null    object 
 11  violation_description  145 non-null    object 
 12  critical_flag          1000 non-null   object 
 13  score                  184 non-null    float64
 14  grade                  112 non-null    object 
 15  grade

# 3. Explore Dataset

In [3]:
df.head()

Unnamed: 0,camis,dba,boro,building,street,zipcode,phone,cuisine_description,inspection_date,action,violation_code,violation_description,critical_flag,score,grade,grade_date,record_date,inspection_type,latitude,longitude,community_board,council_district,census_tract,bin,bbl,nta,location
0,50167878,GOLDEN STEAMER I INC.,Manhattan,143,MOTT STREET,10013.0,6465231688,,1900-01-01T00:00:00.000,,,,Not Applicable,,,,2025-11-07T06:00:18.000,,40.718681,-73.996645,102.0,1.0,4100.0,1079581.0,1002370000.0,MN24,POINT (-73.996645049413 40.718681310365)
1,50168599,THAI FLAVOR 88 INC.,Manhattan,174,2 AVENUE,10003.0,2122542868,,1900-01-01T00:00:00.000,,,,Not Applicable,,,,2025-11-07T06:00:18.000,,40.730464,-73.986296,103.0,2.0,4000.0,1077704.0,1004530000.0,MN22,POINT (-73.986296382711 40.730463823842)
2,50162584,COZY TEA LOFT,0,141,STATE ROUTE 27,8820.0,3472619435,,1900-01-01T00:00:00.000,,,,Not Applicable,,,,2025-11-07T06:00:18.000,,,,,,,,,,
3,50174672,EL PALENQUE MEXICAN RESTAURANT CORPORATION,Brooklyn,181,WEST END AVENUE,11235.0,7182553580,,1900-01-01T00:00:00.000,,,,Not Applicable,,,,2025-11-07T06:00:18.000,,40.57734,-73.952961,315.0,48.0,62000.0,3245985.0,3087320000.0,BK17,POINT (-73.952961276652 40.577340234075)
4,50155679,ZADDY'S JERK CHICKEN,Brooklyn,686,HEGEMAN AVENUE,11207.0,7187752616,,1900-01-01T00:00:00.000,,,,Not Applicable,,,,2025-11-07T06:00:18.000,,40.66208,-73.886624,305.0,42.0,110400.0,3097445.0,3043290000.0,BK82,POINT (-73.886623536611 40.662080196538)


# 4. Initial Filtering

Based on the dataset dictionary, we will:
1. **Drop unnecessary columns** not relevant to grade prediction
2. **Remove placeholder inspection dates** (01/01/1900)
3. **Keep only Cycle Inspections** - these are the regular health inspections that result in grades (A/B/C). Other inspection types (Smoke-Free Air Act, Inter-Agency Task Force, etc.) don't produce health grades.

In [4]:
df_copy = df.copy()

# Drop unnecessary columns
drop_columns = ['phone', 'action', 'record_date', 'community_board', 'council_district', 
                'census_tract', 'bin', 'bbl', 'nta', 'location', 'latitude', 'longitude']
df_copy = df_copy.drop(columns=drop_columns)

print(f"Original shape: {df_copy.shape}")

# Remove placeholder inspection dates
drop_rows = df_copy[df_copy['inspection_date'] == '01/01/1900'].index
df_copy = df_copy.drop(index=drop_rows)
print(f"After removing placeholder dates: {df_copy.shape} (removed {len(drop_rows):,})")

# Keep only Cycle Inspections (the only ones that produce health grades)
before_count = len(df_copy)
df_copy = df_copy[df_copy['inspection_type'].str.contains('Cycle Inspection', case=False, na=False)]
print(f"After filtering to Cycle Inspections only: {df_copy.shape} (removed {before_count - len(df_copy):,})")

df_copy.head()

Original shape: (1000, 15)
After removing placeholder dates: (1000, 15) (removed 0)
After filtering to Cycle Inspections only: (146, 15) (removed 854)


Unnamed: 0,camis,dba,boro,building,street,zipcode,cuisine_description,inspection_date,violation_code,violation_description,critical_flag,score,grade,grade_date,inspection_type
12,50070543,CLARO,Manhattan,284,3 AVENUE,10010.0,Mexican,2022-09-23T00:00:00.000,02B,Hot TCS food item not held at or above 140 °F.,Critical,7.0,A,2022-09-23T00:00:00.000,Cycle Inspection / Re-inspection
19,41708524,REICHENBACH HALL,Manhattan,5,WEST 37 STREET,10018.0,German,2024-05-09T00:00:00.000,09B,Thawing procedure improper.,Not Critical,10.0,A,2024-05-09T00:00:00.000,Cycle Inspection / Re-inspection
22,41086967,IL SOLE,Manhattan,229233,DYCKMAN STREET,,Italian,2024-01-04T00:00:00.000,02B,Hot TCS food item not held at or above 140 °F.,Critical,12.0,A,2024-01-04T00:00:00.000,Cycle Inspection / Initial Inspection
33,50070454,Sushi Q,Bronx,1610,CROSBY AVENUE,10461.0,Japanese,2024-06-24T00:00:00.000,,,Not Applicable,0.0,P,2024-06-24T00:00:00.000,Cycle Inspection / Reopening Inspection
38,41486460,RIVER DELI,Brooklyn,2834,COLUMBIA PLACE,,Italian,2024-06-20T00:00:00.000,04L,Evidence of mice or live mice in establishment...,Critical,9.0,,,Cycle Inspection / Initial Inspection


# 5. Converting Data Types

In [5]:
# Convert date columns to datetime
df_copy['inspection_date'] = pd.to_datetime(df_copy['inspection_date'], format='ISO8601')
df_copy['grade_date'] = pd.to_datetime(df_copy['grade_date'], format='ISO8601', errors='coerce')

# Convert ZIPCODE from float to string (preserve leading zeros)
df_copy['zipcode'] = df_copy['zipcode'].astype('Int64').astype(str).replace('<NA>', None)

# Convert CAMIS to string (it's an ID, not a number)
df_copy['camis'] = df_copy['camis'].astype(str)

print("Data types after conversion:")
print(df_copy.dtypes)

Data types after conversion:
camis                            object
dba                              object
boro                             object
building                         object
street                           object
zipcode                          object
cuisine_description              object
inspection_date          datetime64[ns]
violation_code                   object
violation_description            object
critical_flag                    object
score                           float64
grade                            object
grade_date               datetime64[ns]
inspection_type                  object
dtype: object


# 6. Check Missing Values

In [6]:
# Check missing values
print("Missing values by column:")
print(df_copy.isnull().sum())
print(f"\nTotal rows: {len(df_copy):,}")

# Note: Some missing grades are expected for initial inspections that haven't been graded yet

Missing values by column:
camis                     0
dba                       0
boro                      0
building                  7
street                    0
zipcode                  54
cuisine_description       0
inspection_date           0
violation_code           22
violation_description    22
critical_flag             0
score                     0
grade                    51
grade_date               51
inspection_type           0
dtype: int64

Total rows: 146


# 7. Data Validation and Cleaning

In [7]:
# Trim whitespace from text columns
text_cols = ['dba', 'street', 'building', 'cuisine description', 'violation description']
for col in text_cols:
    if col in df_copy.columns:
        df_copy[col] = df_copy[col].str.strip()

print("\nData cleaning complete!")
print(f"Current shape: {df_copy.shape}")


Data cleaning complete!
Current shape: (146, 15)


# 8. Check for Duplicates

In [8]:
# Check for duplicate rows
duplicates = df_copy.duplicated().sum()
print(f"Number of duplicate rows: {duplicates:,}")

if duplicates > 0:
    # Remove duplicates, keeping the first occurrence
    before_count = len(df_copy)
    df_copy = df_copy.drop_duplicates()
    print(f"Duplicates removed: {before_count - len(df_copy):,}")
    print(f"Final shape: {df_copy.shape}")
else:
    print("No duplicates found.")

Number of duplicate rows: 0
No duplicates found.


In [9]:
# Analyze key distributions
print("INSPECTION TYPE DISTRIBUTION:")
print(df_copy['inspection_type'].value_counts())

print("\n" + "="*60)
print("GRADE DISTRIBUTION:")
grade_counts = df_copy['grade'].value_counts().sort_index()
print(grade_counts)
print(f"\nGrade missing: {df_copy['grade'].isna().sum():,} ({df_copy['grade'].isna().sum()/len(df_copy)*100:.1f}%)")

print("\n" + "="*60)
print("DATE RANGE:")
print(f"Earliest inspection: {df_copy['inspection_date'].min()}")
print(f"Latest inspection: {df_copy['inspection_date'].max()}")

print("\n" + "="*60)
print("TOP 10 CUISINES:")
print(df_copy['cuisine_description'].value_counts().head(10))

print("\n" + "="*60)
print("BOROUGH DISTRIBUTION:")
print(df_copy['boro'].value_counts())

INSPECTION TYPE DISTRIBUTION:
inspection_type
Cycle Inspection / Initial Inspection       84
Cycle Inspection / Re-inspection            50
Cycle Inspection / Reopening Inspection     11
Cycle Inspection / Compliance Inspection     1
Name: count, dtype: int64

GRADE DISTRIBUTION:
grade
A    63
B     9
C    18
P     4
Z     1
Name: count, dtype: int64

Grade missing: 51 (34.9%)

DATE RANGE:
Earliest inspection: 2016-06-15 00:00:00
Latest inspection: 2025-11-01 00:00:00

TOP 10 CUISINES:
cuisine_description
American          24
Pizza             13
Chinese           12
Latin American    10
Japanese          10
Italian            8
Caribbean          6
Mexican            4
Middle Eastern     4
African            4
Name: count, dtype: int64

BOROUGH DISTRIBUTION:
boro
Manhattan        61
Queens           35
Brooklyn         30
Bronx            14
Staten Island     6
Name: count, dtype: int64


In [10]:
# Analyze key distributions
print("INSPECTION TYPE DISTRIBUTION:")
print(df_copy['inspection_type'].value_counts())

print("\n" + "="*60)
print("GRADE DISTRIBUTION:")
grade_counts = df_copy['grade'].value_counts().sort_index()
print(grade_counts)
print(f"\nGrade missing: {df_copy['grade'].isna().sum():,} ({df_copy['grade'].isna().sum()/len(df_copy)*100:.1f}%)")

print("\n" + "="*60)
print("DATE RANGE:")
print(f"Earliest inspection: {df_copy['inspection_date'].min()}")
print(f"Latest inspection: {df_copy['inspection_date'].max()}")

print("\n" + "="*60)
print("TOP 10 CUISINES:")
print(df_copy['cuisine_description'].value_counts().head(10))

print("\n" + "="*60)
print("BOROUGH DISTRIBUTION:")
print(df_copy['boro'].value_counts())

INSPECTION TYPE DISTRIBUTION:
inspection_type
Cycle Inspection / Initial Inspection       84
Cycle Inspection / Re-inspection            50
Cycle Inspection / Reopening Inspection     11
Cycle Inspection / Compliance Inspection     1
Name: count, dtype: int64

GRADE DISTRIBUTION:
grade
A    63
B     9
C    18
P     4
Z     1
Name: count, dtype: int64

Grade missing: 51 (34.9%)

DATE RANGE:
Earliest inspection: 2016-06-15 00:00:00
Latest inspection: 2025-11-01 00:00:00

TOP 10 CUISINES:
cuisine_description
American          24
Pizza             13
Chinese           12
Latin American    10
Japanese          10
Italian            8
Caribbean          6
Mexican            4
Middle Eastern     4
African            4
Name: count, dtype: int64

BOROUGH DISTRIBUTION:
boro
Manhattan        61
Queens           35
Brooklyn         30
Bronx            14
Staten Island     6
Name: count, dtype: int64


# 10. Export Cleaned Data

In [11]:
# Final summary
print("=" * 60)
print("FINAL CLEANED DATASET")
print("=" * 60)
print(f"Shape: {df_copy.shape}")
print(f"Columns: {list(df_copy.columns)}")
print(f"\nMissing values:")
missing = df_copy.isnull().sum()
print(missing[missing > 0])

print("\nSample:")
print(df_copy.head(3))

# Export to CSV
output_path = '../data/cleaned_restaurant_inspections.csv'
df_copy.to_csv(output_path, index=False)
print(f"\n✓ Exported to: {output_path}")

df_copy.head()

FINAL CLEANED DATASET
Shape: (146, 15)
Columns: ['camis', 'dba', 'boro', 'building', 'street', 'zipcode', 'cuisine_description', 'inspection_date', 'violation_code', 'violation_description', 'critical_flag', 'score', 'grade', 'grade_date', 'inspection_type']

Missing values:
building                  7
zipcode                  54
violation_code           22
violation_description    22
grade                    51
grade_date               51
dtype: int64

Sample:
       camis               dba       boro building            street zipcode  \
12  50070543             CLARO  Manhattan      284          3 AVENUE   10010   
19  41708524  REICHENBACH HALL  Manhattan        5  WEST   37 STREET   10018   
22  41086967           IL SOLE  Manhattan   229233    DYCKMAN STREET    None   

   cuisine_description inspection_date violation_code  \
12             Mexican      2022-09-23            02B   
19              German      2024-05-09            09B   
22             Italian      2024-01-04    

Unnamed: 0,camis,dba,boro,building,street,zipcode,cuisine_description,inspection_date,violation_code,violation_description,critical_flag,score,grade,grade_date,inspection_type
12,50070543,CLARO,Manhattan,284,3 AVENUE,10010.0,Mexican,2022-09-23,02B,Hot TCS food item not held at or above 140 °F.,Critical,7.0,A,2022-09-23,Cycle Inspection / Re-inspection
19,41708524,REICHENBACH HALL,Manhattan,5,WEST 37 STREET,10018.0,German,2024-05-09,09B,Thawing procedure improper.,Not Critical,10.0,A,2024-05-09,Cycle Inspection / Re-inspection
22,41086967,IL SOLE,Manhattan,229233,DYCKMAN STREET,,Italian,2024-01-04,02B,Hot TCS food item not held at or above 140 °F.,Critical,12.0,A,2024-01-04,Cycle Inspection / Initial Inspection
33,50070454,Sushi Q,Bronx,1610,CROSBY AVENUE,10461.0,Japanese,2024-06-24,,,Not Applicable,0.0,P,2024-06-24,Cycle Inspection / Reopening Inspection
38,41486460,RIVER DELI,Brooklyn,2834,COLUMBIA PLACE,,Italian,2024-06-20,04L,Evidence of mice or live mice in establishment...,Critical,9.0,,NaT,Cycle Inspection / Initial Inspection
