# NYC Restaurant Health Inspection Data Cleaning

## 1. Import Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import requests
import os
import warnings

# Suppress all warnings so they do not clutter the notebook output.
warnings.filterwarnings('ignore')
plt.style.use('seaborn-v0_8-darkgrid')

# Display options: show every column, cap printed rows at 100, and leave
# the display width unset (None).
display_options = {
    'display.max_columns': None,
    'display.max_rows': 100,
    'display.width': None,
}
for option_name, option_value in display_options.items():
    pd.set_option(option_name, option_value)

## 2. Load Dataset

In [None]:
# Make sure the data directory exists. exist_ok=True turns this into a
# no-op when the directory is already present, so no separate existence
# check is needed.
os.makedirs('../data', exist_ok=True)


# Load the dataset. low_memory=False makes pandas parse the file in one
# pass rather than in chunks, which avoids mixed-type column inference.
file_path = '../data/DOHMH_New_York_City_Restaurant_Inspection_Results_20251119.csv'
df = pd.read_csv(file_path, low_memory=False)

# DataFrame.info() writes its summary directly and returns None, so it is
# called on its own; wrapping it in print() would emit a stray "None".
df.info()
print(df.describe())
print(df.isnull().sum())


## 3. Explore Dataset

In [None]:
# Preview the first five rows of the raw data.
df.head(n=5)

## 4. Initial Filtering

Based on the dataset dictionary, we will:
1. **Drop unnecessary columns** not relevant to grade prediction
2. **Remove placeholder inspection dates** (01/01/1900)
3. **Keep only Cycle Inspections** - these are the regular health inspections that result in grades (A/B/C). Other inspection types (Smoke-Free Air Act, Inter-Agency Task Force, etc.) don't produce health grades.

In [None]:
# Work on a copy so the raw DataFrame `df` stays untouched.
df_copy = df.copy()

# Report the shape before anything is removed, so "Original" really is
# the shape of the loaded data.
print(f"Original shape: {df_copy.shape}")

# We need to set all columns to lowercase for consistency, and replace spaces with underscores
df_copy.columns = df_copy.columns.str.lower().str.replace(' ', '_')

# Drop unnecessary columns
drop_columns = ['phone', 'action', 'record_date', 'community_board', 'council_district',
                'census_tract', 'bin', 'bbl', 'nta', 'location', 'latitude', 'longitude']
df_copy = df_copy.drop(columns=drop_columns)
print(f"After dropping unused columns: {df_copy.shape}")

# Remove placeholder inspection dates (still raw strings at this point,
# so the comparison is against the literal text '01/01/1900').
drop_rows = df_copy[df_copy['inspection_date'] == '01/01/1900'].index
df_copy = df_copy.drop(index=drop_rows)
print(f"After removing placeholder dates: {df_copy.shape} (removed {len(drop_rows):,})")

# Keep only Cycle Inspections (the only ones that produce health grades).
# na=False treats rows with a missing inspection type as non-matching.
before_count = len(df_copy)
is_cycle = df_copy['inspection_type'].str.contains('Cycle Inspection', case=False, na=False)
# .copy() makes the result an independent frame, so later column
# assignments do not operate on a slice of the previous DataFrame.
df_copy = df_copy[is_cycle].copy()
print(f"After filtering to Cycle Inspections only: {df_copy.shape} (removed {before_count - len(df_copy):,})")

df_copy.head()

## 5. Converting Data Types

In [None]:
# Convert date columns to datetime
# Let pandas infer the date format automatically
df_copy['inspection_date'] = pd.to_datetime(df_copy['inspection_date'])
# errors='coerce' turns unparseable or missing grade dates into NaT
# instead of raising.
df_copy['grade_date'] = pd.to_datetime(df_copy['grade_date'], errors='coerce')

# Convert ZIPCODE to a 5-character string.
# (assumes the column was parsed as numeric with NaN for missing values)
#   - Int64 drops the float ".0" while keeping missing values.
#   - The nullable "string" dtype keeps missing values as <NA>, so no
#     text placeholder has to be replaced afterwards.
#   - zfill(5) restores leading zeros that were lost when the column was
#     parsed as a number (e.g. 7001 -> "07001").
df_copy['zipcode'] = (
    df_copy['zipcode']
    .astype('Int64')
    .astype('string')
    .str.zfill(5)
)

# Convert CAMIS to string (it's an ID, not a number)
df_copy['camis'] = df_copy['camis'].astype(str)

print("Data types after conversion:")
print(df_copy.dtypes)

## 6. Check Missing Values

In [None]:
# Count null entries in each column and report them with the row total.
missing_per_column = df_copy.isnull().sum()
total_rows = len(df_copy)

print("Missing values by column:")
print(missing_per_column)
print(f"\nTotal rows: {total_rows:,}")

# Note: Some missing grades are expected for initial inspections that haven't been graded yet

## 7. Data Validation and Cleaning

In [None]:
# Trim leading/trailing whitespace from text columns.
# Column names are written with underscores because the headers were
# lowercased and had spaces replaced by underscores earlier in the notebook;
# names containing spaces would never match and would be skipped silently
# by the membership check below.
text_cols = ['dba', 'street', 'building', 'cuisine_description', 'violation_description']
for col in text_cols:
    # Skip any column that is not present in this DataFrame.
    if col in df_copy.columns:
        df_copy[col] = df_copy[col].str.strip()

print("\nData cleaning complete!")
print(f"Current shape: {df_copy.shape}")

## 8. Check for Duplicates

In [None]:
# Count rows that exactly repeat an earlier row.
duplicates = df_copy.duplicated().sum()
print(f"Number of duplicate rows: {duplicates:,}")

if duplicates == 0:
    print("No duplicates found.")
else:
    # Drop the repeats; drop_duplicates() keeps the first occurrence.
    before_count = len(df_copy)
    df_copy = df_copy.drop_duplicates()
    removed = before_count - len(df_copy)
    print(f"Duplicates removed: {removed:,}")
    print(f"Final shape: {df_copy.shape}")

In [None]:
# Summarise the key categorical columns and the inspection date range.
section_rule = "\n" + "="*60

print("INSPECTION TYPE DISTRIBUTION:")
print(df_copy['inspection_type'].value_counts())

print(section_rule)
print("GRADE DISTRIBUTION:")
grade_counts = df_copy['grade'].value_counts().sort_index()
print(grade_counts)
# Number of rows without a grade, and their share of all rows.
grade_missing = df_copy['grade'].isna().sum()
grade_missing_pct = grade_missing / len(df_copy) * 100
print(f"\nGrade missing: {grade_missing:,} ({grade_missing_pct:.1f}%)")

print(section_rule)
print("DATE RANGE:")
inspection_dates = df_copy['inspection_date']
print(f"Earliest inspection: {inspection_dates.min()}")
print(f"Latest inspection: {inspection_dates.max()}")

print(section_rule)
print("TOP 10 CUISINES:")
cuisine_counts = df_copy['cuisine_description'].value_counts()
print(cuisine_counts.head(10))

print(section_rule)
print("BOROUGH DISTRIBUTION:")
print(df_copy['boro'].value_counts())

In [None]:
# Analyze key distributions
# NOTE(review): this cell prints the same distributions as the preceding
# cell, and df_copy is not modified in between, so the output appears
# twice — consider deleting one of the two cells.
print("INSPECTION TYPE DISTRIBUTION:")
print(df_copy['inspection_type'].value_counts())

print("\n" + "="*60)
print("GRADE DISTRIBUTION:")
# Grade counts ordered by grade label rather than by frequency.
grade_counts = df_copy['grade'].value_counts().sort_index()
print(grade_counts)
print(f"\nGrade missing: {df_copy['grade'].isna().sum():,} ({df_copy['grade'].isna().sum()/len(df_copy)*100:.1f}%)")

print("\n" + "="*60)
print("DATE RANGE:")
print(f"Earliest inspection: {df_copy['inspection_date'].min()}")
print(f"Latest inspection: {df_copy['inspection_date'].max()}")

print("\n" + "="*60)
print("TOP 10 CUISINES:")
print(df_copy['cuisine_description'].value_counts().head(10))

print("\n" + "="*60)
print("BOROUGH DISTRIBUTION:")
print(df_copy['boro'].value_counts())

## 10. Export Cleaned Data

In [None]:
# Final summary of the cleaned DataFrame before it is written to disk.
print("=" * 60)
print("FINAL CLEANED DATASET")
print("=" * 60)
print(f"Shape: {df_copy.shape}")
print(f"Columns: {list(df_copy.columns)}")
print("\nMissing values:")
# Show only the columns that still contain nulls.
missing = df_copy.isnull().sum()
print(missing[missing > 0])

print("\nSample:")
print(df_copy.head(3))

# Export to CSV; index=False leaves the DataFrame index out of the file.
output_path = '../data/cleaned_restaurant_inspections.csv'
df_copy.to_csv(output_path, index=False)
# The check mark was previously mis-encoded (UTF-8 bytes read as cp1252).
print(f"\n✓ Exported to: {output_path}")

df_copy.head()