# NYC Restaurant Health Inspection Data Cleaning

## 1. Import Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import requests
import os
import time
import warnings

# Silence all warnings for the session and pick the plotting theme.
warnings.filterwarnings('ignore')
plt.style.use('seaborn-v0_8-darkgrid')

# Apply the pandas display options used throughout the notebook.
for _option_name, _option_value in (
    ('display.max_columns', None),
    ('display.max_rows', 100),
    ('display.width', None),
):
    pd.set_option(_option_name, _option_value)

## 2. Load Dataset

In [None]:
# -----------------------------------------------------------------------------
# Remote data source and request policy
# -----------------------------------------------------------------------------
API_ENDPOINT = "https://data.cityofnewyork.us/api/v3/views/43nn-pn8j/query.csv"
MAX_RETRIES = 3              # total number of request attempts
RETRY_DELAY_SECONDS = 5      # base delay; doubled after each failed attempt
REQUEST_TIMEOUT_SECONDS = 5 * 60  # 5 min timeout for large CSV download

# -----------------------------------------------------------------------------
# Local cache location
# -----------------------------------------------------------------------------
CACHE_DIR = "../data"
CACHE_FILENAME = "nyc_restaurant_inspections_cached.csv"
CACHE_PATH = os.path.join(CACHE_DIR, CACHE_FILENAME)

In [None]:
def get_app_token(token=None):
    """
    Get NYC Open Data API token.

    Priority order:
    1. Directly passed token parameter
    2. NYC_OPENDATA_APP_TOKEN environment variable
    3. .env file inside CACHE_DIR (for local testing)
        If using a .env file, create a file at ../data/.env with the content:
            NYC_OPENDATA_APP_TOKEN=your_token_here
    4. Raise error if not found

    Args:
        token: Optional token string; returned unchanged when non-empty.

    Returns:
        str: The first non-empty token found, following the priority order.

    Raises:
        ValueError: If no non-empty token is found in any of the sources.
    """
    # 1. Direct token
    if token:
        return token

    # 2. Environment variable
    env_token = os.environ.get('NYC_OPENDATA_APP_TOKEN')
    if env_token:
        return env_token

    # 3. Try .env file for local testing
    env_file = os.path.join(CACHE_DIR, '.env')
    if os.path.isfile(env_file):
        with open(env_file, 'r', encoding='utf-8') as f:
            for line in f:
                key, sep, value = line.strip().partition('=')
                if not sep or key.strip() != 'NYC_OPENDATA_APP_TOKEN':
                    continue
                file_token = value.strip().strip('"').strip("'")
                # An empty assignment is treated as "not set" so a blank
                # token is never handed back to the caller.
                if file_token:
                    return file_token

    raise ValueError(
        "NYC Open Data API token not found.\n\n"
        "Set it using one of these methods:\n"
        "  1. Pass directly: load_restaurant_data(token='your_token')\n"
        "  2. Environment variable: export NYC_OPENDATA_APP_TOKEN='your_token'\n"
        "  3. Create ../data/.env file with: NYC_OPENDATA_APP_TOKEN=your_token"
    )


def fetch_all_restaurant_data(app_token):
    """Fetch all restaurant inspection data from NYC Open Data API as CSV.

    Sends a GET request to ``API_ENDPOINT`` with the token in the
    ``X-App-Token`` header and parses the CSV body into a DataFrame.
    At most ``MAX_RETRIES`` attempts are made, sleeping
    ``RETRY_DELAY_SECONDS * 2 ** attempt`` seconds between them. HTTP 4xx
    responses other than 408 and 429 are raised immediately, because
    repeating the same request (e.g. with a rejected token) cannot succeed.

    Args:
        app_token: NYC Open Data application token.

    Returns:
        pd.DataFrame: The parsed CSV response.

    Raises:
        requests.exceptions.RequestException: If the request fails with a
            non-retryable error or still fails on the last attempt.
        ValueError: If ``MAX_RETRIES`` is less than 1, so no request is made.
    """
    from io import StringIO

    print("Fetching data from NYC Open Data API...")
    print(f"Endpoint: {API_ENDPOINT}")
    print("-" * 50)

    headers = {
        'X-App-Token': app_token
    }

    # For CSV endpoint, use GET with query params
    row_limit = 500000  # Intended to be large enough to get all records
    params = {
        '$limit': row_limit,
    }

    response = None
    for attempt in range(MAX_RETRIES):
        try:
            response = requests.get(
                API_ENDPOINT,
                headers=headers,
                params=params,
                timeout=REQUEST_TIMEOUT_SECONDS
            )
            response.raise_for_status()
            break
        except requests.exceptions.RequestException as e:
            failed_response = getattr(e, 'response', None)
            status = getattr(failed_response, 'status_code', None)
            # No status means a connection/timeout problem, which may be
            # transient; 5xx, 408 and 429 are also worth another attempt.
            retryable = status is None or status >= 500 or status in (408, 429)
            if not retryable or attempt >= MAX_RETRIES - 1:
                raise
            wait_time = RETRY_DELAY_SECONDS * (2 ** attempt)
            print(f"  Request failed: {e}. Retrying in {wait_time}s...")
            time.sleep(wait_time)

    if response is None:
        raise ValueError("MAX_RETRIES must be at least 1 to fetch data.")

    # Parse CSV response. low_memory=False matches how the cached copy is
    # read, so both load paths infer column dtypes the same way.
    df = pd.read_csv(StringIO(response.text), low_memory=False)

    print(f"Fetch complete! Total records: {len(df):,}")
    if len(df) >= row_limit:
        # Hitting the limit exactly suggests the server had more rows to send.
        print(f"WARNING: received {len(df):,} rows, which reaches the "
              f"$limit of {row_limit:,}; the result may be truncated.")
    print("-" * 50)

    return df

In [None]:
def load_restaurant_data(force_refresh=False, token=None):
    """
    Load restaurant inspection data with caching.

    If cached data exists locally (and force_refresh is False), load from
    cache. Otherwise, fetch from NYC Open Data API and cache locally.

    Args:
        force_refresh: If True, fetch fresh data from API even if cache exists
        token: Optional API token for local testing (bypasses env var lookup)

    Returns:
        pd.DataFrame: Restaurant inspection data

    Raises:
        ValueError: If data must be fetched and no API token can be found.
    """
    # Ensure data directory exists (no error if it is already there)
    os.makedirs(CACHE_DIR, exist_ok=True)

    # Check for cached data
    cache_exists = os.path.exists(CACHE_PATH)
    if cache_exists and not force_refresh:
        print(f"Loading data from cache: {CACHE_PATH}")
        df = pd.read_csv(CACHE_PATH, low_memory=False)
        print(f"Loaded {len(df):,} records from cache")
        print("Set force_refresh=True to fetch fresh data from API")
        return df

    # Fetch from API
    if cache_exists:
        print("force_refresh=True: ignoring cached data. Fetching from API...")
    else:
        print("No cached data found. Fetching from API...")
    app_token = get_app_token(token)
    df = fetch_all_restaurant_data(app_token)

    # Save to cache. Write to a temporary file first and swap it in, so an
    # interrupted write never leaves a truncated file at CACHE_PATH.
    tmp_path = CACHE_PATH + ".tmp"
    df.to_csv(tmp_path, index=False)
    os.replace(tmp_path, CACHE_PATH)
    print(f"\nData cached to: {CACHE_PATH}")
    print(f"Cache size: {os.path.getsize(CACHE_PATH) / (1024*1024):.1f} MB")

    return df

In [None]:
# Pull the inspection data, served from the local cache when one exists.
# Pass force_refresh=True to re-download the latest data from the API.
df = load_restaurant_data(force_refresh=False)

banner = "=" * 60
print(f"\n{banner}")
print("DATA LOADED SUCCESSFULLY")
print(banner)

# Overview of the loaded frame: size, column names, dtypes and null counts.
load_summary = (
    f"\nDataset shape: {df.shape}",
    f"\nColumn names: {list(df.columns)}",
    f"\nData types:\n{df.dtypes}",
    f"\nMissing values:\n{df.isnull().sum()}",
)
print(*load_summary, sep="\n")

## 3. Explore Dataset

In [None]:
# Preview the first rows of the loaded dataset.
df.head()

## 4. Initial Filtering

Based on the dataset dictionary, we will:
1. **Drop unnecessary columns** not relevant to grade prediction
2. **Remove placeholder inspection dates** (01/01/1900)
3. **Keep only Cycle Inspections** - these are the regular health inspections that result in grades (A/B/C). Other inspection types (Smoke-Free Air Act, Inter-Agency Task Force, etc.) don't produce health grades.

In [None]:
# Work on a copy so the loaded `df` stays untouched.
df_copy = df.copy()

# We need to set all columns to lowercase for consistency, and replace spaces with underscores
df_copy.columns = df_copy.columns.str.lower().str.replace(' ', '_')

# Report the shape before anything is removed, so the label is accurate.
print(f"Original shape: {df_copy.shape}")

# Drop unnecessary columns (keep latitude, longitude for map display)
drop_columns = ['phone', 'action', 'record_date', 'community_board', 'council_district',
                'census_tract', 'bin', 'bbl', 'nta', 'location']
df_copy = df_copy.drop(columns=drop_columns)
print(f"After dropping unused columns: {df_copy.shape}")

# Remove placeholder inspection dates (1 January 1900).
# The dates are parsed before comparing so the check does not depend on how
# the source formats them (e.g. '01/01/1900' vs. an ISO-style timestamp).
# The column itself is left as-is here; it is converted in a later step.
parsed_inspection_dates = pd.to_datetime(df_copy['inspection_date'], errors='coerce')
placeholder_mask = parsed_inspection_dates == pd.Timestamp('1900-01-01')
drop_rows = df_copy.index[placeholder_mask]
df_copy = df_copy[~placeholder_mask]
print(f"After removing placeholder dates: {df_copy.shape} (removed {len(drop_rows):,})")

# Keep only Cycle Inspections (the only ones that produce health grades)
before_count = len(df_copy)
cycle_mask = df_copy['inspection_type'].str.contains('Cycle Inspection', case=False, na=False)
# .copy() gives an independent frame, so later column assignments do not
# operate on a slice of the previous one.
df_copy = df_copy[cycle_mask].copy()
print(f"After filtering to Cycle Inspections only: {df_copy.shape} (removed {before_count - len(df_copy):,})")

df_copy.head()

## 5. Converting Data Types

In [None]:
# Parse the date columns, letting pandas infer the format.
# grade_date values that cannot be parsed become NaT; inspection_date is
# parsed with the default error handling.
for _date_col, _parse_kwargs in (
    ('inspection_date', {}),
    ('grade_date', {'errors': 'coerce'}),
):
    df_copy[_date_col] = pd.to_datetime(df_copy[_date_col], **_parse_kwargs)


def _format_zip(value):
    """Return a zero-padded 5-character ZIP string, or None when missing."""
    if pd.isna(value):
        return None
    return str(int(value)).zfill(5)


# ZIP codes: coerce to numeric first (handles floats like 10001.0 and turns
# unparseable values into NaN), then render as zero-padded strings.
df_copy['zipcode'] = pd.to_numeric(df_copy['zipcode'], errors='coerce')
df_copy['zipcode'] = df_copy['zipcode'].apply(_format_zip)

# CAMIS is an identifier rather than a quantity, so store it as text.
df_copy['camis'] = df_copy['camis'].astype(str)

print("Data types after conversion:")
print(df_copy.dtypes)
zip_sample = df_copy['zipcode'].dropna().head(10).tolist()
print(f"\nZipcode sample: {zip_sample}")

## 6. Check Missing Values

In [None]:
# Report the number of nulls per column, followed by the total row count
null_counts = df_copy.isnull().sum()
print("Missing values by column:", null_counts, sep="\n")
print(f"\nTotal rows: {len(df_copy):,}")

# Note: Some missing grades are expected for initial inspections that haven't been graded yet

## 7. Data Validation and Cleaning

In [None]:
# Strip surrounding whitespace from the text columns that are present
text_cols = ['dba', 'street', 'building', 'cuisine_description', 'violation_description']
for col in text_cols:
    if col not in df_copy.columns:
        continue
    df_copy[col] = df_copy[col].str.strip()

# Keep only rows whose coordinates are present and fall inside the
# bounding box 40 < latitude < 42 and -75 < longitude < -73
# (this also removes invalid 0,0 values).
before_count = len(df_copy)
df_copy = df_copy.dropna(subset=['latitude', 'longitude'])
lat_in_range = df_copy['latitude'].gt(40) & df_copy['latitude'].lt(42)
lon_in_range = df_copy['longitude'].lt(-73) & df_copy['longitude'].gt(-75)
df_copy = df_copy[lat_in_range & lon_in_range]
removed_rows = before_count - len(df_copy)
print(f"After filtering invalid coordinates: {df_copy.shape} (removed {removed_rows:,})")

print("\nData cleaning complete!")
print(f"Current shape: {df_copy.shape}")

## 8. Check for Duplicates

In [None]:
# Count rows that exactly repeat an earlier row
duplicates = df_copy.duplicated().sum()
print(f"Number of duplicate rows: {duplicates:,}")

if duplicates == 0:
    print("No duplicates found.")
else:
    # Drop the repeats; the first occurrence of each row is kept
    before_count = len(df_copy)
    df_copy = df_copy.drop_duplicates()
    removed_duplicates = before_count - len(df_copy)
    print(f"Duplicates removed: {removed_duplicates:,}")
    print(f"Final shape: {df_copy.shape}")

In [None]:
# Summarize the main categorical columns and the inspection date range
section_rule = "=" * 60

print("INSPECTION TYPE DISTRIBUTION:")
print(df_copy['inspection_type'].value_counts())

print(f"\n{section_rule}")
print("GRADE DISTRIBUTION:")
grade_counts = df_copy['grade'].value_counts().sort_index()
print(grade_counts)
# Count the missing grades once and reuse the figure for the percentage
missing_grades = df_copy['grade'].isna().sum()
missing_grades_pct = missing_grades / len(df_copy) * 100
print(f"\nGrade missing: {missing_grades:,} ({missing_grades_pct:.1f}%)")

print(f"\n{section_rule}")
print("DATE RANGE:")
inspection_dates = df_copy['inspection_date']
print(f"Earliest inspection: {inspection_dates.min()}")
print(f"Latest inspection: {inspection_dates.max()}")

print(f"\n{section_rule}")
print("TOP 10 CUISINES:")
print(df_copy['cuisine_description'].value_counts().head(10))

print(f"\n{section_rule}")
print("BOROUGH DISTRIBUTION:")
print(df_copy['boro'].value_counts())

## 10. Export Cleaned Data

In [None]:
# Final summary
print("=" * 60)
print("FINAL CLEANED DATASET")
print("=" * 60)
print(f"Shape: {df_copy.shape}")
print(f"Columns: {list(df_copy.columns)}")
print("\nMissing values:")
# Only list the columns that still contain nulls
missing = df_copy.isnull().sum()
print(missing[missing > 0])

print("\nSample:")
print(df_copy.head(3))

# Export to CSV
output_path = '../data/cleaned_restaurant_inspections.csv'
# Create the target directory if needed so the export does not fail when
# this cell runs without the data directory already in place.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
df_copy.to_csv(output_path, index=False)
print(f"\n✓ Exported to: {output_path}")

df_copy.head()