In [1]:
import pandas as pd
from slugify import slugify
import os
import glob
import re
import numpy as np
from rapidfuzz import fuzz
from geopy.distance import geodesic
from geopy.geocoders import Nominatim
from time import sleep
import googlemaps
from googlemaps.exceptions import Timeout, TransportError, ApiError

In [None]:
# raw_dir = "../data/raw"
# clean_dir = "../data/clean"
# os.makedirs(clean_dir, exist_ok=True)

gmaps = googlemaps.Client(key="",timeout=5)

 ## Merging two dataset from 2020-2023 and 2024-present

In [3]:
df_2020 = pd.read_csv('../../data/raw/Health_Inspection(2020-2023).csv')
df_2024 = pd.read_csv('../../data/raw/Health_Inspection(2024-Present).csv')
print(df_2020.columns.tolist())
print(df_2024.columns.tolist())

['name', 'address', 'city', 'state', 'postal_code', 'latitude', 'longitude', 'inspection_id', 'date', 'facility_status', 'inspection_type', 'violation_observed', 'description', 'the_geom']
['inspection_date', 'inspector', 'district', 'subdistrict', 'subsector', 'permit_number', 'dba', 'permit_type', 'street_address', 'street_address_clean', 'inspection_type', 'inspection_frequency_type', 'total_time', 'facility_rating_status', 'census', 'suspension_notes', 'inspection_notes', 'violation_count', 'violation_codes', 'latitude', 'longitude', 'point', 'analysis_neighborhood', 'supervisor_district', 'data_as_of', 'data_loaded_at']


Retaining the features

In [4]:
df_2020 = df_2020.rename(columns={
    'name': 'dba',
    'address': 'street_address',
    'date': 'inspection_date',
    'inspection_type': 'inspection_type',
    'facility_status': 'facility_rating_status',
    'description': 'violation_codes',
    'latitude': 'latitude',
    'longitude': 'longitude'
})

df_2024 = df_2024.rename(columns={
    'dba': 'dba',
    'street_address': 'street_address',
    'inspection_date': 'inspection_date',
    'inspection_type': 'inspection_type',
    'facility_rating_status': 'facility_rating_status',
    'violation_count': 'violation_count',
    'violation_codes': 'violation_codes',
    'latitude': 'latitude',
    'longitude': 'longitude'
})

columns_to_keep = [
    'inspection_date','dba','street_address','inspection_type',
    'facility_rating_status','violation_count','violation_codes',
    'latitude','longitude'
]


df_2020['violation_count'] = np.nan 
df_2020 = df_2020[columns_to_keep]
df_2024 = df_2024[columns_to_keep]
df_all = pd.concat([df_2020, df_2024], ignore_index=True)
df_all=df_all.dropna(subset=['dba'])


df_all = df_all.sort_values(by=['dba','inspection_date']).reset_index(drop=True)


df_all.to_csv("../../data/raw/merged_HealthInspections.csv", index=False)
print("Combined CSV saved")

Combined CSV saved


## Data Cleaning Functions

In [5]:
def strip_string_cells(df: pd.DataFrame) -> pd.DataFrame:
    """Strip whitespace from all string columns"""
    df = df.copy()
    for col in df.select_dtypes(include=["object", "string"]).columns:
        df[col] = df[col].astype(str).str.strip()
    return df

In [6]:
def clean_column_names(df: pd.DataFrame) -> pd.DataFrame:
    """Standardize column names: lowercase, underscore-separated"""
    df = df.copy()
    df.columns = [
        re.sub(r"\s+", "_", col.strip().lower())
        for col in df.columns
    ]
    return df

In [7]:
def canonicalize_address(addr: str) -> str:
    """Normalize addresses for matching"""
    if pd.isna(addr):
        return ""
    s = str(addr).strip().lower()

    # Remove unit/suite markers
    s = s.replace("#", " ")      
    s = s.replace(" unit ", " ")
    s = s.replace(" ste ", " ")
    s = s.replace(" suite ", " ")

    # Standardize street suffixes
    suffix_map = {
        " st": " street",
        " st,": " street",
        " st.": " street",
        " rd": " road",
        " rd.": " road",
        " ave": " avenue",
        " ave.": " avenue",
        " ave,": " avenue",
        " blvd": " boulevard",
        " dr": " drive",
        " dr.": " drive",
        " plz": " plaza",
        " pl": " place",
        " hwy": " highway",
    }

    for old, new in suffix_map.items():
        if s.endswith(old):
            s = s[: -len(old)] + new
        s = s.replace(old + " ", new + " ")

    # Normalize whitespace and slugify
    s = " ".join(s.split())
    s = slugify(s, lowercase=True, separator=' ')

    return s

In [9]:
def canonicalize_name(name: str) -> str:
    """Normalize restaurant names for matching"""
    if pd.isna(name):
        return ""
    s = str(name).strip().lower()
    
    # Remove common suffixes/prefixes
    patterns_to_remove = [
        r"\s*-\s*walk\s*thru\s*$",
        r"\s*restaurant\s*$",
        r"\s*cafe\s*$",
        r"\s*inc\s*$",
        r"\s*llc\s*$",
        r"\s*\(.*\)\s*",  # Remove parenthetical content
    ]
    
    for pattern in patterns_to_remove:
        s = re.sub(pattern, "", s, flags=re.IGNORECASE)
    
    # Normalize and slugify
    s = " ".join(s.split())
    s = slugify(s, lowercase=True, separator=' ')
    
    return s

In [10]:
def impute_missing_values(df: pd.DataFrame) -> pd.DataFrame:
    """Impute missing values in key columns"""
    df = df.copy()
    
    df['facility_rating_status'] = df['facility_rating_status'].fillna('Unknown')
    df['inspection_type'] = df['inspection_type'].fillna('Unknown')
    df['violation_count'] = np.where(df['violation_count'].isnull(), 0, df['violation_count'])
    df['violation_codes'] = np.where(df['violation_codes'].isnull(), '', df['violation_codes'])

    return df

In [11]:
def clean_inspection_df(df: pd.DataFrame) -> pd.DataFrame:
    """Clean and standardize inspection dataframe"""
    df = clean_column_names(df)
    df = strip_string_cells(df)
    df = impute_missing_values(df)

    # Parse dates
    df['inspection_date'] = pd.to_datetime(df['inspection_date'], errors='coerce')
    df['inspection_date'] = df['inspection_date'].dt.strftime('%Y-%m-%d')

    # Convert coordinates to numeric
    for col in ["latitude", "longitude", "lat", "lng"]:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors="coerce")

    col = (
    df["facility_rating_status"]
      .astype(str)
      .str.strip()
      .str.replace(r"\s+", " ", regex=True)
      .str.upper()
)

    # Anything that looks like "*CONDIT*PASS*" becomes "Conditional Pass"
    mask_conditional = col.str.contains(r"COND.*PASS", flags=re.IGNORECASE, na=False)

    df.loc[mask_conditional, "facility_rating_status"] = "Conditional Pass"
    
    df.loc[col.str.upper().eq("PASS"), "facility_rating_status"] = "Pass"
    df.loc[col.str.upper().eq("CLOSURE"), "facility_rating_status"] = "Closure"
    df.loc[col.isin(["NAN", "NONE", ""]), "facility_rating_status"] = np.nan

    df = df.dropna(subset=["facility_rating_status"])

    # Create cleaned address
    source_col = None
    if "street_address" in df.columns:
        source_col = "street_address"
    elif "address" in df.columns:
        source_col = "address"

    if source_col is not None:
        df["Address"] = df[source_col].apply(canonicalize_address)
    
    # Create clean name
    if "dba" in df.columns:
        df["BusinessName"] = df["dba"].apply(canonicalize_name)
    elif "name" in df.columns:
        df["BusinessName"] = df["name"].apply(canonicalize_name)
    df= df.drop(columns=['dba','street_address'])
    new_order = [
    "BusinessName",
    "Address",
    "inspection_date",
    "inspection_type",
    "facility_rating_status",
    "violation_count",
    "violation_codes",
    "latitude",
    "longitude",
    ]

    df = df[new_order]      
    df = df.sort_values(["BusinessName", "Address", "inspection_date"])



    df=df.drop_duplicates(['BusinessName','Address','inspection_date','facility_rating_status'], keep='first').reset_index(drop=True)
    return df

## Load and Clean Individual Datasets

In [12]:
# Load SF Health Inspection Data
df_sf = pd.read_csv('../../data/raw/merged_HealthInspections.csv')
df_sf = clean_inspection_df(df_sf)


print("SF Inspection Data Shape:", df_sf.shape)
print("\nColumns:", df_sf.columns.tolist())
display(df_sf.head(3))

# df_sf.to_csv('../../data/clean/HealthInspections.csv', index=False)


  df['inspection_date'] = pd.to_datetime(df['inspection_date'], errors='coerce')


SF Inspection Data Shape: (32113, 9)

Columns: ['BusinessName', 'Address', 'inspection_date', 'inspection_type', 'facility_rating_status', 'violation_count', 'violation_codes', 'latitude', 'longitude']


Unnamed: 0,BusinessName,Address,inspection_date,inspection_type,facility_rating_status,violation_count,violation_codes,latitude,longitude
0,1 hotel san francisco terrene,8 mission street,2020-03-10,reinspection/followup,Pass,0.0,,,
1,1 hotel san francisco terrene,8 mission street,2022-06-07,routine,Conditional Pass,0.0,"Compliance with variance, specialized processe...",,
2,1 hotel san francisco terrene,8 mission street,2022-06-14,reinspection/followup,Pass,0.0,,,


## Feature Engineering

In [13]:

def calc_trend(x):
    """Slope of last 3 violation counts."""
    x = x.shift().dropna().tail(3)
    if len(x) < 2:
        return np.nan
    return (x.iloc[-1] - x.iloc[0]) / (len(x) - 1)

geo_cache = {}

def geocode_google(address):
    try:
        result = gmaps.geocode(f"{address}, San Francisco, CA")
        if result:
            loc = result[0]["geometry"]["location"]
            return loc["lat"], loc["lng"]
    except Timeout:
        print(f"Timeout while geocoding: {address}")
    except ApiError as e:
        print(f"API error for {address}: {e}")
    except TransportError as e:
        print(f"Transport error for {address}: {e}")
    except Exception as e:
        print(f"Unexpected error for {address}: {e}")
    return None, None

def geocode_google_cached(address):
    if address in geo_cache:
        return geo_cache[address]

    lat, lng = geocode_google(address)
    if lat is not None and lng is not None:
        geo_cache[address] = (lat, lng)
    return lat, lng


def feature_engineer(df):
   
    df = df.copy()

    # Ensure correct types
    df["inspection_date"] = pd.to_datetime(df["inspection_date"], errors="coerce")
    df["violation_count"] = pd.to_numeric(df["violation_count"], errors="coerce").fillna(0)

    # Sort properly
    df = df.sort_values(["BusinessName", "Address", "inspection_date"])

    # Fail flag
    df["failFlag"] = df["facility_rating_status"].apply(
        lambda x: 1 if x in ["Closure", "Conditional Pass"] else 0
    )

    # Avg violations
    df["avg_violations_last_3"] = (
        df.groupby(["BusinessName", "Address"])["violation_count"]
          .transform(lambda x: x.shift().rolling(3, min_periods=1).mean())
    )

    # Fail rate
    df["fail_rate_last_3"] = (
        df.groupby(["BusinessName", "Address"])["failFlag"]
          .transform(lambda x: x.shift().rolling(3, min_periods=1).mean())
    )

    # Previous inspection date
    df["prev_inspection_date"] = (
        df.groupby(["BusinessName", "Address"])["inspection_date"].shift()
    )

    # Days since last inspection
    df["days_since_last_inspection"] = (
        df["inspection_date"] - df["prev_inspection_date"]
    ).dt.days

    # Trend
    df["trend_last_3"] = (
        df.groupby(["BusinessName", "Address"])["violation_count"]
          .transform(calc_trend)
    )

    df["avg_violations_last_3"] = df["avg_violations_last_3"].fillna(0)
    df["fail_rate_last_3"] = df["fail_rate_last_3"].fillna(0)
    df["trend_last_3"] = df["trend_last_3"].fillna(0)

    # Use -1 to explicitly mark "first inspection"
    df["days_since_last_inspection"] = df["days_since_last_inspection"].fillna(-1)
    

    return df


## Donot run this - api 

In [None]:

# df_sf = feature_engineer(df_sf)

missing = df_sf[df_sf["latitude"].isna() | df_sf["longitude"].isna()]
# # print(missing)
unique_addresses = missing["Address"].dropna().unique()
# print(unique_addresses)
batch_size = 200           
pause_between_batches = 5  
sleep_between_calls = 0.1  #
count=0
results = {}

for start in range(0, len(unique_addresses), batch_size):
    batch = unique_addresses[start:start + batch_size]
    print(f"Batch {start}–{start+len(batch)-1} / {len(unique_addresses)}")

    for i, addr in enumerate(batch, start=1):
        if addr in results:
            continue
        lat, lng = geocode_google_cached(addr) 
        results[addr] = (lat, lng)

        if i % 20 == 0:
            print(f"  {i}/{len(batch)} in this batch")

        sleep(sleep_between_calls)

    # persist batch progress to disk so i can resume later
    pd.DataFrame(
        [(a, *results[a]) for a in results],
        columns=["Address", "lat", "lng"]
    ).to_csv("geocode_cache.csv", index=False)

    print("  Pausing between batches...")
    sleep(pause_between_batches)

# print(f"Geocoded {count} addresses.")
print("Geocoding complete.")
cache_df = pd.read_csv("geocode_cache.csv")
cache_map_lat = dict(zip(cache_df["Address"], cache_df["lat"]))
cache_map_lng = dict(zip(cache_df["Address"], cache_df["lng"]))

df_sf.loc[missing.index, "latitude"] = missing["Address"].map(cache_map_lat)
df_sf.loc[missing.index, "longitude"] = missing["Address"].map(cache_map_lng)
# df_sf.to_csv('../../data/clean/HealthInspections.csv', index=False)
df_sf.head(5)


Batch 0–199 / 3139
  20/200 in this batch
  40/200 in this batch
  60/200 in this batch
  80/200 in this batch
  100/200 in this batch
  120/200 in this batch
  140/200 in this batch
  160/200 in this batch
  180/200 in this batch
  200/200 in this batch
  Pausing between batches...
Batch 200–399 / 3139
  20/200 in this batch
  40/200 in this batch
  60/200 in this batch
  80/200 in this batch
  100/200 in this batch
  120/200 in this batch
  140/200 in this batch
  160/200 in this batch
  180/200 in this batch
  200/200 in this batch
  Pausing between batches...
Batch 400–599 / 3139
  20/200 in this batch
  40/200 in this batch
  60/200 in this batch
  80/200 in this batch
  100/200 in this batch
  120/200 in this batch
  140/200 in this batch
  160/200 in this batch
  180/200 in this batch
  200/200 in this batch
  Pausing between batches...
Batch 600–799 / 3139
  20/200 in this batch
  40/200 in this batch
  60/200 in this batch
  80/200 in this batch
  100/200 in this batch
  120/2

Unnamed: 0,BusinessName,Address,inspection_date,inspection_type,facility_rating_status,violation_count,violation_codes,latitude,longitude,failFlag,avg_violations_last_3,fail_rate_last_3,prev_inspection_date,days_since_last_inspection,trend_last_3
0,1 hotel san francisco terrene,8 mission street,2020-03-10,reinspection/followup,Pass,0.0,,37.793562,-122.393087,0,0.0,0.0,NaT,-1.0,0.0
1,1 hotel san francisco terrene,8 mission street,2022-06-07,routine,Conditional Pass,0.0,"Compliance with variance, specialized processe...",37.793562,-122.393087,1,0.0,0.0,2020-03-10,819.0,0.0
2,1 hotel san francisco terrene,8 mission street,2022-06-14,reinspection/followup,Pass,0.0,,37.793562,-122.393087,0,0.0,0.5,2022-06-07,7.0,0.0
3,1 hotel san francisco terrene,8 mission street,2023-01-12,routine,Pass,0.0,Adequate hand washing facilities; supplied and...,37.793562,-122.393087,0,0.0,0.333333,2022-06-14,212.0,0.0
4,1 skyline bar in seat,24 willie mays plaza 113,2025-04-03,New Ownership (I),Pass,0.0,,37.77813,-122.391855,0,0.0,0.0,NaT,-1.0,0.0


In [17]:
df_sf=df_sf.dropna(subset=['latitude','longitude'])
df_sf=df_sf.drop(columns=["prev_inspection_date"])
df_sf.to_csv('../../data/clean/HealthInspections.csv', index=False)

In [19]:
df_sf["facility_rating_status"].value_counts(dropna=False)
# print(df_sf.columns)

facility_rating_status
Pass                29822
Conditional Pass     1424
Closure               851
Name: count, dtype: int64

 ## For Merging Google Data

In [9]:
# Load Google Restaurant Data
google_path = os.path.join(raw_dir, "sf_restaurants_google.csv")
df_google = pd.read_csv(google_path, encoding="latin1")
df_google = clean_inspection_df(df_google)

print("Google Restaurant Data Shape:", df_google.shape)
print("\nColumns:", df_google.columns.tolist())
display(df_google.head(3))

Google Restaurant Data Shape: (1970, 13)

Columns: ['name', 'address', 'rating', 'user_ratings_total', 'types', 'open_now', 'lat', 'lng', 'place_id', 'business_status', 'reviews', 'canonical_address', 'canonical_name']


Unnamed: 0,name,address,rating,user_ratings_total,types,open_now,lat,lng,place_id,business_status,reviews,canonical_address,canonical_name
0,Piccolo Forno,"725 Columbus Ave, San Francisco, CA 94133, Uni...",4.7,2013.0,"restaurant, food, point_of_interest, establish...",False,37.80138,-122.411986,ChIJLeYyO_CAhYAR6j9XsEIm3GI,OPERATIONAL,"[{'author': 'William Irwin', 'rating': 5, 'tex...",725 columbus avenue san francisco ca 94133 uni...,piccolo forno
1,Bottega,"1132 Valencia St, San Francisco, CA 94110, Uni...",4.4,990.0,"bar, store, restaurant, food, point_of_interes...",False,37.754715,-122.421194,ChIJxZMBuHV_j4ARK1fO7KSqyOE,OPERATIONAL,"[{'author': 'Moorissa Tjokro', 'rating': 4, 't...",1132 valencia street san francisco ca 94110 un...,bottega
2,Cotogna,"490 Pacific Ave, San Francisco, CA 94133, Unit...",4.5,2080.0,"restaurant, food, point_of_interest, establish...",False,37.797399,-122.403576,ChIJA0YvGPWAhYAReXmaDTTdWzU,OPERATIONAL,"[{'author': 'Pavlo Debelynskyi', 'rating': 5, ...",490 pacific avenue san francisco ca 94133 unit...,cotogna


In [10]:
# Save cleaned individual datasets
df_sf.to_csv(os.path.join(clean_dir, "sfData_cleaned.csv"), index=False)
df_google.to_csv(os.path.join(clean_dir, "googleData_cleaned.csv"), index=False)

print("Cleaned datasets saved successfully.")


Cleaned datasets saved successfully.


## Matching Functions

In [11]:
def calculate_geo_distance(lat1, lon1, lat2, lon2):
    """Calculate distance in meters between two coordinates"""
    if pd.isna(lat1) or pd.isna(lon1) or pd.isna(lat2) or pd.isna(lon2):
        return np.nan
    try:
        return geodesic((lat1, lon1), (lat2, lon2)).meters
    except:
        return np.nan

In [12]:
def find_best_match(sf_row, df_google, use_geo=True, geo_threshold_m=100, 
                    name_threshold=80, address_threshold=85):
    """
    Find best matching Google restaurant for an SF inspection record.
    
    Matching strategy:
    1. Geographic filter: Only consider restaurants within geo_threshold_m meters
    2. Score combinations of name and address similarity
    3. Return best match above thresholds
    """
    candidates = df_google.copy()
    
    # Filter by geographic proximity if coordinates available
    if use_geo and not pd.isna(sf_row['latitude']) and not pd.isna(sf_row['longitude']):
        candidates['geo_distance'] = candidates.apply(
            lambda x: calculate_geo_distance(
                sf_row['latitude'], sf_row['longitude'],
                x['lat'], x['lng']
            ), axis=1
        )
        candidates = candidates[candidates['geo_distance'] <= geo_threshold_m]
        
        if len(candidates) == 0:
            return None, None, None
    
    # Calculate name similarity
    candidates['name_score'] = candidates['canonical_name'].apply(
        lambda x: fuzz.ratio(sf_row['canonical_name'], x) if pd.notna(x) else 0
    )
    
    # Calculate address similarity
    candidates['address_score'] = candidates['canonical_address'].apply(
        lambda x: fuzz.ratio(sf_row['canonical_address'], x) if pd.notna(x) else 0
    )
    
    # Combined score: weighted average
    candidates['combined_score'] = (
        0.6 * candidates['name_score'] + 
        0.4 * candidates['address_score']
    )
    
    # Find best match
    best_idx = candidates['combined_score'].idxmax()
    best_match = candidates.loc[best_idx]
    
    # Apply thresholds
    if (best_match['name_score'] >= name_threshold or 
        best_match['address_score'] >= address_threshold):
        return best_match['place_id'], best_match['combined_score'], best_idx
    
    return None, None, None

## Merge Datasets

In [13]:
# Prepare SF data for matching
df_sf_subset = df_sf[[
    'permit_number', 'dba', 'canonical_name', 'street_address', 
    'canonical_address', 'latitude', 'longitude', 'inspection_date',
    'facility_rating_status', 'violation_count'
]].copy()

# Initialize matching columns
df_sf_subset['matched_place_id'] = None
df_sf_subset['match_score'] = None

print(f"Starting matching process for {len(df_sf_subset)} SF records...")

Starting matching process for 19056 SF records...


In [14]:
# Perform matching with progress tracking
from tqdm import tqdm

matched_count = 0

for idx, row in tqdm(df_sf_subset.iterrows(), total=len(df_sf_subset)):
    place_id, score, match_idx = find_best_match(
        row, df_google, 
        use_geo=True, 
        geo_threshold_m=150,
        name_threshold=75,
        address_threshold=80
    )
    
    if place_id is not None:
        df_sf_subset.at[idx, 'matched_place_id'] = place_id
        df_sf_subset.at[idx, 'match_score'] = score
        matched_count += 1

print(f"\nMatched {matched_count} out of {len(df_sf_subset)} records")
print(f"Match rate: {matched_count / len(df_sf_subset) * 100:.2f}%")

100%|██████████| 19056/19056 [56:17<00:00,  5.64it/s]


Matched 3024 out of 19056 records
Match rate: 15.87%





In [15]:
# Merge with Google data
df_merged = df_sf_subset.merge(
    df_google[['place_id', 'name', 'address', 'rating', 'user_ratings_total', 
               'types', 'lat', 'lng', 'business_status']],
    left_on='matched_place_id',
    right_on='place_id',
    how='left',
    suffixes=('_sf', '_google')
)

print("Merged Dataset Shape:", df_merged.shape)
display(df_merged.head(100))

Merged Dataset Shape: (19056, 21)


Unnamed: 0,permit_number,dba,canonical_name,street_address,canonical_address,latitude,longitude,inspection_date,facility_rating_status,violation_count,...,match_score,place_id,name,address,rating,user_ratings_total,types,lat,lng,business_status
0,06734928,Surfside - Walk Thru,surfside,24 WILLIE MAYS PLZ # PROMEN,3rd street king street,37.778130,-122.391855,2025-04-23,Pass,,...,,,,,,,,,,
1,06735187,HARBOR EMPEROR,harbor emperor,41 EMBARCADERO,41 embarcadero,37.787126,-122.387925,2025-04-23,Conditional Pass,4.0,...,,,,,,,,,,
2,06743776,RA @ BLOOMBERG FLOOR 22,ra bloomberg floor 22,140 NEW MONTGOMERY ST FL 22,140 new montgomery street,37.786617,-122.400018,2025-04-23,Pass,,...,,,,,,,,,,
3,102419,MOKUKU,mokuku,332 CLEMENT ST,332 clement street,37.783269,-122.462932,2025-04-23,Pass,,...,,,,,,,,,,
4,06734548,STEEP CREAMERY & TEA - SECTION 110,steep creamery tea section 110,24 WILLIE MAYS PLZ,3rd street king street,37.778130,-122.391855,2025-04-23,Pass,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,A99312,LOST AND FOUND SF,lost and found sf,1439 TARAVAL ST,1439 taraval street,37.742544,-122.481752,2025-04-09,Closure,3.0,...,,,,,,,,,,
96,18306,FERRY PLAZA FARMERS MARKET,ferry plaza farmers market,1 FERRY BUILDING 50,the embarcadero market street,37.794935,-122.394265,2025-04-08,Pass,,...,,,,,,,,,,
97,88498,MCDONALD'S,mcdonald s,441 SUTTER ST,441 sutter street,37.789065,-122.407650,2025-04-08,Pass,2.0,...,79.15493,ChIJRagBt46AhYARJim-yIO5cyo,McDonald's,"441 Sutter St, San Francisco, CA 94108, United...",3.8,2527.0,"cafe, store, restaurant, food, point_of_intere...",37.789217,-122.407641,OPERATIONAL
98,51055,LUCKY 756,lucky 756,1750 FULTON ST,1750 fulton street,37.776168,-122.445792,2025-04-08,Pass,,...,,,,,,,,,,


In [16]:
# Analyze match quality
print("Match Quality Analysis:")
print(f"Total records: {len(df_merged)}")
print(f"Matched records: {df_merged['matched_place_id'].notna().sum()}")
print(f"Unmatched records: {df_merged['matched_place_id'].isna().sum()}")
print(f"\nMatch Score Statistics:")
print(df_merged['match_score'].describe())

Match Quality Analysis:
Total records: 19056
Matched records: 3024
Unmatched records: 16032

Match Score Statistics:
count     3024.000000
unique     326.000000
top         78.550725
freq       453.000000
Name: match_score, dtype: float64


In [17]:
# Save merged dataset
output_path = os.path.join(clean_dir, "merged_sf_google_data.csv")
df_merged.to_csv(output_path, index=False)
print(f"Merged dataset saved to: {output_path}")

Merged dataset saved to: ../data/clean\merged_sf_google_data.csv


## Explore Merged Data

In [28]:
# Sample matched records

matched_sample = df_merged[df_merged['matched_place_id'].notna()].head(50)
display(matched_sample[['dba', 'name', 'rating', 'facility_rating_status', 
                        'violation_count', 'match_score']])


Unnamed: 0,dba,name,rating,facility_rating_status,violation_count,match_score
10,KHAO TIEW,Khao Tiew,4.6,Pass,5.0,80.512821
19,SUM DIM SUM,Sum Dim Sum,3.8,Pass,2.0,78.550725
22,TIN ON KEARNY,Tin on Kearny,4.9,Pass,7.0,79.15493
31,HAPPY KING PIZZA,Happy king pizza,4.6,Pass,5.0,77.910448
42,HOUSE OF PRIME RIB,House of Prime Rib,4.7,,,80.779221
60,LA VACA BIRRIA,La Vaca Birria,4.4,Pass,6.0,78.550725
65,ZITOUNA,Zitouna,4.8,Pass,,80.266667
68,MATADOR,Matador,4.3,Pass,5.0,79.15493
75,FRANCO'S LATIN TABLE,Franco's Latin Table,4.3,Pass,,78.947368
77,TRISARA RESTAURANT,Trisara,4.7,Pass,,79.15493
