In [46]:
import pandas as pd
from slugify import slugify
import os
import glob
import re


In [47]:
# Load the raw inspection export, then preview its column names and first two rows.
inspection_csv = "../data/raw/Health_Inspection_Scores_(2024-Present)_20251030.csv"
df_raw = pd.read_csv(inspection_csv)
print(list(df_raw.columns))
display(df_raw.head(2))



['inspection_date', 'inspector', 'district', 'subdistrict', 'subsector', 'permit_number', 'dba', 'permit_type', 'street_address', 'street_address_clean', 'inspection_type', 'inspection_frequency_type', 'total_time', 'facility_rating_status', 'census', 'suspension_notes', 'inspection_notes', 'violation_count', 'violation_codes', 'latitude', 'longitude', 'point', 'analysis_neighborhood', 'supervisor_district', 'data_as_of', 'data_loaded_at']


Unnamed: 0,inspection_date,inspector,district,subdistrict,subsector,permit_number,dba,permit_type,street_address,street_address_clean,...,inspection_notes,violation_count,violation_codes,latitude,longitude,point,analysis_neighborhood,supervisor_district,data_as_of,data_loaded_at
0,2025/04/23 12:00:00 AM,Michael Mooney,1,103,607,6734928,Surfside - Walk Thru,H36 - STADIUM CONCESSIONS (PERM),24 WILLIE MAYS PLZ # PROMEN,3RD ST & KING ST,...,,,,37.77813,-122.391855,POINT (-122.391855 37.77813),Mission Bay,6.0,2025/07/01 10:09:15 AM,2025/10/30 02:37:07 AM
1,2025/04/23 12:00:00 AM,Michael Mooney,2,201,106,6735187,HARBOR EMPEROR,H33 - COMMISSARIES,41 EMBARCADERO,41 EMBARCADERO,...,,4.0,"113953(c), 114163(a)(3), 114189, 114192.1, 114...",37.787126,-122.387925,POINT (-122.387924588 37.787126305),Financial District/South Beach,6.0,2025/07/01 10:09:15 AM,2025/10/30 02:37:07 AM


In [48]:

# Input and output folders, relative to the notebook's working directory.
raw_dir = "../data/raw"
clean_dir = "../data/clean"

# Find every raw CSV, and make sure the output folder exists before anything
# is written into it.
csv_pattern = os.path.join(raw_dir, "*.csv")
csv_paths = glob.glob(csv_pattern)
os.makedirs(clean_dir, exist_ok=True)
print(csv_paths)

['../data/raw\\Health_Inspection_Scores_(2024-Present)_20251030.csv', '../data/raw\\sf_restaurants_google.csv']


In [49]:
def strip_string_cells(df: pd.DataFrame) -> pd.DataFrame:
    """
    Return a copy of ``df`` with leading/trailing whitespace removed from
    every string value in its object/string columns.

    Only values that are actual ``str`` instances are touched. Missing values
    (``NaN``/``None``/``pd.NA``) and any non-string objects are passed through
    unchanged, so they are still recognised as missing by ``pd.isna`` and are
    not written out as the literal text "nan".

    Parameters
    ----------
    df : pd.DataFrame
        Input frame; it is not modified.

    Returns
    -------
    pd.DataFrame
        A new frame with stripped string cells.
    """
    df = df.copy()
    for col in df.select_dtypes(include=["object", "string"]).columns:
        # Casting with .astype(str) would turn NaN into the string "nan";
        # mapping element-wise keeps non-strings exactly as they were.
        df[col] = df[col].map(lambda v: v.strip() if isinstance(v, str) else v)
    return df

In [50]:
def clean_column_names(df: pd.DataFrame) -> pd.DataFrame:
    """
    Return a copy of ``df`` whose column names are normalised:

    - stripped (no leading/trailing whitespace)
    - lowercase
    - every run of whitespace replaced with a single underscore

    Non-string labels (for example the integer labels pandas assigns to a
    headerless file) are converted with ``str()`` first, so the result always
    has string column names and the function does not fail on them.

    Parameters
    ----------
    df : pd.DataFrame
        Input frame; it is not modified.

    Returns
    -------
    pd.DataFrame
        A new frame with the renamed columns.
    """
    df = df.copy()
    df.columns = [
        re.sub(r"\s+", "_", str(col).strip().lower())
        for col in df.columns
    ]
    return df

In [51]:
def canonicalize_address(addr: str) -> str:
    """
    Normalise a street address into a canonical form for matching.

    Steps:
    - missing values (``pd.isna``) become the empty string
    - lowercase, and treat ``#`` as whitespace
    - split on any whitespace (spaces, tabs, repeated spaces)
    - drop the unit markers ``unit`` / ``ste`` / ``suite`` (the unit value
      that follows them is kept)
    - expand common street-suffix abbreviations (``st`` -> ``street``,
      ``ave`` -> ``avenue``, ...), with or without a trailing ``.`` or ``,``
    - pass the result through ``slugify`` with a space separator, which takes
      care of the remaining punctuation

    The first word is never expanded or dropped, so a leading "St" or "Dr"
    is left alone. A later "st" is always read as "street".

    Parameters
    ----------
    addr : str
        Raw address; may be ``None``/``NaN``.

    Returns
    -------
    str
        The canonical address, or ``""`` for a missing input.
    """
    if pd.isna(addr):
        return ""

    abbreviations = {
        "st": "street",
        "rd": "road",
        "ave": "avenue",
        "blvd": "boulevard",
        "dr": "drive",
        "plz": "plaza",
        "pl": "place",
        "hwy": "highway",
    }
    unit_markers = {"unit", "ste", "suite"}

    # Tokenising up front makes matching independent of the exact whitespace
    # and of whether the token sits in the middle or at the end of the string.
    words = str(addr).lower().replace("#", " ").split()

    normalized = []
    for position, word in enumerate(words):
        # Compare without trailing '.'/',' so "rd," and "blvd." are recognised.
        bare = word.rstrip(".,")
        if position > 0:
            if bare in unit_markers:
                continue
            word = abbreviations.get(bare, word)
        normalized.append(word)

    return slugify(" ".join(normalized), lowercase=True, separator=" ")



# df_raw["canonical_name"] = df_raw["dba"].apply(canonicalize_address)
# df_raw["canonical_address"] = df_raw["street_address"].apply(canonicalize_address)

# # show a preview
# df_raw[["dba", "canonical_name", "street_address_clean", "canonical_address"]].head(8)


In [52]:
def clean_inspection_df(df: pd.DataFrame) -> pd.DataFrame:
    """
    Run the cleaning pipeline on one raw frame and return the cleaned copy.

    The frame first goes through ``clean_column_names`` and then
    ``strip_string_cells``. After that, whichever of these columns exist are
    converted (unparseable values are coerced to NaT/NaN):

    - ``inspection_date``, ``data_as_of``, ``data_loaded_at`` -> datetime
    - ``latitude``, ``longitude`` -> numeric

    Finally a ``canonical_address`` column is built with
    ``canonicalize_address`` from ``street_address_clean`` if present,
    otherwise from ``address``. If neither exists no column is added.
    """
    cleaned = strip_string_cells(clean_column_names(df))

    # Timestamps
    for date_col in ("inspection_date", "data_as_of", "data_loaded_at"):
        if date_col not in cleaned.columns:
            continue
        cleaned[date_col] = pd.to_datetime(cleaned[date_col], errors="coerce")

    # Coordinates
    for coord_col in ("latitude", "longitude"):
        if coord_col not in cleaned.columns:
            continue
        cleaned[coord_col] = pd.to_numeric(cleaned[coord_col], errors="coerce")

    # First available address column wins.
    address_col = next(
        (name for name in ("street_address_clean", "address") if name in cleaned.columns),
        None,
    )
    if address_col is not None:
        cleaned["canonical_address"] = cleaned[address_col].apply(canonicalize_address)

    return cleaned

In [53]:
# Output file name for each known raw file; any other CSV keeps its own name.
name_map = {
    "Health_Inspection_Scores_(2024-Present)_20251030.csv": "sfData_cleaned.csv",
    "sf_restaurants_google.csv": "googleData_cleaned.csv",
}

# Clean every CSV in raw_dir and write the result into clean_dir.
for path in glob.glob(os.path.join(raw_dir, "*.csv")):
    filename = os.path.basename(path)
    print("Cleaning:", filename)

    # Decode as UTF-8 first: forcing latin1 never fails, but it silently
    # garbles any non-ASCII text in a UTF-8 file. latin1 is kept only as a
    # fallback for files that are not valid UTF-8.
    try:
        df = pd.read_csv(path, encoding="utf-8")
    except UnicodeDecodeError:
        df = pd.read_csv(path, encoding="latin1")
    df_clean = clean_inspection_df(df)

    out_name = name_map.get(filename, f"{os.path.splitext(filename)[0]}.csv")
    out_path = os.path.join(clean_dir, out_name)

    df_clean.to_csv(out_path, index=False)
    print("  -> saved to", out_path)

Cleaning: Health_Inspection_Scores_(2024-Present)_20251030.csv


  df[col] = pd.to_datetime(df[col], errors="coerce")


  -> saved to ../data/clean\sfData_cleaned.csv
Cleaning: sf_restaurants_google.csv
  -> saved to ../data/clean\googleData_cleaned.csv
