In [None]:
import pandas as pd, io, os, re
from google.colab import files

# ------------------------------------------------------------
# Helper: read the first file of an upload result into a DataFrame
# ------------------------------------------------------------
def _read_first_upload(uploaded, label):
    """Return ``(filename, DataFrame)`` for the first file in *uploaded*.

    Args:
        uploaded: Mapping of filename -> raw file bytes (the object that
            ``files.upload()`` hands back and that is indexed by name here).
        label: Human-readable description of the expected file; only used
            in the error message.

    Returns:
        A ``(name, frame)`` tuple: the first filename in the mapping and
        the table parsed from its bytes.

    Raises:
        RuntimeError: If *uploaded* is empty, instead of the bare
            ``StopIteration`` that ``next(iter(...))`` would raise.
    """
    if not uploaded:
        raise RuntimeError(f"No file was uploaded for the {label}")
    name = next(iter(uploaded))
    buffer = io.BytesIO(uploaded[name])
    # Compare the extension case-insensitively so e.g. "DATA.XLSX" is still
    # parsed as Excel rather than falling through to the CSV reader.
    if name.lower().endswith((".xlsx", ".xls")):
        return name, pd.read_excel(buffer)
    return name, pd.read_csv(buffer)


# ------------------------------------------------------------
# 1) Upload the FULL dataset  (Ceramic analysis FS Louie …)
# ------------------------------------------------------------
print("⬆️  Select the FULL dataset (Ceramic analysis FS Louie …)")
full_bytes = files.upload()
full_name, full_df = _read_first_upload(full_bytes, "full dataset")
print(f"✅ Loaded FULL sheet: {full_df.shape[0]} rows")

# ------------------------------------------------------------
# 2) Upload the 1960 catalog  (1960 restaurants …)
# ------------------------------------------------------------
print("\n⬆️  Now select the 1960 catalog file")
cat_bytes = files.upload()
cat_name, cat_df = _read_first_upload(cat_bytes, "1960 catalog")
print(f"✅ Loaded catalog: {cat_df.shape[0]} rows")

# ------------------------------------------------------------
# 3) Exact column names
# ------------------------------------------------------------
# Header strings exactly as they appear in the uploaded sheets.
FULL_REST_COL = "Restuarant Name"   # spelling kept as in the sheet header (sic)
FULL_FLAG_COL = "Is it in the 1960 Catalog?"
CAT_REST_COL = "name"

# Fail fast, naming the first expected header that is absent.
for col in (FULL_REST_COL, FULL_FLAG_COL):
    if col in full_df.columns:
        continue
    raise KeyError(f'"{col}" missing in full dataset')

catalog_has_name_col = CAT_REST_COL in cat_df.columns
if not catalog_has_name_col:
    raise KeyError(f'"{CAT_REST_COL}" missing in catalog sheet')

# ------------------------------------------------------------
# 4) Build lookup set (lower-cased, trimmed)
# ------------------------------------------------------------
# Normalise catalog names (trimmed, lower-cased) for case-insensitive lookup.
# NaN must be dropped *before* the string cast: astype(str) turns a missing
# value into the literal text "nan", which a later dropna() cannot remove.
catalog_names = (
    cat_df[CAT_REST_COL].dropna().astype(str).str.strip().str.lower()
)
# Blank / whitespace-only cells are not real names; keeping "" in the set
# would make blank names in the full dataset count as catalog matches.
catalog_set = set(catalog_names[catalog_names != ""])
print(f"🔍 Catalog has {len(catalog_set)} unique names")

# ------------------------------------------------------------
# 5) Update flag column in-place
# ------------------------------------------------------------
def flag_match(name, cur):
    """Decide the flag value for one row.

    Args:
        name: Restaurant name from the full dataset (may be missing).
        cur: The row's current flag value.

    Returns:
        ``"yes"`` when the current flag already reads "yes" (ignoring case
        and surrounding whitespace) or when the trimmed, lower-cased name
        is in ``catalog_set``; otherwise *cur* unchanged.
    """
    current_text = str(cur).strip().lower()
    if current_text == "yes":
        return "yes"

    # A missing name can never match, so the lookup is skipped for it.
    if not pd.isna(name) and str(name).strip().lower() in catalog_set:
        return "yes"
    return cur

# Re-evaluate every row: pair each restaurant name with its current flag
# and overwrite the flag column with the results.
full_df[FULL_FLAG_COL] = list(
    map(flag_match, full_df[FULL_REST_COL], full_df[FULL_FLAG_COL])
)

# Summary of the resulting flag values, missing ones included.
print("\n🏷 Flag counts after update:")
flag_counts = full_df[FULL_FLAG_COL].value_counts(dropna=False)
print(flag_counts)

# ------------------------------------------------------------
# 6) Save & download
# ------------------------------------------------------------
# Output file: the uploaded file's name without its extension, plus a suffix.
out_name = f"{os.path.splitext(full_name)[0]}_1960flag.csv"
# Write without the index column, then hand the file to the browser.
full_df.to_csv(path_or_buf=out_name, index=False)
files.download(out_name)
print(f"\n📥 Downloaded: {out_name}")
