In [None]:
# =============================================
# 0.  Upload & read (CSV or Excel)
# =============================================
from google.colab import files
import pandas as pd, io, os, re

# `uploaded` is used below as a mapping of file name -> raw file bytes.
uploaded = files.upload()
if not uploaded:
    # Nothing to work with (e.g. the upload dialog was dismissed). Without
    # this guard, next() below would surface as a bare StopIteration.
    raise ValueError("No file was uploaded")

# NOTE: only the first uploaded file is processed; any others are ignored.
fname     = next(iter(uploaded))
ext       = os.path.splitext(fname)[1].lower()   # lower-cased so ".CSV" etc. match
raw_bytes = uploaded[fname]

# Choose the pandas reader from the file extension.
if ext in {".xlsx", ".xls"}:
    df = pd.read_excel(io.BytesIO(raw_bytes))
elif ext == ".csv":
    df = pd.read_csv(io.BytesIO(raw_bytes))
else:
    raise ValueError("Please upload a .csv, .xlsx, or .xls file")

# ------- make sure this matches your header exactly ---------
# Name of the column whose values are normalized further down.
PATTERN_COL = "pattern"         # ← adjust if your sheet uses another label
# ------------------------------------------------------------

# =============================================
# 1.  Canonical legend + cleaner
# =============================================
# Canonical label -> regex searched in the lower-cased cell text.
# Insertion order matters: normalize() returns the first label whose regex
# matches, so a cell mentioning several motifs gets the earliest entry.
LEGEND = {
    "Bird and Flower"        : r"bird.*flower|peacock|pine tree",
    "Dragon and Phoenix"     : r"dragon|phoenix|thunder border|flaming pearl|red dragon",
    "God of Longevity"       : r"longevity",
    "Women / Characters"     : r"women|character",
    "Individualized Logo"    : r"logo",
    "Lustre Floral"          : r"lustr|luster",
    "Custom / Misc Pattern"  : r"custom",
}

# Text forms (after str/strip/lower) that mean "no pattern recorded":
# empty text, zero as int or float, float NaN, and pandas' NA scalar.
_BLANK_TOKENS = frozenset({"", "0", "0.0", "nan", "<na>"})


def normalize(raw):
    """Map one raw pattern cell to its canonical label.

    The value is stringified, stripped and lower-cased, then tested against
    each LEGEND regex in insertion order; the first match wins.

    Args:
        raw: Cell value of any type (text, number, NaN, None, ...).

    Returns:
        "Custom / Misc Pattern" for blank or missing cells, the first
        matching LEGEND key otherwise, or "Other" when nothing matches.
    """
    # None would stringify to "none" and slip past the blank check.
    if raw is None:
        return "Custom / Misc Pattern"
    s = str(raw).strip().lower()
    if s in _BLANK_TOKENS:                  # blanks ⇒ misc
        return "Custom / Misc Pattern"
    for canon, rx in LEGEND.items():
        if re.search(rx, s):
            return canon
    return "Other"                          # safety net

# =============================================
# 2.  Apply cleaner  & overwrite original col
# =============================================
# Fail early with a readable message when the configured header is absent;
# the plain pandas KeyError gives no hint about which headers do exist.
if PATTERN_COL not in df.columns:
    raise KeyError(
        f"Column {PATTERN_COL!r} not found. "
        f"Available columns: {list(df.columns)}"
    )

# Replace every raw value with its canonical label, overwriting the column.
df[PATTERN_COL] = df[PATTERN_COL].apply(normalize)

# quick sanity-check: how many rows landed in each canonical bucket
print("\n=== Canonical counts ===")
print(df[PATTERN_COL].value_counts())

# =============================================
# 3.  Save cleaned sheet
# =============================================
# Output name: the uploaded file's name minus its extension, plus "_clean.csv".
clean_name = f"{os.path.splitext(fname)[0]}_clean.csv"

# Write the cleaned frame without the index column, then hand the file to
# the Colab download helper.
df.to_csv(clean_name, index=False)
files.download(clean_name)

print("\nClean file saved & downloaded:  " + clean_name)