In [None]:
# ===============================================================
# 0.  Upload & read sheet (CSV or Excel)
# ===============================================================
from google.colab import files
import pandas as pd, io, os, re, numpy as np

upl   = files.upload()                     # ← choose your file
fname = next(iter(upl))                    # first key of the upload result, used as the file name
ext   = os.path.splitext(fname)[1].lower() # lower-cased so ".CSV" and ".csv" are treated alike
b     = io.BytesIO(upl[fname])             # in-memory buffer handed to the pandas reader

# Choose the pandas reader from the file extension; anything else is rejected.
if ext == ".csv":
    df = pd.read_csv(b)
elif ext in (".xlsx", ".xls"):
    df = pd.read_excel(b)
else:
    raise ValueError("Upload a .csv / .xlsx / .xls file")

# ===============================================================
# 1.  Parser helpers
# ===============================================================
# Current calendar year in UTC (an int, despite the name).  Built from a
# tz-aware "now" because Timestamp.utcnow() is deprecated in recent pandas.
TODAY             = pd.Timestamp.now(tz="UTC").year
STILL_OPEN_TOKEN  = ""          # put "still open" if you want to keep the text

# Any run of four consecutive digits is treated as a candidate year.
YEAR4   = re.compile(r"\d{4}")

def decade_bounds(s):
    """
    Map a decade token to the first and last year of that decade.

    Only the first three characters of ``s`` are read, so "195?" gives
    (1950, 1959).
    """
    first_year = 10 * int(s[:3])
    last_year = first_year + 9
    return first_year, last_year

def expand_short(a, b):
    """
    Expand an abbreviated end year against its start year.

    Parameters
    ----------
    a : str
        Start year as a digit string, e.g. "1955".
    b : str
        End year as a digit string; may be shorter than ``a``
        (e.g. "60", "6", "960"), in which case the missing leading
        digits are borrowed from ``a``.

    Returns
    -------
    (int, int)
        ``(start, end)``, e.g. ("1955", "60") → (1955, 1960).
    """
    start = int(a)
    if len(b) >= len(a):
        # Already a full year: nothing to borrow.
        return start, int(b)

    end = int(a[: len(a) - len(b)] + b)
    if end < start:
        # The abbreviated end wrapped past the borrowed prefix,
        # e.g. "1998"-"02" means 2002, not 1902.
        end += 10 ** len(b)
    return start, end

def _first_year(text):
    """Return the first run of four digits in *text* as an int, or None."""
    found = YEAR4.search(text)
    return int(found.group()) if found else None


def cell_to_range(val, is_closed_col=False):
    """
    Return (min_year, max_year) from a messy cell.

    Parameters
    ----------
    val
        Raw cell value.  It is stringified, stripped and lower-cased
        before any pattern is tried.
    is_closed_col : bool
        When True, a "still open" cell gets TODAY as its upper bound.

    Returns
    -------
    tuple
        ``(lo, hi)``; either side is np.nan when that bound is unknown.
        NaNs become (np.nan, np.nan), as do cells with no recognisable year.
    """
    if pd.isna(val):
        return np.nan, np.nan

    s = str(val).strip().lower()

    # Excel date‐time like "1942/3/1 00:00"
    dt = pd.to_datetime(s, errors="coerce")
    if pd.notna(dt):
        y = int(dt.year)
        return y, y

    # ---- keywords ------------------------------------------------
    if "still open" in s:
        upper = TODAY if is_closed_col else np.nan
        return np.nan, upper

    # Keyword rules need a year to anchor on.  A cell that contains a
    # keyword but no 4-digit year (e.g. "closed by owner") falls through
    # to the generic parsing below instead of raising AttributeError.
    y = _first_year(s)
    if y is not None:
        if "not in" in s and "directory" in s:
            return y + 1, np.nan
        if ("in" in s and "directory" in s) or "by" in s or "before" in s or "≤" in s:
            return y, y
        if "after" in s:
            return y, np.nan

    if "between" in s:
        # Strip again so the fullmatch patterns below can still apply to
        # what is left (e.g. "between 1955-60").
        s = s.replace("between", "").strip()

    if "til at least" in s or "to at least" in s:
        y = _first_year(s)
        if y is not None:
            return np.nan, y

    # ---- decade shorthand ----------------------------------------
    m = re.fullmatch(r"(\d{4})s", s)
    if m:
        return decade_bounds(m.group(1)[:3] + "?")

    if re.fullmatch(r"\d{3}[\*\?]", s):
        return decade_bounds(s)

    # ---- short or long ranges ------------------------------------
    # Abbreviated end year ("1955-60"): let expand_short fill in the rest.
    m = re.fullmatch(r"(\d{4})[-/](\d{1,3})", s)
    if m:
        return expand_short(*m.groups())

    # Fully written range ("1955-1960").
    m = re.fullmatch(r"(\d{4})[-/](\d{4})", s)
    if m:
        return int(m.group(1)), int(m.group(2))

    # ---- single or multiple years sprinkled in text --------------
    yrs = [int(y) for y in YEAR4.findall(s)]
    if yrs:
        return min(yrs), max(yrs)

    return np.nan, np.nan

def rng_to_string(lo, hi):
    """
    Format a (lo, hi) year pair for output.

    Both bounds missing  → ''
    One bound missing    → the other bound, e.g. '1968'
    Equal bounds         → that year, e.g. '1956'
    Otherwise            → 'lo–hi' joined with an en-dash, e.g. '1955–1960'
    """
    missing_lo, missing_hi = pd.isna(lo), pd.isna(hi)

    if missing_lo and missing_hi:
        return ""

    if missing_lo or missing_hi or lo == hi:
        # Only one distinct year is known: print whichever bound exists.
        known = hi if missing_lo else lo
        return str(int(known))

    return f"{int(lo)}–{int(hi)}"  # nice en-dash


# ===============================================================
# 2.  Overwrite the four columns in place
# ===============================================================
# Column name → whether it is the "closed" column, which gets the
# special “still open” handling.
YEAR_COLS = {
    "year opened":                False,
    "year closed":                True,
    "earliest year for artifact": False,
    "latest year for artifact":   False,
}

for col, is_closed in YEAR_COLS.items():
    if col not in df.columns:
        print(f"⚠️  {col} not found – skipped")
        continue

    column_out = []
    for raw in df[col]:
        bounds = cell_to_range(raw, is_closed_col=is_closed)
        # Keep the literal token only for the closed column, only for
        # string cells mentioning "still open", and only if a token is set.
        keep_token = (
            is_closed
            and isinstance(raw, str)
            and "still open" in raw.lower()
            and bool(STILL_OPEN_TOKEN)
        )
        column_out.append(STILL_OPEN_TOKEN if keep_token else rng_to_string(*bounds))

    df[col] = column_out

# ===============================================================
# 3.  Quick check & save
# ===============================================================
# Show only the year columns that actually exist in the sheet: selecting a
# missing column by label raises KeyError, and missing columns are tolerated
# (skipped with a warning) by the overwrite step.
display(df[[c for c in YEAR_COLS if c in df.columns]])

# Write the cleaned sheet next to the original name and hand it to the browser.
out = os.path.splitext(fname)[0] + "_dates_clean.csv"
df.to_csv(out, index=False)
files.download(out)
print("✅ cleaned file downloaded:", out)
