In [1]:

import os, time, pandas as pd, numpy as np, re

def snake_case(name: str) -> str:
    """Convert an arbitrary label into a lowercase, underscore-separated name.

    Surrounding whitespace is trimmed, every run of non-word characters
    becomes a single underscore, an underscore is inserted where a lowercase
    letter or digit is directly followed by an uppercase letter, repeated
    underscores are collapsed, and the result is lowercased with leading and
    trailing underscores removed.
    """
    # Applied strictly in this order; each step works on the previous result.
    substitutions = (
        (r"[^\w]+", "_"),                      # punctuation / spaces -> "_"
        (r"([a-z0-9])([A-Z])", r"\1_\2"),      # camelCase boundary -> "_"
        (r"_+", "_"),                          # collapse repeated "_"
    )
    result = name.strip()
    for pattern, replacement in substitutions:
        result = re.sub(pattern, replacement, result)
    return result.lower().strip("_")

def standardize_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with every column label run through ``snake_case``.

    The input frame is left untouched; only the labels of the copy change.
    """
    renamed = df.copy()
    renamed.columns = list(map(snake_case, renamed.columns))
    return renamed

def strip_strings(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with text columns trimmed and null markers unified.

    For every column whose dtype is ``object``, each value is converted to
    ``str`` and stripped of surrounding whitespace. Values that are then
    empty, ``"None"`` or ``"NULL"`` become ``NaN``. Values that were already
    missing in the input stay missing instead of turning into text.

    Args:
        df: Input frame; it is not modified.

    Returns:
        A new DataFrame. Columns with any other dtype are copied unchanged.
    """
    null_tokens = ["", "None", "NULL"]
    df = df.copy()
    for c in df.columns:
        if df[c].dtype == "object":
            original = df[c]
            text = original.astype(str).str.strip()
            # astype(str) can render a missing value as literal text such as
            # "nan" or "<NA>", so missingness is read from the original column
            # rather than from the stringified one.
            is_null = original.isna() | text.isin(null_tokens)
            df[c] = text.mask(is_null, np.nan)
    return df

def infer_types(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with column dtypes guessed from names and content.

    Rules are tried per column in this order (names compared lowercased); the
    first rule whose name test matches decides the column:

    1. Name contains ``"date"``, ``"dt"`` or ``"fecha"``: parse with
       ``pd.to_datetime(errors="coerce")``.
    2. Name contains ``"id"``: convert with ``pd.to_numeric(errors="coerce")``
       and store as nullable ``Int64`` when the values allow it, otherwise
       keep the plain numeric result.
    3. Any remaining ``object`` column: convert to numeric when more than 60%
       of its rows parse as numbers.

    The name tests are plain substring matches, so they also hit unrelated
    labels such as ``"middle_name"`` or ``"update_note"``. To stop such false
    positives from erasing data, rules 1 and 2 leave the column untouched when
    the conversion would turn every existing value into a null.

    Args:
        df: Input frame with string column labels; it is not modified.

    Returns:
        A new DataFrame with the converted columns.
    """
    df = df.copy()
    for c in df.columns:
        lc = c.lower()
        col = df[c]
        has_values = col.notna().any()

        if any(k in lc for k in ["date", "dt", "fecha"]):
            parsed = pd.to_datetime(col, errors="coerce")
            # Skip the assignment only when parsing would wipe out a column
            # that actually held data (name matched, content is not dates).
            if parsed.notna().any() or not has_values:
                df[c] = parsed
            continue

        if "id" in lc:
            num = pd.to_numeric(col, errors="coerce")
            if has_values and not num.notna().any():
                # Nothing numeric in here (e.g. text or alphanumeric codes):
                # keep the original values instead of nulling them all.
                continue
            try:
                df[c] = num.astype("Int64")
            except (TypeError, ValueError):
                # The Int64 cast refuses non-integral or non-finite numbers;
                # keep the numeric result rather than aborting the whole run.
                df[c] = num
            continue

        if col.dtype == "object":
            num = pd.to_numeric(col, errors="coerce")
            if num.notna().mean() > 0.6:
                df[c] = num
    return df

def clean_basic(df: pd.DataFrame) -> pd.DataFrame:
    """Run the standard cleaning pipeline on ``df`` and return the result.

    Steps, in order: normalize column labels, drop rows in which every value
    is missing, trim text columns, then infer column dtypes.
    """
    return (
        df.pipe(standardize_columns)
        .dropna(how="all")
        .pipe(strip_strings)
        .pipe(infer_types)
    )

# Batch driver: turn every "*_raw.csv" in the working directory into a cleaned
# "*_clean.csv" next to it, reporting progress on stdout.

# perf_counter is monotonic, so the reported duration cannot be skewed by a
# system clock adjustment the way a difference of time.time() values can.
start = time.perf_counter()
print("Start: RAW -> CLEAN")

cwd = os.getcwd()
# os.listdir returns entries in arbitrary order; sort for a reproducible log.
raw_files = sorted(f for f in os.listdir(cwd) if f.endswith("_raw.csv"))
if not raw_files:
    print("No _raw.csv files found in current directory.")
else:
    for fname in raw_files:
        path = os.path.join(cwd, fname)
        try:
            df = pd.read_csv(path, low_memory=False)
        except Exception as e:
            # Best effort: report the unreadable file and move on to the next.
            print(f"Error reading {fname}: {e}")
            continue
        cleaned = clean_basic(df)
        # Swap only the trailing suffix; str.replace would also rewrite an
        # earlier "_raw.csv" that happens to occur inside the file name.
        out_name = fname[: -len("_raw.csv")] + "_clean.csv"
        out_path = os.path.join(cwd, out_name)
        cleaned.to_csv(out_path, index=False, encoding="utf-8-sig")
        print(f"Wrote {out_name} with shape {cleaned.shape}")

end = time.perf_counter()
print(f"Completed RAW -> CLEAN in {end - start:.2f} seconds")

Start: RAW -> CLEAN
Wrote common_player_info_clean.csv with shape (4171, 33)
Wrote game_clean.csv with shape (65698, 55)
Wrote game_summary_clean.csv with shape (58110, 14)
Wrote other_stats_clean.csv with shape (28271, 26)
Wrote player_clean.csv with shape (4831, 5)
Wrote team_clean.csv with shape (30, 7)
Completed RAW -> CLEAN in 5.57 seconds
