# Cleaning Using Polars

In [None]:
import os
import glob
import warnings
import polars as pl

# ─── Suppress noisy warnings ──────────────────────────────────────────────────
# Blanket filter: silences every warning category for the whole process.
warnings.filterwarnings("ignore")

# ─── Raw regex strings ────────────────────────────────────────────────────────
# A markup tag: "<" up to the next ">". The negated class (unlike ".") also
# matches newlines, so tags whose attributes wrap across lines are caught too.
HTML_RX     = r"<[^>]*>"
# Any character that is not an ASCII letter, an ASCII digit, or whitespace.
CLEAN_RX    = r"[^a-zA-Z0-9\s]"
# A run of one or more whitespace characters.
WS_RX       = r"\s+"
# Whitespace at the very start or the very end of the string.
TRIM_EDGES  = r"^\s+|\s+$"

# ─── Polars expression for cleaning a single column ───────────────────────────
def clean_expr(col_name: str) -> pl.Expr:
    """Build a Polars expression that normalises one text column.

    The column is cast to string and then run through four regex
    substitutions, in this order:

    1. ``HTML_RX`` matches (markup tags) are replaced by a single space.
    2. ``CLEAN_RX`` matches are deleted.
    3. ``WS_RX`` matches are collapsed to a single space.
    4. ``TRIM_EDGES`` matches are deleted.

    Args:
        col_name: Name of the column to clean.

    Returns:
        An un-aliased expression on ``col_name``.
    """
    text = pl.col(col_name).cast(str)
    # Substitute a space, not "", so words separated only by markup stay
    # separate ("one<br>two" -> "one two"); the surplus whitespace this can
    # introduce is removed by the collapse and trim steps below.
    without_tags = text.str.replace_all(HTML_RX, " ")
    alnum_only = without_tags.str.replace_all(CLEAN_RX, "")
    single_spaced = alnum_only.str.replace_all(WS_RX, " ")
    return single_spaced.str.replace_all(TRIM_EDGES, "")

def main():
    """Clean the text columns of every parquet file in the input directory.

    Each ``*.parquet`` file under ``data_dir`` is read, every column listed in
    ``html_fields`` that exists in the file is rewritten with ``clean_expr``,
    and the result is written under the same file name to ``output_dir``
    (created if missing) with snappy compression. A single progress line is
    updated in place on stdout.

    If no input files are found, a warning is printed and the function
    returns without claiming success.
    """
    data_dir    = "../../parquet_output_2_extras_with_names"
    output_dir  = "./cleaned_data_polars"
    html_fields = [
        "detailed_description",
        "about_the_game",
        "short_description",
        "review",
    ]
    os.makedirs(output_dir, exist_ok=True)

    # glob yields matches in arbitrary order; sort so runs are reproducible.
    files = sorted(glob.glob(os.path.join(data_dir, "*.parquet")))
    total = len(files)
    print(f"Found {total} files to clean")

    # An empty match usually means a wrong input path: say so instead of
    # reporting a successful run that did nothing.
    if not files:
        print(f"⚠️ No .parquet files found in {data_dir!r}; nothing was cleaned")
        return

    for cleaned, path in enumerate(files, start=1):
        fname = os.path.basename(path)

        # Read, clean, and write
        df = pl.read_parquet(path)
        # Only touch the text columns this particular file actually has.
        exprs = [clean_expr(col) for col in html_fields if col in df.columns]
        if exprs:
            df = df.with_columns(exprs)
        df.write_parquet(os.path.join(output_dir, fname), compression="snappy")

        # "\r" returns to the line start so the progress line is overwritten.
        print(f"\rCleaned {cleaned}/{total} files", end="", flush=True)

    print("\n✅ All files cleaned and saved to", output_dir)

# Run the cleaning pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
