# Cleaning Using Polars

In [1]:
import os
import glob
import warnings
import polars as pl

# ─── Suppress noisy warnings ──────────────────────────────────────────────────
warnings.filterwarnings("ignore")

# ─── Raw regex strings ────────────────────────────────────────────────────────
HTML_RX     = r"<.*?>"
CLEAN_RX    = r"[^a-zA-Z0-9\s]"
WS_RX       = r"\s+"
TRIM_EDGES  = r"^\s+|\s+$"

# ─── Polars expression for cleaning a single column ───────────────────────────
def clean_expr(col_name: str) -> pl.Expr:
    return (
        pl.col(col_name).cast(str)
          .str.replace_all(HTML_RX, "")
          .str.replace_all(CLEAN_RX, "")
          .str.replace_all(WS_RX, " ")
          .str.replace_all(TRIM_EDGES, "")
    )

def main():
    data_dir    = "../../parquet_output_2_extras_with_names"
    output_dir  = "./cleaned_data_polars_2_trial"
    html_fields = [
        "detailed_description",
        "about_the_game",
        "short_description",
        "review",
    ]
    os.makedirs(output_dir, exist_ok=True)

    files = glob.glob(os.path.join(data_dir, "*.parquet"))
    total = len(files)
    print(f"Found {total} files to clean")

    cleaned = 1
    for path in files:
        fname = os.path.basename(path)

        # Read, clean, and write
        df = pl.read_parquet(path)
        exprs = [clean_expr(col) for col in html_fields if col in df.columns]
        if exprs:
            df = df.with_columns(exprs)
        df.write_parquet(os.path.join(output_dir, fname), compression="snappy")

        # Update and overwrite the progress line
        cleaned += 1
        print(f"\rCleaned {cleaned}/{total} files", end="", flush=True)

    print("\n✅ All files cleaned and saved to", output_dir)

if __name__ == "__main__":
    main()


Found 16732 files to clean
Cleaned 16733/16732 files
✅ All files cleaned and saved to ./cleaned_data_polars_2_trial


# Using Dask cuDF

In [None]:
import os
import glob
import warnings
import cudf
import re
import numpy as np
from dask import compute, delayed
import time
from concurrent.futures import ThreadPoolExecutor
import psutil

# ─── Suppress noisy warnings ──────────────────────────────────────────────────
warnings.filterwarnings("ignore")

# ─── Raw regex strings ────────────────────────────────────────────────────────
HTML_RX     = r"<.*?>"
CLEAN_RX    = r"[^a-zA-Z0-9\s]"
WS_RX       = r"\s+"
TRIM_EDGES  = r"^\s+|\s+$"

# ─── Function for cleaning text columns with cuDF ─────────────────────────────
def clean_text_series(series):
    # Convert to string type if not already
    series = series.astype('str')
    
    # Apply regex replacements (all at once to maximize GPU utilization)
    series = series.str.replace(HTML_RX, "", regex=True)
    series = series.str.replace(CLEAN_RX, "", regex=True)
    series = series.str.replace(WS_RX, " ", regex=True)
    series = series.str.replace(TRIM_EDGES, "", regex=True)
    
    return series

def process_file_batch(file_batch, output_dir, html_fields):
    """Process multiple files at once to maximize GPU memory usage"""
    results = []
    
    # Load multiple files into memory at once
    dataframes = []
    filenames = []
    
    for path in file_batch:
        try:
            fname = os.path.basename(path)
            gdf = cudf.read_parquet(path)
            dataframes.append(gdf)
            filenames.append(fname)
        except Exception as e:
            print(f"\nError loading {path}: {str(e)}")
            results.append(False)
    
    # Process all loaded dataframes
    for idx, gdf in enumerate(dataframes):
        try:
            fname = filenames[idx]
            output_path = os.path.join(output_dir, fname)
            
            # Clean the HTML fields that exist
            for col in html_fields:
                if col in gdf.columns:
                    if gdf[col].dtype == np.dtype('O') or gdf[col].dtype == 'string':
                        gdf[col] = clean_text_series(gdf[col])
            
            # Write the cleaned DataFrame
            gdf.to_parquet(output_path, compression="snappy")
            results.append(True)
        except Exception as e:
            print(f"\nError processing {filenames[idx]}: {str(e)}")
            results.append(False)
    
    # Clear references to free GPU memory
    dataframes = None
    
    return results

def main():
    data_dir    = "../../parquet_output_2_extras_with_names"
    output_dir  = "./cleaned_data_cudf"
    html_fields = [
        "detailed_description",
        "about_the_game",
        "short_description",
        "review",
    ]
    os.makedirs(output_dir, exist_ok=True)

    files = glob.glob(os.path.join(data_dir, "*.parquet"))
    total = len(files)
    print(f"Found {total} files to clean")

    # Start with large batch size to maximize GPU utilization
    batch_size = 500
    max_workers = psutil.cpu_count(logical=False)  # Use physical cores
    
    cleaned = 0
    start_time = time.time()
    
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        for i in range(0, len(files), batch_size):
            try:
                batch_files = files[i:i+batch_size]
                
                # Process this batch of files
                results = process_file_batch(batch_files, output_dir, html_fields)
                
                # Update progress with timing information
                successful = sum(1 for r in results if r)
                cleaned += successful
                elapsed = time.time() - start_time
                files_per_second = cleaned / elapsed if elapsed > 0 else 0
                
                print(f"\rCleaned {cleaned}/{total} files ({files_per_second:.1f} files/sec)", end="", flush=True)
                
            except Exception as e:
                # If we run out of memory, reduce batch size and retry
                if "out of memory" in str(e).lower():
                    old_batch_size = batch_size
                    batch_size = max(int(batch_size * 0.7), 10)
                    print(f"\nOut of memory with batch size {old_batch_size}. Reducing to {batch_size} and retrying...")
                    
                    # Force memory cleanup
                    import gc
                    gc.collect()
                    
                    # Retry this batch
                    i -= old_batch_size
                else:
                    print(f"\nError processing batch: {str(e)}")
            
            # Force memory cleanup between batches
            import gc
            gc.collect()
    
    total_time = time.time() - start_time
    print(f"\n✅ All files cleaned and saved to {output_dir}")
    print(f"⏱️ Total processing time: {total_time:.2f} seconds ({total/total_time:.1f} files/sec average)")

if __name__ == "__main__":
    main()

Found 16732 files to clean
Cleaned 7500/16732 files (5.5 files/sec)