In [None]:
import pandas as pd
import os

# Total Number of entries in all the parquet files using Dask

In [None]:
import dask.dataframe as dd
import os
import glob

# 1) Gather only non-empty .parquet files
base_dir = "cleaned_data_polars"
good_files = []
for root, _, files in os.walk(base_dir):
    for f in files:
        if f.endswith(".parquet"):
            path = os.path.join(root, f)
            if os.path.getsize(path) > 0:
                good_files.append(path.replace("\\", "/"))  # normalize on Windows

if not good_files:
    print("No valid parquet files found.")
    exit(1)

# 2) Create Dask dataframe from parquet files
# The read_parquet function can take a list of files
df = dd.read_parquet(good_files)
# 3) Count rows (compute triggers actual execution)
total_reviews = len(df)  # This returns a standard Python integer
print(f"Total reviews: {total_reviews}")

# Total Number of entries in all the parquet files using Polars

In [None]:
import os, glob, polars as pl

# 1) collect non-empty Parquet files
base_dir = "cleaned_data_polars"
good_files = [
    os.path.join(root, f).replace("\\","/")
    for root,_,files in os.walk(base_dir)
    for f in files
    if f.endswith(".parquet") and os.path.getsize(os.path.join(root, f)) > 0
]
if not good_files:
    print("No valid parquet files found.")
    exit(1)

# 2) eager-read + relaxed concat
dfs = [pl.read_parquet(f) for f in good_files]
df = pl.concat(dfs, how="vertical_relaxed")

# 3) count rows
print(f"Total reviews: {df.height}")

# Cleaning using Dask

In [None]:
import os
import glob
import re
import warnings

import pandas as pd
from dask.distributed import Client, as_completed

# ─── Suppress non-critical warnings ────────────────────────────────────────────
warnings.filterwarnings("ignore")

# ─── Your cleaning funcs ───────────────────────────────────────────────────────
def clean_review(text):
    try:
        return re.sub(r'[^a-zA-Z0-9\s]', '', str(text))
    except Exception:
        return ""

def remove_html_tags(text):
    try:
        return re.sub(r'<.*?>', '', str(text))
    except Exception:
        return ""

def clean_whitespace(text):
    try:
        t = str(text)
        t = re.sub(r'\n+', ' ', t)
        t = t.replace('\u3000', ' ')
        return re.sub(r'\s+', ' ', t).strip()
    except Exception:
        return ""

def full_clean(text):
    t = remove_html_tags(text)
    t = clean_review(t)
    return clean_whitespace(t)

# ─── Per-file task ─────────────────────────────────────────────────────────────
def process_file(path, output_dir, fields):
    df = pd.read_parquet(path)
    for col in fields:
        if col in df.columns:
            df[col] = df[col].apply(full_clean)
    out = os.path.join(output_dir, os.path.basename(path))
    df.to_parquet(out, compression="snappy")
    return True

# ─── Main ─────────────────────────────────────────────────────────────────────
if __name__ == "__main__":
    data_dir   = "../../parquet_output_2_extras"
    output_dir = "./cleaned_data_dask"
    html_fields = [
        "detailed_description",
        "about_the_game",
        "short_description",
        "review",
    ]
    os.makedirs(output_dir, exist_ok=True)

    files = glob.glob(os.path.join(data_dir, "*.parquet"))
    total = len(files)
    print(f"Found {total} files to clean")

    # 1) Spin up a local cluster (auto sizing)
    client = Client()  

    # 2) Submit all tasks
    futures = [
        client.submit(process_file, f, output_dir, html_fields)
        for f in files
    ]

    # 3) As each future completes, update a single-line counter
    cleaned = 0
    for future in as_completed(futures):
        try:
            future.result()
            cleaned += 1
        except Exception:
            pass
        # \r returns to line start; end='' prevents newline; flush=True forces update
        print(f"\rCleaned {cleaned}/{total} files", end="", flush=True)

    print("\n✅ All done!")  # final newline
    client.close()


# Cleaning Using Polars

In [None]:
import os
import glob
import warnings
import polars as pl

# ─── Suppress noisy warnings ──────────────────────────────────────────────────
warnings.filterwarnings("ignore")

# ─── Raw regex strings ────────────────────────────────────────────────────────
HTML_RX     = r"<.*?>"
CLEAN_RX    = r"[^a-zA-Z0-9\s]"
WS_RX       = r"\s+"
TRIM_EDGES  = r"^\s+|\s+$"

# ─── Polars expression for cleaning a single column ───────────────────────────
def clean_expr(col_name: str) -> pl.Expr:
    return (
        pl.col(col_name).cast(str)
          .str.replace_all(HTML_RX, "")
          .str.replace_all(CLEAN_RX, "")
          .str.replace_all(WS_RX, " ")
          .str.replace_all(TRIM_EDGES, "")
    )

def main():
    data_dir    = "../../parquet_output_2_extras_with_names"
    output_dir  = "./cleaned_data_polars"
    html_fields = [
        "detailed_description",
        "about_the_game",
        "short_description",
        "review",
    ]
    os.makedirs(output_dir, exist_ok=True)

    files = glob.glob(os.path.join(data_dir, "*.parquet"))
    total = len(files)
    print(f"Found {total} files to clean")

    cleaned = 0
    for path in files:
        fname = os.path.basename(path)

        # Read, clean, and write
        df = pl.read_parquet(path)
        exprs = [clean_expr(col) for col in html_fields if col in df.columns]
        if exprs:
            df = df.with_columns(exprs)
        df.write_parquet(os.path.join(output_dir, fname), compression="snappy")

        # Update and overwrite the progress line
        cleaned += 1
        print(f"\rCleaned {cleaned}/{total} files", end="", flush=True)

    print("\n✅ All files cleaned and saved to", output_dir)

if __name__ == "__main__":
    main()


# EDA with dask and polars

In [19]:
import numpy as np
import pandas as pd
import dask.dataframe as dd
import matplotlib.pyplot as plt
from dask.diagnostics import ProgressBar

def main():
    """
    Analyzes Steam game review data using Dask for distributed processing.
    Performs three key EDA tasks: review volume analysis, sentiment analysis,
    and playtime distribution analysis.
    """
    # Enable progress bar for better visibility of Dask operations
    ProgressBar().register()
    
    # Load all parquet files in one call using Dask
    print("Reading parquet files...")
    df = dd.read_parquet("cleaned_data_polars/*.parquet")
    
    # Data preparation
    # ---------------
    # Handle missing values in key columns
    df['author_playtime_forever'] = df['author_playtime_forever'].fillna(0)
    # Convert boolean voted_up to integer for aggregation (True=1, False=0)
    df['voted_up_int'] = df['voted_up'].astype('bool').astype('int')
    # Convert playtime from minutes to hours
    df['playtime_hours'] = df['author_playtime_forever'] / 60.0
    
    # Task 1: Volume per game
    # ----------------------
    print("\n=== VOLUME PER GAME ===")
    # Count reviews per game
    game_review_counts = df.groupby('steam_appid').size().reset_index(name='review_count')
    # Compute and sort by review count (descending)
    top_games_by_volume = game_review_counts.compute().sort_values('review_count', ascending=False).head(20)
    print("Top 20 games by review count:")
    print(top_games_by_volume)
    
    # Task 2: Sentiment proxy – votes-up ratio
    # --------------------------------------
    print("\n=== SENTIMENT PROXY - VOTES-UP RATIO ===")
    # Calculate sum of positive reviews and total count per game
    sentiment_agg = df.groupby('steam_appid').agg({
        'voted_up_int': ['sum', 'count']
    }).compute()
    
    # Process results in pandas
    sentiment_agg.columns = ['votes_up_sum', 'review_count']
    sentiment_agg['positive_ratio'] = sentiment_agg['votes_up_sum'] / sentiment_agg['review_count']
    
    # Filter games with at least 100 reviews
    top_sentiment = (
        sentiment_agg[sentiment_agg['review_count'] >= 100]
        .sort_values('positive_ratio', ascending=False)
        .head(20)
        .reset_index()
    )
    
    print("Top 20 games by positive ratio (minimum 100 reviews):")
    print(top_sentiment[['steam_appid', 'review_count', 'positive_ratio']])
    
    # Task 3: Play-time distributions
    # -----------------------------
    print("\n=== PLAY-TIME DISTRIBUTIONS ===")
    
    # Dask requires separate computations for different percentiles
    playtime_stats_tasks = {
        'mean_hours': df.groupby('steam_appid')['playtime_hours'].mean(),
        'median_hours': df.groupby('steam_appid')['playtime_hours'].quantile(0.5),
        'percentile_95_hours': df.groupby('steam_appid')['playtime_hours'].quantile(0.95)
    }
    
    # Execute all Dask tasks and collect results
    playtime_stats_results = {k: v.compute() for k, v in playtime_stats_tasks.items()}
    
    # Combine into a single DataFrame
    playtime_stats_df = pd.DataFrame(playtime_stats_results)
    playtime_stats_df = playtime_stats_df.reset_index()
    
    print("Playtime statistics per game (showing first 20):")
    print(playtime_stats_df.head(20))
    
    # Generate global histogram of playtimes
    # ------------------------------------
    print("Generating playtime histogram...")
    
    # For large datasets, sample to avoid memory issues
    estimated_size = df.shape[0].compute()
    
    if estimated_size > 1_000_000:
        # Use a sampling fraction that gives us at most 1M records
        sample_frac = min(1_000_000 / estimated_size, 1.0)
        print(f"Sampling {sample_frac:.2%} of data for histogram ({estimated_size:,} records)")
        playtime_data = df['playtime_hours'].sample(frac=sample_frac).compute()
    else:
        # For smaller datasets, use all data
        playtime_data = df['playtime_hours'].compute()
    
    # Filter outliers for better visualization (keep playtimes under 1000 hours)
    filtered_playtime = playtime_data[playtime_data < 1000]
    
    # Create histogram with 1-hour bins
    plt.figure(figsize=(12, 8))
    plt.hist(filtered_playtime, bins=np.arange(0, 1000, 1), alpha=0.75)
    plt.title('Distribution of Playtime Hours (excluding outliers > 1000 hours)')
    plt.xlabel('Playtime (hours)')
    plt.ylabel('Number of Reviews')
    plt.grid(True, linestyle='--', alpha=0.7)
    
    # Add vertical lines for key statistics
    global_mean = filtered_playtime.mean()
    global_median = filtered_playtime.median()
    
    plt.axvline(global_mean, color='r', linestyle='--', label=f'Mean: {global_mean:.2f} hours')
    plt.axvline(global_median, color='g', linestyle='--', label=f'Median: {global_median:.2f} hours')
    
    plt.legend()
    plt.tight_layout()
    
    # Save the histogram
    plt.savefig('playtime_hist.png', dpi=300)
    print("Histogram saved as 'playtime_hist.png'")

if __name__ == "__main__":
    main()

Reading parquet files...

=== VOLUME PER GAME ===


TypeError: FrameBase.reset_index() got an unexpected keyword argument 'name'