# Scaler Comparison - Steam Games Dataset

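This notebook compares three feature-scaling approaches on the cleaned dataset:

1. PowerTransformer with the Yeo-Johnson method
2. QuantileTransformer with a uniform output distribution
3. RobustScaler (median- and IQR-based)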

In [None]:
import sys
# Add the sibling "src" directory to the import path. The path is relative to
# the current working directory, so the notebook must be run from its own folder.
sys.path.append("../src")

import pandas as pd
import numpy as np
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Project-local module; presumably resolved through the "../src" entry added above.
from data_preprocessing import (
    base_pipeline,
    final_cleaning_pipeline,
    power_scaler,
    quantile_scaler,
    robust_scaler,
)

# Remove pandas' column display limit so wide DataFrames are shown in full.
pd.set_option("display.max_columns", None)

## Data Preparation

In [None]:
# Point the pipeline's data-loading step at the raw games CSV, run the base
# pipeline, then pass its output through the final cleaning pipeline.
raw_games_csv = "../data/raw/games.csv"
base_pipeline.set_params(data_loading__filepath=raw_games_csv)
pre_outlier_df = base_pipeline.fit_transform(None)
base_df = final_cleaning_pipeline.fit_transform(pre_outlier_df)

# Column labels of every numeric column; later cells restrict their
# statistics and correlations to these columns.
numeric_columns = base_df.select_dtypes(include="number").columns

print(f"Base dataset shape: {base_df.shape}")
print(f"Numeric columns: {numeric_columns.tolist()}")

## Apply Different Scalers

In [None]:
# Fit every scaler on its own fresh copy of base_df, in the order
# power -> quantile -> robust.
df_power, df_quantile, df_robust = (
    scaler.fit_transform(base_df.copy())
    for scaler in (power_scaler, quantile_scaler, robust_scaler)
)

# Display label -> scaled frame; insertion order drives the order of later plots.
scaler_labels = (
    "PowerTransformer (Yeo-Johnson)",
    "QuantileTransformer (Uniform)",
    "RobustScaler",
)
scalers_data = dict(zip(scaler_labels, (df_power, df_quantile, df_robust)))

print("All scalers applied successfully!")

## Statistical Comparison

In [None]:
# describe() summary of the numeric columns, keyed by scaler label.
comparison_stats = {
    label: frame[numeric_columns].describe()
    for label, frame in scalers_data.items()
}

# For each scaler, report the spread (across columns) of each summary statistic.
for label, summary in comparison_stats.items():
    means = summary.loc["mean"]
    stds = summary.loc["std"]
    mins = summary.loc["min"]
    maxs = summary.loc["max"]

    print(f"\n=== {label} Statistics ===")
    print(f"Mean range: [{means.min():.3f}, {means.max():.3f}]")
    print(f"Std range: [{stds.min():.3f}, {stds.max():.3f}]")
    print(f"Min range: [{mins.min():.3f}, {mins.max():.3f}]")
    print(f"Max range: [{maxs.min():.3f}, {maxs.max():.3f}]")

## Distribution Comparison

### Select Key Columns for Detailed Analysis

In [None]:
# Columns whose distributions are inspected in detail.
key_columns = [
    "price",
    "average_playtime_forever",
    "estimated_owners_calculated",
]

# One entry per subplot, in row-major order: (subplot title, legend name, frame).
# Keeping titles and data in a single table guarantees each histogram sits
# under its own title.
histogram_panels = [
    ("Original", "Original", base_df),
    ("PowerTransformer (Yeo-Johnson)", "Power", df_power),
    ("RobustScaler", "Robust", df_robust),
    ("QuantileTransformer (Uniform)", "Quantile", df_quantile),
]

# Shared by every panel so no histogram looks lighter than the others for a
# purely cosmetic reason (the RobustScaler panel previously used 0.5).
histogram_opacity = 0.7
grid_cols = 2

for col in key_columns:
    fig = make_subplots(
        rows=2,
        cols=grid_cols,
        subplot_titles=[title for title, _, _ in histogram_panels],
        vertical_spacing=0.1,
    )

    for panel_idx, (_, legend_name, frame) in enumerate(histogram_panels):
        # Convert the flat panel index to 1-based (row, col) grid coordinates.
        grid_row, grid_col = divmod(panel_idx, grid_cols)
        fig.add_trace(
            go.Histogram(
                x=frame[col],
                name=legend_name,
                nbinsx=50,
                opacity=histogram_opacity,
            ),
            row=grid_row + 1,
            col=grid_col + 1,
        )

    fig.update_layout(
        title=f"Distribution Comparison: {col}", height=600, showlegend=True
    )

    fig.show()

## Correlation Matrix Comparison

In [None]:
# 2x2 grid of heatmap panels; with three scalers the last panel stays empty.
grid_size = 2
fig = make_subplots(
    rows=grid_size,
    cols=grid_size,
    subplot_titles=list(scalers_data.keys()),
    specs=[
        [{"type": "heatmap"} for _ in range(grid_size)]
        for _ in range(grid_size)
    ],
)

# 1-based (row, col) coordinates in row-major order.
positions = [(r, c) for r in (1, 2) for c in (1, 2)]

for idx, scaler_name in enumerate(scalers_data):
    corr = scalers_data[scaler_name][numeric_columns].corr()
    row, col = positions[idx]

    heatmap = go.Heatmap(
        z=corr.values,
        x=corr.columns,
        y=corr.columns,
        colorscale="RdBu",
        zmid=0,  # centre the diverging colour scale on zero correlation
        showscale=(idx == 0),  # draw the colour bar for the first panel only
        name=scaler_name,
    )
    fig.add_trace(heatmap, row=row, col=col)

fig.update_layout(
    title="Correlation Matrix Comparison Across Scalers",
    height=1000,
    width=1200,
)

fig.show()

## Outlier Sensitivity Analysis

In [None]:
# Columns for which the scaled distributions are compared as box plots.
selected_cols = [
    "price",
    "average_playtime_forever",
    "estimated_owners_calculated",
]

for col in selected_cols:
    # One box per scaler, in scalers_data order; only outlier points are drawn
    # individually.
    boxes = [
        go.Box(y=frame[col], name=label, boxpoints="outliers")
        for label, frame in scalers_data.items()
    ]

    fig = go.Figure(data=boxes)
    fig.update_layout(
        title=f"Outlier Comparison: {col}",
        yaxis_title="Scaled Values",
        height=500,
    )
    fig.show()