# Data Exploration - Steam Games Dataset

## Import Libraries and Setup


In [None]:
import sys

# Extend the module search path with ../src before the project import below;
# presumably that is where data_preprocessing lives (verify against the repo layout).
sys.path.append("../src")

import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

from data_preprocessing import base_pipeline, final_cleaning_pipeline, scaling_pipeline


# Lift pandas' column display limit so wide frames are shown without truncation.
pd.set_option("display.max_columns", None)

## Pipeline-Based Preprocessing


In [None]:
# Point the pipeline's `data_loading` step at the raw CSV (step__param naming;
# presumably an sklearn-style Pipeline -- confirm in data_preprocessing.py).
base_pipeline.set_params(data_loading__filepath="../data/raw/games.csv")

# No input frame is passed here; presumably the data_loading step reads the
# file configured above -- TODO confirm.
pre_outlier_df = base_pipeline.fit_transform(None)
# Result of the final cleaning stage, kept unscaled for the overview and
# histogram cells further down.
pre_scaling_df = final_cleaning_pipeline.fit_transform(pre_outlier_df)

print(f"Pre-scaling Dataset shape: {pre_scaling_df.shape}")
print(f"Columns after preprocessing: {list(pre_scaling_df.columns)}")
pre_scaling_df.info()

# To try a different scaler (PowerTransformerScaler, QuantileTransformerScaler,
# RobustTransformerScaler), change it in data_preprocessing.py.
df = scaling_pipeline.fit_transform(pre_scaling_df)

print(f"Dataset shape: {df.shape}")
print(f"Columns: {list(df.columns)}")
df.info()

## Dataset Overview


In [None]:
# Take the first rows of the unscaled frame once, then reuse them for both
# the CSV snapshot and the cell output.
pre_scaling_head = pre_scaling_df.head()
pre_scaling_head.to_csv("../data/processed/pre_scaling_dataset_head.csv", index=False)

pre_scaling_head

In [None]:
# Summary statistics of the cleaned, not-yet-scaled frame.
pre_scaling_df.describe()

In [None]:
# Take the first rows of the scaled frame once, then reuse them for both the
# CSV snapshot and the cell output.
cleaned_head = df.head()
cleaned_head.to_csv("../data/processed/cleaned_dataset_head.csv", index=False)

cleaned_head

In [None]:
# Summary statistics of the frame returned by the scaling pipeline.
df.describe()

## Specific Game Analysis


In [None]:
# Select every row whose appid equals the id of interest.
appid_matches = pre_outlier_df["appid"] == 271590
specific_game_by_id = pre_outlier_df.loc[appid_matches]

print(specific_game_by_id)

In [None]:
# Find a title by exact name match and print only its identifying columns.
target_name = "Shadow of the Tomb Raider: Definitive Edition"
summary_columns = ["name", "appid", "developers", "publishers", "genres_tags"]

name_matches = pre_outlier_df["name"] == target_name
specific_game_by_name = pre_outlier_df[name_matches]

print(specific_game_by_name[summary_columns])

## Specific Column Analysis


In [None]:
# Frequency of each value in the `weekday` column of the base pipeline output
# (before the final cleaning stage).
pre_outlier_df["weekday"].value_counts()

## Data Correlation Analysis

### Correlation Matrix


In [None]:
# Name prefixes of the encoded feature columns; those columns are left out of
# the correlation matrix below.
encoded_feature_prefixes = [
    "weekday_",
    "review_category_",
    "genres_tags_",
    "categories_",
    "supported_languages_",
    "full_audio_languages_",
    "developer_tier_",
]
# str.startswith accepts a tuple, so a single call tests every prefix.
encoded_prefix_options = tuple(encoded_feature_prefixes)
encoded_cols = [col for col in df.columns if col.startswith(encoded_prefix_options)]

# Keep the selected dtypes, then drop the encoded columns while preserving order.
selected_dtypes = ["int64", "float64", "datetime64[ns]"]
numeric_columns = df.select_dtypes(include=selected_dtypes).columns
filtered_numeric_columns = [
    col for col in numeric_columns if col not in encoded_cols
]

correlation_matrix = df[filtered_numeric_columns].corr()

# The matrix is square, so the same labels serve both axes.
axis_labels = correlation_matrix.columns
rounded_labels = correlation_matrix.round(2).values

heatmap_trace = go.Heatmap(
    z=correlation_matrix.values,
    x=axis_labels,
    y=axis_labels,
    colorscale="RdBu",
    zmid=0,  # centre the diverging colour scale on zero correlation
    text=rounded_labels,
    texttemplate="%{text}",
    textfont=dict(size=8),
    hoverongaps=False,
)
fig_heatmap = go.Figure(data=heatmap_trace)

fig_heatmap.update_layout(
    title="Correlation Matrix of Numeric Variables (Excluding Encoded)",
    width=1200,
    height=1000,
    xaxis=dict(side="bottom"),
    yaxis=dict(autorange="reversed"),  # first variable at the top
)

fig_heatmap.show()

### Categorical Feature Correlation Analysis


In [None]:
def _prefixed_columns(prefix):
    """Return the names of df's columns that start with *prefix*, in frame order."""
    return [name for name in df.columns if name.startswith(prefix)]


weekday_cols = _prefixed_columns("weekday_")
dev_tier_cols = _prefixed_columns("developer_tier_")
categories_cols = _prefixed_columns("categories_")
genres_tags_cols = _prefixed_columns("genres_tags_")
supported_languages_cols = _prefixed_columns("supported_languages_")
full_audio_languages_cols = _prefixed_columns("full_audio_languages_")

# Metrics every encoded feature group is correlated against.
key_metrics = [
    "price",
    "metacritic_score",
    "average_playtime_forever",
    "median_playtime_forever",
    "estimated_owners_calculated",
]

# Display label -> encoded columns; insertion order is the plotting order.
categorical_groups = {
    "Weekday": weekday_cols,
    "Categories": categories_cols,
    "Genres & Tags": genres_tags_cols,
    "Supported Languages": supported_languages_cols,
    "Full Audio Languages": full_audio_languages_cols,
    "Developer Tier": dev_tier_cols,
}

for group_name, cols in categorical_groups.items():
    # Groups with no matching columns produce no figure and no printout.
    if not cols:
        continue

    # Correlate the group's columns together with the metrics, then keep only
    # the feature-rows x metric-columns slice.
    categorical_correlations = df[cols + key_metrics].corr()
    cat_metric_corr = categorical_correlations.loc[cols, key_metrics]

    group_trace = go.Heatmap(
        z=cat_metric_corr.values,
        x=cat_metric_corr.columns,
        y=cat_metric_corr.index,
        colorscale="RdBu",
        zmid=0,
        text=cat_metric_corr.round(3).values,
        texttemplate="%{text}",
        textfont=dict(size=10),
        hoverongaps=False,
    )
    fig_categorical = go.Figure(data=group_trace)

    # 30 px per feature row, never shorter than 400 px.
    figure_height = max(400, len(cols) * 30)
    fig_categorical.update_layout(
        title=f"Correlations: {group_name} vs Key Metrics",
        width=1000,
        height=figure_height,
        xaxis=dict(side="bottom", title="Key Metrics"),
        yaxis=dict(title=f"{group_name} Features"),
    )

    fig_categorical.show()
    print(f"\n{group_name} - Number of features: {len(cols)}")

## Data Visualization

### Key Metrics Analysis


In [None]:
# Name prefixes of the encoded feature columns; those columns get no histogram.
encoded_feature_prefixes = [
    "weekday_",
    "review_category_",
    "genres_tags_",
    "categories_",
    "supported_languages_",
    "full_audio_languages_",
    "developer_tier_",
]
# str.startswith accepts a tuple, so a single call tests every prefix.
encoded_prefix_options = tuple(encoded_feature_prefixes)
encoded_cols = [
    col for col in pre_scaling_df.columns if col.startswith(encoded_prefix_options)
]

# Keep the selected dtypes, then drop the encoded columns while preserving order.
selected_dtypes = ["int64", "float64", "datetime64[ns]"]
numeric_columns = pre_scaling_df.select_dtypes(include=selected_dtypes).columns
filtered_numeric_columns = [
    col for col in numeric_columns if col not in encoded_cols
]

# Options shared by every histogram; only the column and title vary.
histogram_options = dict(nbins=50, text_auto=True)

# One histogram per remaining column of the unscaled frame.
for col in filtered_numeric_columns:
    fig = px.histogram(
        pre_scaling_df,
        x=col,
        title=f"Distribution of {col}",
        **histogram_options,
    )
    fig.show()

## Overview of Games with NaN Values


In [None]:
# Flag a game as soon as any of its columns is missing, so the count printed
# below matches its "games with NaN values" wording; testing `appid` alone
# would only find rows whose id is absent.
rows_with_nan = pre_outlier_df.isna().any(axis=1)
nan_games = pre_outlier_df[rows_with_nan]
print(f"Number of games with NaN values: {len(nan_games)}")
# Show only the identifying columns of the first ten affected games.
print(nan_games[["name", "appid", "developers", "publishers"]].head(10))