### This notebook contains code to generate and save various plots related to Steam games data. It takes roughly 5 minutes to run.

## Imports

In [None]:
from __future__ import annotations
import sys

sys.path.append("../src")

import pandas as pd
import numpy as np
import os
import plotly.graph_objects as go
import plotly.io as pio
import traceback
import plotly.express as px
from metadata_options import (
    get_genres_tags_clusters,
)
from typing import Optional
from scaler_comparison import scaler_comparison
from data_preprocessing import base_pipeline, plotting_pipeline, indie_filter_pipeline
from plots import (
    plot_genre_combination_comparison,
    get_layout_width,
    get_layout_height,
    aggregate_users_by_language,
    drop_language_columns_by_user_count,
)

## Read Raw Data And Pipeline It

In [None]:
# Run the three preprocessing stages in sequence; each stage consumes the
# output of the previous one.
# NOTE(review): the base pipeline is fitted on ``None`` — presumably it loads
# the raw data itself; confirm in data_preprocessing.
df_base_pipielined = base_pipeline.fit_transform(None)
df_plotting_pipelined = plotting_pipeline.fit_transform(df_base_pipielined)
df_indie_games = indie_filter_pipeline.fit_transform(df_plotting_pipelined)

## Load User Count Data

In [None]:
# Load the per-country Steam user counts (with spoken languages) from JSON.
# NOTE(review): the variable name says 2025, but the file name and the
# 'SteamUsersTotal2021' column read by plot_language_scatter say 2021 —
# confirm which year the data actually covers.
df_steam_users_by_country_2025_with_languages = pd.read_json("../data/processed/steam-users-by-country-2021-with-languages.json")
# Aggregate the user counts per language; the result is passed to
# drop_language_columns_by_user_count in the next cell.
dic = aggregate_users_by_language(df_steam_users_by_country_2025_with_languages)

In [None]:
# Reduce the language columns of the indie-games frame using the per-language
# user totals in ``dic`` and a cut-off of 1,000,000.
# NOTE(review): the exact meaning of the cut-off is defined in
# plots.drop_language_columns_by_user_count — confirm there.
df_indie_games_low_cols = drop_language_columns_by_user_count(df_indie_games, dic, 1000000)

## Save Figures

In [None]:
def save_plotly_figures_from_dic(
    figures,
    base_filename="figure",
    output_dir="../data/processed",
    default_width=700,
    default_height=500,
    scale=1,
):
    """Export every figure of a mapping as ``<base_filename>_<key>.png``.

    Args:
        figures: Mapping of key -> figure. A value may also be a JSON string,
            in which case it is turned into a figure with ``pio.from_json``.
        base_filename: Prefix of every generated file name.
        output_dir: Target directory; created if it does not exist.
        default_width: Width used when the figure layout defines none.
        default_height: Height used when the figure layout defines none.
        scale: Scale factor forwarded to ``write_image``.

    Returns:
        None. Problems are reported on stdout (message plus traceback) and
        never raised; a failing figure does not stop the remaining ones.
    """
    try:
        os.makedirs(output_dir, exist_ok=True)
    except Exception as err:
        print(f"Fehler beim Erstellen des Ausgabeordners '{output_dir}': {err}")
        traceback.print_exc()
        return

    # Path separators and blanks in a key must not end up in the file name.
    sanitise = str.maketrans({"/": "-", "\\": "-", " ": "_"})

    position = 0
    for key, figure_obj in figures.items():
        position += 1
        try:
            if isinstance(figure_obj, str):
                figure_obj = pio.from_json(figure_obj)

            if figure_obj.layout.width is None:
                width = default_width
            else:
                width = figure_obj.layout.width

            if figure_obj.layout.height is None:
                height = default_height
            else:
                height = figure_obj.layout.height

            safe_key = str(key).translate(sanitise)
            filename = os.path.join(output_dir, f"{base_filename}_{safe_key}.png")

            figure_obj.write_image(filename, width=width, height=height, scale=scale)
        except Exception as err:
            print(f"Fehler beim Speichern der Abbildung {position} (Key: '{key}'): {err}")
            traceback.print_exc()
        else:
            print(f"Gespeichert: {filename} ({width}×{height})")

In [None]:
def save_plotly_figures_png(
    figures,
    base_filename="figure",
    output_dir="../data/processed",
    default_width=700,
    default_height=500,
    scale=1
):
    """Export figures as PNG, using each figure's own layout size if set.

    A single figure is written to ``<base_filename>.png``. When more than one
    figure is passed, each file gets its 1-based position as a suffix
    (``<base_filename>_<idx>.png``) so that later figures no longer overwrite
    earlier ones.

    Args:
        figures: Iterable of figures. ``None`` entries are skipped.
        base_filename: File name (without extension) or file name prefix.
        output_dir: Target directory; created if it does not exist.
        default_width: Width used when the figure layout defines none.
        default_height: Height used when the figure layout defines none.
        scale: Scale factor forwarded to ``write_image``.

    Returns:
        None. Problems are reported on stdout (message plus traceback) and
        never raised; a failing figure does not stop the remaining ones.
    """
    try:
        os.makedirs(output_dir, exist_ok=True)
    except Exception as e:
        print(f"Fehler beim Erstellen des Ausgabeordners '{output_dir}': {e}")
        traceback.print_exc()
        return

    # Materialise the iterable: the number of figures decides whether the
    # file names need a distinguishing suffix.
    figures = list(figures)
    needs_suffix = len(figures) > 1

    for idx, fig in enumerate(figures, start=1):
        if fig is None:
            # A plotting function returned nothing; there is nothing to save.
            print(f"Abbildung {idx} übersprungen: keine Figur vorhanden.")
            continue

        try:
            width = fig.layout.width if fig.layout.width is not None else default_width
            height = fig.layout.height if fig.layout.height is not None else default_height

            stem = f"{base_filename}_{idx}" if needs_suffix else f"{base_filename}"
            filename = os.path.join(output_dir, f"{stem}.png")

            fig.write_image(filename, width=width, height=height, scale=scale)

            print(f"Gespeichert: {filename} ({width}×{height})")

        except Exception as e:
            print(f"Fehler beim Speichern der Abbildung {idx}: {e}")
            traceback.print_exc()

In [None]:
def save_plotly_figures_png_set_layout(
    figures,
    base_filename="figure",
    output_dir="../data/processed",
    default_width=700,
    default_height=500,
    scale=1
):
    """Export figures as PNG with a fixed size, ignoring the figure layout.

    Unlike a layout-aware export, ``default_width`` and ``default_height``
    are always used here, whatever size the figure itself defines.

    A single figure is written to ``<base_filename>.png``. When more than one
    figure is passed, each file gets its 1-based position as a suffix
    (``<base_filename>_<idx>.png``) so that later figures no longer overwrite
    earlier ones.

    Args:
        figures: Iterable of figures. ``None`` entries are skipped.
        base_filename: File name (without extension) or file name prefix.
        output_dir: Target directory; created if it does not exist.
        default_width: Width of every exported image.
        default_height: Height of every exported image.
        scale: Scale factor forwarded to ``write_image``.

    Returns:
        None. Problems are reported on stdout (message plus traceback) and
        never raised; a failing figure does not stop the remaining ones.
    """
    try:
        os.makedirs(output_dir, exist_ok=True)
    except Exception as e:
        print(f"Fehler beim Erstellen des Ausgabeordners '{output_dir}': {e}")
        traceback.print_exc()
        return

    # Materialise the iterable: the number of figures decides whether the
    # file names need a distinguishing suffix.
    figures = list(figures)
    needs_suffix = len(figures) > 1

    width = default_width
    height = default_height

    for idx, fig in enumerate(figures, start=1):
        if fig is None:
            # A plotting function returned nothing; there is nothing to save.
            print(f"Abbildung {idx} übersprungen: keine Figur vorhanden.")
            continue

        try:
            stem = f"{base_filename}_{idx}" if needs_suffix else f"{base_filename}"
            filename = os.path.join(output_dir, f"{stem}.png")

            fig.write_image(filename, width=width, height=height, scale=scale)

            print(f"Gespeichert: {filename} ({width}×{height})")

        except Exception as e:
            print(f"Fehler beim Speichern der Abbildung {idx}: {e}")
            traceback.print_exc()

## Visual reports

In [None]:
def create_heatmap_cluster_lang_target_agg(
    df,
    cluster_key= "Strategy_Management",
    target_column="pct_pos_total",
    agg_func="median",
    language_mode="audio"
):
    """Build a language x genre-tag heatmap of an aggregated target column.

    For every language column (selected by ``language_mode``) and every tag
    of the chosen cluster, the games flagged with both (value ``1``) are
    selected and ``target_column`` is aggregated over them.

    Args:
        df: Frame with one-hot language columns (``full_audio_languages_*``
            or ``supported_languages_*``), one-hot ``genres_tags_*`` columns
            and the target column.
        cluster_key: Key into the mapping returned by
            ``get_genres_tags_clusters()``; its value is the list of tags
            shown on the x-axis.
        target_column: Column that is aggregated per cell.
        agg_func: ``"mean"`` uses the mean; any other value uses the median.
        language_mode: ``"audio"`` or ``"text"``.

    Returns:
        A ``go.Figure`` containing a single heatmap trace.

    Raises:
        ValueError: If ``language_mode`` is invalid, ``cluster_key`` is
            unknown, or no language column with the matching prefix exists.
    """
    # Validate the cheap argument first so a typo fails before any lookup.
    if language_mode == "audio":
        language_prefix = "full_audio_languages_"
    elif language_mode == "text":
        language_prefix = "supported_languages_"
    else:
        raise ValueError("language_mode must be 'audio' or 'text'")

    clusters = get_genres_tags_clusters()

    if cluster_key not in clusters:
        raise ValueError(f"Cluster '{cluster_key}' not found in clusters.")

    language_cols = [col for col in df.columns if col.startswith(language_prefix)]
    if not language_cols:
        raise ValueError(f"No columns found with prefix '{language_prefix}'")

    languages = [col[len(language_prefix):] for col in language_cols]
    tags = clusters[cluster_key]

    # Case-insensitive column lookup, built once instead of once per cell.
    # ``setdefault`` keeps the first column when several differ only by case.
    columns_by_lower_name = {}
    for col in df.columns:
        columns_by_lower_name.setdefault(col.lower(), col)
    tag_columns = [
        columns_by_lower_name.get(f"genres_tags_{tag.lower()}") for tag in tags
    ]

    use_mean = agg_func == "mean"

    z = []
    text_annotations = []

    # Iterate over the real column names; rebuilding them from a lower-cased
    # suffix would miss columns whose language part contains capitals.
    for lang_col, lang_display in zip(language_cols, languages):
        has_language = df[lang_col] == 1
        row = []
        text_row = []
        for genre_col, tag_display in zip(tag_columns, tags):
            if genre_col is None:
                row.append(np.nan)
                text_row.append(f"{tag_display} / {lang_display}<br>No data")
                continue

            subset = df.loc[(df[genre_col] == 1) & has_language, target_column]
            count = len(subset)

            if count:
                value = subset.mean() if use_mean else subset.median()
            else:
                value = np.nan

            row.append(value)
            text_row.append(
                f"Genre Tag: {tag_display}<br>Language: {lang_display}<br>{count} games"
            )

        z.append(row)
        text_annotations.append(text_row)

    fig = go.Figure(data=go.Heatmap(
        z=z,
        x=tags,
        y=languages,
        text=text_annotations,
        hovertemplate="%{text}<br>Value: %{z}<extra></extra>",
        colorbar=dict(title=f"{'Mean' if agg_func=='mean' else 'Median'} {target_column}"),
        colorscale="Viridis"
    ))

    fig.update_layout(
        title=f"Heatmap-{target_column}-{agg_func}-{language_mode}-{cluster_key}",
        xaxis_title="Genre Tags",
        yaxis_title="Audio Languages" if language_mode=="audio" else "Text Languages",
        height = get_layout_height(len(languages)),
        width = get_layout_width(len(tags))
    )
    return fig

In [None]:
def plot_target_combinations_heatmap(
    df: pd.DataFrame,
    prefix_x: str,
    prefix_y: str,
    target: str = 'pct_pos_total',
    aggfunc: str = 'mean',
    threshold: Optional[float] = None,
    min_count: Optional[int] = None,
):
    """Plot an aggregated target for every pair of two one-hot feature groups.

    Each cell covers the rows where one ``prefix_x`` column and one
    ``prefix_y`` column are both ``1`` and shows ``aggfunc`` of ``target``
    over those rows. Cells can be filtered by a minimum aggregated value
    and/or a minimum row count; rows and columns left without any cell are
    dropped.

    Args:
        df: Frame holding the one-hot columns and the target column.
        prefix_x: Column prefix of the feature group on the x-axis.
        prefix_y: Column prefix of the feature group on the y-axis.
        target: Column that is aggregated.
        aggfunc: ``'mean'``, ``'median'`` or ``'sum'``.
        threshold: Keep only cells whose aggregated value is >= this value.
        min_count: Keep only cells backed by at least this many rows.

    Returns:
        The heatmap figure, or ``None`` (after printing a message) when the
        filters leave no cell.

    Raises:
        ValueError: If a prefix matches no column or ``aggfunc`` is not
            supported.
    """
    cols_x = [col for col in df.columns if col.startswith(prefix_x)]
    cols_y = [col for col in df.columns if col.startswith(prefix_y)]

    if not cols_x or not cols_y:
        raise ValueError("Präfixe führen zu keiner oder nur einer Feature-Gruppe.")

    # Reject an unknown aggregation up front; checking it only inside the
    # loop let it pass silently whenever no combination had any rows.
    if aggfunc not in ('mean', 'median', 'sum'):
        raise ValueError(f"Aggregation '{aggfunc}' nicht unterstützt.")

    # Strip only the leading prefix; str.replace would also remove the same
    # text from the middle of a column name.
    x_labels = [col[len(prefix_x):] for col in cols_x]
    y_labels = [col[len(prefix_y):] for col in cols_y]

    matrix = pd.DataFrame(np.nan, index=y_labels, columns=x_labels, dtype=float)
    count_matrix = pd.DataFrame(0, index=y_labels, columns=x_labels, dtype=int)

    # The boolean masks do not depend on the partner column: compute once.
    x_masks = [df[col] == 1 for col in cols_x]
    y_masks = [df[col] == 1 for col in cols_y]

    for x, x_mask in zip(x_labels, x_masks):
        for y, y_mask in zip(y_labels, y_masks):
            both = x_mask & y_mask
            count = int(both.sum())
            count_matrix.at[y, x] = count

            if count:
                # Only the target column is needed, not the full row subset.
                matrix.at[y, x] = getattr(df.loc[both, target], aggfunc)()

    if threshold is not None or min_count is not None:
        threshold_mask = matrix >= threshold if threshold is not None else pd.DataFrame(True, index=matrix.index, columns=matrix.columns)
        count_mask = count_matrix >= min_count if min_count is not None else pd.DataFrame(True, index=matrix.index, columns=matrix.columns)

        combined_mask = threshold_mask & count_mask

        matrix = matrix.where(combined_mask)
        count_matrix = count_matrix.where(combined_mask)

        matrix = matrix.dropna(axis=0, how="all").dropna(axis=1, how="all")
        # Keep the counts aligned with the rows/columns that survived.
        count_matrix = count_matrix.reindex_like(matrix)

        if matrix.empty:
            print("Keine Kombinationen erfüllen Threshold und Mindestanzahl.")
            return

    title = (
        f"{aggfunc} von '{target}' für Kombinationen von <br>"
        f"{prefix_x.rstrip('_')} und {prefix_y.rstrip('_')}<br>"
    )
    if threshold is not None:
        title += f"(nur Werte ≥ {threshold}) "
    if min_count is not None:
        title += f"(mind. {min_count} Spiele)"

    # One extra trailing axis so the hover template can read customdata[0].
    custom_data = np.expand_dims(count_matrix.values, axis=-1)

    fig = px.imshow(
        matrix,
        text_auto=".0f",
        aspect='auto',
        title=title,
        labels={'x': prefix_x.rstrip('_'), 'y': prefix_y.rstrip('_'), 'color': aggfunc.capitalize()}
    )

    fig.update_traces(
        customdata=custom_data,
        hovertemplate=
            "FeatureY: %{y}<br>" +
            "FeatureX: %{x}<br>" +
            f"{aggfunc.capitalize()}: %{{z:.2f}}<br>" +
            "Spiele: %{customdata[0]}"
    )

    fig.update_layout(
        xaxis_tickangle=-45,
        width=get_layout_width(len(matrix.columns)),
        height=get_layout_height(len(matrix.index))
    )

    return fig

In [None]:
def plot_language_scatter(df_users, df_audio_support, mode):
    """Scatter plot of Steam players per language vs. games supporting it.

    Args:
        df_users: Frame with a ``spokenLanguages`` column (a collection of
            language names per row) and a ``SteamUsersTotal2021`` column.
        df_audio_support: Games frame with one-hot language columns named
            ``full_audio_languages_<language>`` / ``supported_languages_<language>``
            (language part in lower case).
        mode: ``"audio"`` or ``"text"``; selects the column group, the
            titles and the y-axis range.

    Returns:
        A plotly figure with log-scaled axes. Languages with no players or
        no supporting games are left out.

    Raises:
        ValueError: If ``mode`` is neither ``"audio"`` nor ``"text"``.
    """
    # Validate the mode first so an invalid value fails even for empty input,
    # and so the per-mode settings live in one place.
    if mode == "audio":
        column_prefix = "full_audio_languages_"
        support_label = "Audio"
        yaxis = dict(
            type="log",
            range=[0, 4.48],  # log10(1) to log10(30,000)
            tickvals=[1, 10, 100, 1000, 10000, 30000],
            ticktext=["1", "10", "100", "1K", "10K", "30K"]
        )
    elif mode == "text":
        column_prefix = "supported_languages_"
        support_label = "Text"
        yaxis = dict(
            type="log",
            range=[2, 5],  # log10(100) to log10(100,000)
            tickvals=[100, 1000, 10000, 100000],
            ticktext=["100", "1K", "10K", "100K"]
        )
    else:
        raise ValueError("mode must be 'audio' or 'text'")

    all_languages = set()
    for langs in df_users['spokenLanguages']:
        all_languages.update(langs)
    languages = sorted(all_languages)

    # Players: sum the user counts of all rows that list the language.
    players = []
    for lang in languages:
        speaks = df_users['spokenLanguages'].apply(lambda l: lang in l)
        players.append(df_users.loc[speaks, 'SteamUsersTotal2021'].sum())

    # Games: number of games flagged for the language (0 if column missing).
    games = []
    for lang in languages:
        col_name = f"{column_prefix}{lang.lower()}"
        if col_name in df_audio_support.columns:
            games.append(int(df_audio_support[col_name].sum()))
        else:
            games.append(0)

    df_plot = pd.DataFrame({
        "Language": languages,
        "SteamPlayers": players,
        "Games": games
    })

    # Zero values cannot be shown on a log axis. Copy so that adding the
    # label column does not write into a slice of the unfiltered frame.
    df_plot = df_plot[(df_plot["SteamPlayers"] > 0) & (df_plot["Games"] > 0)].copy()

    # Built with zip rather than apply(axis=1), which returns a frame (not a
    # series) when no row is left and then cannot be assigned to one column.
    df_plot["LabelText"] = [
        f"{language}<br>{int(player_count)}/{int(game_count)}"
        for language, player_count, game_count in zip(
            df_plot["Language"], df_plot["SteamPlayers"], df_plot["Games"]
        )
    ]

    fig = px.scatter(
        df_plot,
        x="SteamPlayers",
        y="Games",
        text="LabelText",
        size="Games",
        hover_data=["Language", "SteamPlayers", "Games"],
        title=f"Sprachen: Spieler vs. Spiele mit vollem {support_label}-Support"
    )

    fig.update_traces(textposition="top center")

    fig.update_layout(
        xaxis_title="Anzahl Steam-Spieler (logarithmisch)",
        yaxis_title=f"Anzahl Spiele mit {support_label}-Support (logarithmisch)",
        xaxis=dict(
            type="log",
            range=[6, 7.55],  # log10(1M) to log10(35M)
            tickvals=[1e6, 1e7, 3e7, 3.5e7],
            ticktext=["1M", "10M", "30M", "35M"]
        ),
        yaxis=yaxis
    )

    return fig

## Create Plots And Save Them

In [None]:
# Create one players-vs-games scatter plot per language mode and export it.
modes = ["text", "audio"]
output_dir = "../data/processed/output_plots/language_scatter"

for mode in modes:
    figure = plot_language_scatter(
        df_users=df_steam_users_by_country_2025_with_languages,
        df_audio_support=df_indie_games_low_cols,
        mode=mode
    )

    # The *_set_layout saver ignores the figure's own layout size and always
    # exports with the width/height given here (1500 x 1500).
    save_plotly_figures_png_set_layout(
        [figure],
        base_filename=f"plot_language_scatter_{mode}",
        output_dir=output_dir,
        default_width=1500,
        default_height=1500,
    )

In [None]:
# Genre-combination comparison plots: one figure per (top_n_rows, comb_size)
# pair. Every figure is exported as PNG and also kept in
# ``figures_combinations``.
figures_combinations = []
top_n_rows_1_values = [1000, 50]
comb_sizes = [1, 2, 3]

# Passed to the plot function as ``ignore``.
# NOTE(review): presumably tags excluded from the combinations — confirm in
# plots.plot_genre_combination_comparison.
ignore = ['indie', 'kickstarter', 'crowdfunding']

for top_n in top_n_rows_1_values:
    for comb_size in comb_sizes:
        fig = plot_genre_combination_comparison(
            df=df_indie_games_low_cols,
            top_n_rows=top_n,
            comb_size=comb_size,
            top_n_combinations=10,
            ignore=ignore
        )
        # The file name encodes both loop parameters so exports stay distinct.
        base_filename = f"genre_combination_comparison_top_{top_n}_comb_size_{comb_size}"
        figures_combinations.append(fig)
        save_plotly_figures_png(
            figures = [fig],
            base_filename= base_filename,
            output_dir="../data/processed/output_plots/genres_combinations"
        )

In [None]:
# Export one heatmap per (target column, aggregation, language mode, cluster).
cluster_dict = get_genres_tags_clusters()

target_columns = ["pct_pos_total", "estimated_owners_calculated", "average_playtime_forever", "median_playtime_forever"]
agg_funcs = ["mean", "median"]
language_modes = ["audio", "text"]
# Only the list of keys is needed here; the loop below binds ``cluster_key``
# to one key at a time.
cluster_keys = list(cluster_dict.keys())

# NOTE(review): declared but never filled in this cell — either append the
# figures here or remove the name once no other cell relies on it.
figures_cluster_heatmaps = []

for target in target_columns:
    for agg in agg_funcs:
        for lang in language_modes:
            for cluster_key in cluster_keys:
                fig = create_heatmap_cluster_lang_target_agg(
                    df=df_indie_games_low_cols,
                    cluster_key=cluster_key,
                    target_column=target,
                    agg_func=agg,
                    language_mode=lang
                )
                # The file name encodes all four loop parameters.
                base_filename = f"heatmap-{target}-{agg}-{lang}-{cluster_key}"
                # Pass the single figure directly instead of rebinding the
                # notebook-wide name ``figure`` to a throwaway list.
                save_plotly_figures_png(
                    [fig],
                    base_filename=base_filename,
                    output_dir="../data/processed/output_plots/genres_tags_clusters",
                )


In [None]:
# Build the scaler-comparison figures and export them. The result is handed
# to the dict-based saver, which writes one PNG per key
# ("scaler_comparison_<key>.png").
figures_scaler_comparison = scaler_comparison(df=df_indie_games_low_cols)
save_plotly_figures_from_dic(
    figures_scaler_comparison,
    base_filename="scaler_comparison",
    output_dir="../data/processed/output_plots/scaler_comparison"
)

In [None]:
# One heatmap of genre tags x categories per target column. Each entry holds
# the aggregation, the minimum aggregated value (``threshold``), the minimum
# number of games per cell (``min_count``) and the output file name.
targets = [
    {
        "target": "estimated_owners_calculated",
        "aggfunc": "mean",
        "threshold": 1000000,
        "min_count": 2,
        "filename": "estimated_owners_calculated_heatmap_genres_tags_categories"
    },
    {
        "target": "pct_pos_total",
        "aggfunc": "median",
        "threshold": 95,
        "min_count": 25,
        "filename": "positive_rating_heatmap_genres_tags_categories"
    },
    {
        "target": "median_playtime_forever",
        "aggfunc": "mean",
        "threshold": 1200,
        "min_count": 5,
        "filename": "median_playtime_forever_heatmap_genres_tags_categories"
    },
    {
        "target": "average_playtime_forever",
        "aggfunc": "mean",
        "threshold": 1200,
        "min_count": 5,
        "filename": "average_playtime_forever_heatmap_genres_tags_categories"
    }
]

output_dir = "../data/processed/output_plots/categories_genres_tags_heatmaps"

for params in targets:
    fig = plot_target_combinations_heatmap(
        df=df_indie_games_low_cols,
        prefix_x="genres_tags_",
        prefix_y="categories_",
        target=params["target"],
        aggfunc=params["aggfunc"],
        threshold=params["threshold"],
        min_count=params["min_count"]
    )

    # The plot function returns None (and prints a message) when no
    # combination passes the filters; there is nothing to export then.
    if fig is None:
        continue

    save_plotly_figures_png(
        figures=[fig],
        base_filename=params["filename"],
        output_dir=output_dir
    )