# Steam Games Model Evaluation and Prediction

## 1. Import Libraries and Setup

In [None]:
# Notebook setup: imports, module search path, display options and RNG seed.
import sys
import os
import warnings
# Install a global "ignore" filter so no warnings are shown from here on.
warnings.filterwarnings('ignore')

# Extend the module search path before the project-local import below
# (presumably data_preprocessing lives in ../src — confirm against the repo layout).
sys.path.append("../src")

import pandas as pd
import numpy as np
import plotly.express as px
import joblib
from sklearn.metrics import r2_score

# Project-local preprocessing pipelines; must be imported after the sys.path change above.
from data_preprocessing import base_pipeline, final_cleaning_pipeline

# Do not truncate the column list when DataFrames are displayed.
pd.set_option("display.max_columns", None)
# Seed NumPy's global RNG so later np.random draws are repeatable.
np.random.seed(42)

print("Libraries loaded successfully!")

## 2. Load Preprocessing Pipelines and Trained Model

In [None]:
def inverse_transform_predictions(scaled_predictions, scaling_pipeline, target_column='estimated_owners_calculated'):
    """Convert predictions from the scaled domain back to the original scale.

    The scaler's ``inverse_transform`` is called on a matrix that has one
    column per entry of ``numeric_columns``. The predictions are written into
    the target's column of a zero-filled placeholder matrix, the matrix is
    inverse-transformed, and only the target column is returned.

    Args:
        scaled_predictions: Predictions in the scaled domain. Any array-like
            is accepted (list, ndarray, Series, scalar); inputs with extra
            dimensions such as shape ``(n, 1)`` are flattened.
        scaling_pipeline: Fitted pipeline exposing ``named_steps['scaling']``.
            That step must provide ``numeric_columns`` and a ``scaler`` object
            with an ``inverse_transform`` method.
        target_column: Name of the target column inside ``numeric_columns``.

    Returns:
        A 1-D ``numpy.ndarray`` with one original-scale value per prediction.

    Raises:
        ValueError: If ``target_column`` is not one of the scaler's numeric
            columns.
    """
    power_scaler = scaling_pipeline.named_steps['scaling']
    numeric_columns = list(power_scaler.numeric_columns)

    if target_column not in numeric_columns:
        raise ValueError(
            f"Target column {target_column!r} is not among the scaler's "
            f"numeric columns: {numeric_columns}"
        )
    target_idx = numeric_columns.index(target_column)

    # Flatten so that column vectors and scalars are handled like 1-D input.
    scaled = np.asarray(scaled_predictions, dtype=float).reshape(-1)

    # Non-target columns stay at zero; their inverse-transformed values are discarded.
    # NOTE(review): assumes the scaler inverts each column independently — confirm.
    placeholder = np.zeros((scaled.shape[0], len(numeric_columns)))
    placeholder[:, target_idx] = scaled

    restored = power_scaler.scaler.inverse_transform(placeholder)

    # np.asarray keeps positional indexing valid even if the scaler returns a DataFrame.
    return np.asarray(restored)[:, target_idx]

In [None]:
# Directory holding the persisted artifacts of one trained model. Point it at another
# folder (/models/random_forest, /models/gradient_boosting, /models/xgb_regressor)
# to evaluate a different model.
models_dir = "../models/xgb_regressor"

# All three artifacts are expected side by side in models_dir.
model_path = os.path.join(models_dir, "model.joblib")
feature_columns_path = os.path.join(models_dir, "feature_columns.joblib")
scaling_pipeline_path = os.path.join(models_dir, "scaling_pipeline.joblib")

# NOTE(review): joblib.load unpickles arbitrary objects — only load artifacts from a trusted source.
model = joblib.load(model_path)
print(f"Model loaded from: {model_path}")

feature_columns = joblib.load(feature_columns_path)
print(f"Feature columns loaded: {len(feature_columns)} features")

fitted_scaling_pipeline = joblib.load(scaling_pipeline_path)
print("Scaling pipeline loaded")

print("\nModel details:")
print(f"- Model type: {type(model).__name__}")
expected_feature_count = getattr(model, 'n_features_in_', 'Unknown')
print(f"- Features expected: {expected_feature_count}")

# Hyperparameters are reported only when the loaded estimator actually has them,
# so the same cell works for different estimator types.
optional_hyperparameters = (
    ('n_estimators', 'Estimators'),
    ('max_depth', 'Max depth'),
    ('learning_rate', 'Learning rate'),
    ('alpha', 'Alpha'),
    ('C', 'C parameter'),
    ('gamma', 'Gamma'),
    ('kernel', 'Kernel'),
)
for attr_name, label in optional_hyperparameters:
    if hasattr(model, attr_name):
        print(f"- {label}: {getattr(model, attr_name)}")

# Only the first 20 feature names are shown to keep the output short.
print(f"\nFeature columns: {feature_columns[:20]}...")

## 3. Load and Preprocess Full Dataset

In [None]:
print("Loading and preprocessing data...")

# Configure the 'data_loading' step of the base pipeline with the raw CSV location.
base_pipeline.set_params(data_loading__filepath="../data/raw/games.csv")

# No input frame is passed (presumably the data_loading step reads the file itself — confirm).
pre_outlier_df = base_pipeline.fit_transform(None)
print(f"Base pipeline completed. Shape: {pre_outlier_df.shape}")

# Second stage runs on the output of the base pipeline; its result is still unscaled.
pre_scaling_df = final_cleaning_pipeline.fit_transform(pre_outlier_df)
print(f"Final cleaning applied. Shape: {pre_scaling_df.shape}")

# Short summary of the cleaned dataset.
owner_counts = pre_scaling_df['estimated_owners_calculated']
lowest_owners, highest_owners = owner_counts.min(), owner_counts.max()
overview_lines = [
    "\nDataset overview:",
    f"- Total games: {len(pre_scaling_df)}",
    f"- Available columns: {list(pre_scaling_df.columns)}",
    f"- Owner range: {lowest_owners:.0f} to {highest_owners:.0f}",
]
print(*overview_lines, sep="\n")

## 4. Select Random Sample Games and Predict

In [None]:
# Prediction target, plus the columns that are never treated as model inputs
# when checking the sample's numeric columns against the saved feature list.
target_column = "estimated_owners_calculated"
columns_to_ignore = [
    target_column,
    "average_playtime_forever",
    "median_playtime_forever",
]

# Draw distinct row labels from the cleaned (still unscaled) dataset.
n_samples = 10
random_sample_indices = np.random.choice(pre_scaling_df.index, size=n_samples, replace=False)
sample_games = pre_scaling_df.loc[random_sample_indices].copy()

print(f"Selected {n_samples} random games for prediction:")
print(f"Sample indices: {list(random_sample_indices)}")

# Ground truth in the original domain, captured before scaling.
actual_values_unscaled = sample_games[target_column].copy()

# Apply the already-fitted scaling pipeline (transform only, no refit).
sample_scaled = fitted_scaling_pipeline.transform(sample_games)
print(f"Scaling applied. Shape: {sample_scaled.shape}")

# Ground truth in the scaled domain, i.e. the domain the model predicts in.
actual_values_scaled = sample_scaled[target_column].copy()

# Compare the numeric columns available in the sample with the saved feature list.
numeric_features = sample_scaled.select_dtypes(include=[np.number]).columns
sample_feature_columns = [col for col in numeric_features if col not in columns_to_ignore]

saved_feature_set = set(feature_columns)
available_feature_set = set(sample_feature_columns)
missing_features = saved_feature_set - available_feature_set
extra_features = available_feature_set - saved_feature_set

if missing_features:
    print(f"Missing features in sample: {missing_features}")
if extra_features:
    print(f"Extra features in sample: {extra_features}")

# Select the model inputs in exactly the saved column order.
X_sample = sample_scaled[feature_columns].copy()

print(f"Feature selection complete. {X_sample.shape[1]} features used")
print(f"Features match: {list(X_sample.columns) == feature_columns}")

predictions = model.predict(X_sample)
print(f"Predictions completed for {len(predictions)} games")

predictions_unscaled = inverse_transform_predictions(predictions, fitted_scaling_pipeline)
print("Predictions converted back to original scale")

# Error measures in both domains; relative errors are percentages of the actual value.
abs_error_scaled = np.abs(actual_values_scaled - predictions)
abs_error_unscaled = np.abs(actual_values_unscaled - predictions_unscaled)
rel_error_scaled = abs_error_scaled / np.abs(actual_values_scaled) * 100
rel_error_unscaled = abs_error_unscaled / np.abs(actual_values_unscaled) * 100

sample_results = pd.DataFrame({
    'Game_Index': random_sample_indices,
    'Actual_Unscaled': actual_values_unscaled,
    'Actual_Scaled': actual_values_scaled,
    'Predicted_Scaled': predictions,
    'Predicted_Unscaled': predictions_unscaled,
    'Absolute_Error_Scaled': abs_error_scaled,
    'Absolute_Error_Unscaled': abs_error_unscaled,
    'Relative_Error_Percent': rel_error_scaled,
    'Relative_Error_Percent_Unscaled': rel_error_unscaled,
})

# One report per domain: (heading, abs-error column, its format, rel-error column,
# its format, true values, predicted values).
quality_reports = (
    ("Scaled domain", 'Absolute_Error_Scaled', ".3f",
     'Relative_Error_Percent', ".1f", actual_values_scaled, predictions),
    ("Original domain", 'Absolute_Error_Unscaled', ".0f",
     'Relative_Error_Percent_Unscaled', ".3f", actual_values_unscaled, predictions_unscaled),
)
for domain, abs_col, abs_fmt, rel_col, rel_fmt, y_true, y_pred in quality_reports:
    print(f"\nPrediction quality ({domain}):")
    print(f"Mean Absolute Error: {sample_results[abs_col].mean():{abs_fmt}}")
    print(f"Mean Relative Error: {sample_results[rel_col].mean():{rel_fmt}}%")
    print(f"R² Score: {r2_score(y_true, y_pred):.3f}")


## 5. Compare Predictions with Actual Estimated Owners

In [None]:
# Raw game attributes shown next to the prediction results (output label -> source column).
context_columns = {
    'Price': 'price',
    'Metacritic_Score': 'metacritic_score',
    'Achievements': 'achievements',
    'Developer_Tier': 'developer_tier',
}

detailed_comparison = pd.DataFrame({
    'Game_Index': sample_results['Game_Index'],
    # .values drops the index so these columns are placed positionally.
    **{label: sample_games[source].values for label, source in context_columns.items()},
    'Actual_Owners_Original': sample_results['Actual_Unscaled'].astype(int),
    'Predicted_Owners_Original': sample_results['Predicted_Unscaled'].astype(int),
    'Actual_Owners_Scaled': sample_results['Actual_Scaled'].round(3),
    'Predicted_Owners_Scaled': sample_results['Predicted_Scaled'].round(3),
    'Absolute_Error_Original': sample_results['Absolute_Error_Unscaled'].astype(int),
    'Absolute_Error_Scaled': sample_results['Absolute_Error_Scaled'].round(3),
    'Relative_Error_%_Original': sample_results['Relative_Error_Percent_Unscaled'].round(1),
    'Relative_Error_%_Scaled': sample_results['Relative_Error_Percent'].round(1),
})

print("Detailed prediction comparison:")
display(detailed_comparison)


def _draw_perfect_prediction_line(figure, actual_col, predicted_col):
    """Add a dashed red y = x line spanning both columns' value range; return (low, high)."""
    low = min(sample_results[actual_col].min(), sample_results[predicted_col].min())
    high = max(sample_results[actual_col].max(), sample_results[predicted_col].max())
    figure.add_scatter(
        x=[low, high],
        y=[low, high],
        mode='lines',
        name='Perfect Prediction',
        line=dict(color='red', dash='dash'),
    )
    return low, high


# Predicted vs. actual in the scaled domain.
fig_comparison = px.scatter(
    sample_results,
    x='Actual_Scaled',
    y='Predicted_Scaled',
    title='Predictions vs Actual Values - Random Sample Games (Scaled Domain)',
    labels={
        'Actual_Scaled': 'Actual Estimated Owners (Scaled)',
        'Predicted_Scaled': 'Predicted Estimated Owners (Scaled)',
    },
    template='plotly_white',
    hover_data=['Game_Index', 'Absolute_Error_Scaled', 'Actual_Unscaled', 'Predicted_Unscaled'],
)
min_val, max_val = _draw_perfect_prediction_line(fig_comparison, 'Actual_Scaled', 'Predicted_Scaled')
fig_comparison.update_layout(width=700, height=500)
fig_comparison.show()

# Predicted vs. actual in the original domain.
fig_comparison_original = px.scatter(
    sample_results,
    x='Actual_Unscaled',
    y='Predicted_Unscaled',
    title='Predictions vs Actual Values - Random Sample Games (Original Scale)',
    labels={
        'Actual_Unscaled': 'Actual Estimated Owners (Original)',
        'Predicted_Unscaled': 'Predicted Estimated Owners (Original)',
    },
    template='plotly_white',
    hover_data=['Game_Index', 'Absolute_Error_Unscaled'],
)
min_val_orig, max_val_orig = _draw_perfect_prediction_line(
    fig_comparison_original, 'Actual_Unscaled', 'Predicted_Unscaled'
)
fig_comparison_original.update_layout(width=700, height=500)
fig_comparison_original.show()

# Reference plot: the actual values plotted against themselves.
fig_unscaled = px.scatter(
    sample_results,
    x='Actual_Unscaled',
    y='Actual_Unscaled',
    title='Original Unscaled Estimated Owners (Reference)',
    labels={'Actual_Unscaled': 'Original Estimated Owners', 'y': 'Reference Line'},
    template='plotly_white',
    hover_data=['Game_Index'],
)
fig_unscaled.update_layout(width=700, height=400)
fig_unscaled.show()

# Best and worst rows are chosen by the scaled-domain relative error.
best_prediction_idx = sample_results['Relative_Error_Percent'].idxmin()
worst_prediction_idx = sample_results['Relative_Error_Percent'].idxmax()

highlighted_rows = (
    ("Best prediction (lowest relative error)", best_prediction_idx),
    ("Worst prediction (highest relative error)", worst_prediction_idx),
)
for heading, row_label in highlighted_rows:
    # Cells are read one at a time so each value keeps its column's dtype.
    print(f"\n{heading}:")
    print(f"Game Index: {sample_results.loc[row_label, 'Game_Index']}")
    print(f"Actual (original): {sample_results.loc[row_label, 'Actual_Unscaled']:,.0f}")
    print(f"Predicted (original): {sample_results.loc[row_label, 'Predicted_Unscaled']:,.0f}")
    print(f"Actual (scaled): {sample_results.loc[row_label, 'Actual_Scaled']:.3f}")
    print(f"Predicted (scaled): {sample_results.loc[row_label, 'Predicted_Scaled']:.3f}")
    print(f"Absolute Error (original): {sample_results.loc[row_label, 'Absolute_Error_Unscaled']:,.0f}")
    print(f"Relative Error: {sample_results.loc[row_label, 'Relative_Error_Percent']:.1f}%")

## 6. Create and Predict a Custom Example Game

In [None]:
print("Creating custom example game...")

# Hand-written attribute values for one hypothetical game. The row is passed through the
# fitted scaling pipeline below, so it carries every column that pipeline needs,
# including placeholder values for columns that are not model inputs.
custom_game_data = {
    "release_date": pd.Timestamp("2025-06-26"),
    "price": 29.99,
    "dlc_count": 2,
    "windows": True,
    "mac": True,
    "linux": False,
    "metacritic_score": 85,
    "achievements": 50,
    "supported_languages": [
        "english",
        "german",
        "french",
        "spanish - spain",
        "japanese",
        "simplified chinese",
    ],
    "full_audio_languages": ["english", "german", "japanese"],
    "categories": [
        "single-player",
        "steam achievements",
        "steam cloud",
        "full controller support",
        "steam trading cards",
    ],
    "average_playtime_forever": 0,  # Needed for scaling, but not used in prediction
    "median_playtime_forever": 0,  # Needed for scaling, but not used in prediction
    "estimated_owners_calculated": 0,  # Needed for scaling, but not used in prediction
    "screenshot_count": 12,
    "movie_count": 3,
    "genres_tags": [
        "action",
        "adventure",
        "indie",
        "singleplayer",
        "atmospheric",
        "story rich",
        "great soundtrack",
        "beautiful",
        "exploration",
        "puzzle",
    ],
    "description_word_count": 180,
    "platform_count": 2,
    "developer_tier": "indie",
    "weekday": "Thursday",
}

# Single-row frame; list-valued entries stay as list objects inside their cells.
custom_game_df = pd.DataFrame([custom_game_data])

print("Custom game created with these characteristics:")
print(f"  - Release Date: {custom_game_data['release_date'].strftime('%Y-%m-%d')}")
print(f"  - Price: ${custom_game_data['price']}")
print(f"  - Metacritic Score: {custom_game_data['metacritic_score']}")
print(f"  - Achievements: {custom_game_data['achievements']}")
print(f"  - Developer Tier: {custom_game_data['developer_tier']}")
print(
    f"  - Platforms: Windows: {custom_game_data['windows']}, Mac: {custom_game_data['mac']}, Linux: {custom_game_data['linux']}"
)

# Stays None if scaling or feature extraction fails, so the feature analysis further
# down can degrade gracefully instead of raising NameError.
X_custom = None

try:
    custom_game_scaled = fitted_scaling_pipeline.transform(custom_game_df)
    print("Scaling applied")

    X_custom = custom_game_scaled[feature_columns].copy()
    print(f"Features extracted: {X_custom.shape[1]} features")

    custom_prediction = model.predict(X_custom)[0]
    custom_prediction_unscaled = inverse_transform_predictions([custom_prediction], fitted_scaling_pipeline)[0]

    print("\nPREDICTION RESULT:")
    print(f"Estimated owners for custom game (scaled): {custom_prediction:.3f}")
    print(f"Estimated owners for custom game (original): {custom_prediction_unscaled:,.0f}")

    # pre_scaling_df holds owners in the ORIGINAL domain, so every comparison below uses
    # the unscaled prediction. Comparing the scaled prediction against these values
    # would mix two different scales.
    dataset_owners = pre_scaling_df["estimated_owners_calculated"]
    median_owners = dataset_owners.median()
    mean_owners = dataset_owners.mean()

    print("\nContext comparison:")
    performance = "Above Median" if custom_prediction_unscaled > median_owners else "Below Median"

    # Share of games whose owner count is strictly below the prediction.
    percentile = (custom_prediction_unscaled > dataset_owners).mean() * 100
    print(f"- Performance: {performance}")
    print(f"- Better than: {percentile:.1f}% of all games")
    print(f"- Dataset median: {median_owners:,.0f} (original)")
    print(f"- Dataset mean: {mean_owners:,.0f} (original)")

except Exception as e:
    # Deliberate best-effort: report the failure and let the rest of the cell continue.
    print(f"Error during prediction: {str(e)}")
    print("This might be due to feature mismatch or scaling issues.")

print("\nFeature analysis for custom game:")
# Not every estimator exposes feature_importances_; skip the ranking when it is absent.
importances = getattr(model, "feature_importances_", None)
if importances is None:
    print("Model does not expose feature_importances_; skipping feature analysis.")
else:
    feature_importance = pd.DataFrame(
        {"feature": feature_columns, "importance": importances}
    ).sort_values("importance", ascending=False)

    print("Top 5 most important features:")
    top_features = feature_importance.head(5)
    ranked = zip(top_features["feature"], top_features["importance"])
    for rank, (feature, importance) in enumerate(ranked, start=1):
        if X_custom is not None and feature in X_custom.columns:
            feature_value = X_custom[feature].iloc[0]
            print(f"{rank}. {feature}: {importance:.3f} (value: {feature_value:.3f})")
        else:
            print(f"{rank}. {feature}: {importance:.3f} (not available)")