# Steam Games Success Prediction - Model Comparison

This notebook compares three previously trained regression models on the same train/test split:

- Random Forest
- Gradient Boosting
- XGBoost

## 1. Import Libraries and Setup

In [None]:
import sys

sys.path.append("../src")
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error
import joblib
import time
from data_preprocessing import base_pipeline, final_cleaning_pipeline, scaling_pipeline

# Seed NumPy's global random number generator so that any NumPy-based
# randomness in this notebook is repeatable from run to run.
np.random.seed(42)

## 2. Load and Preprocess Data

In [None]:
# Point the pipeline's "data_loading" step at the raw games file. The pipeline
# is then invoked with None as its input, so the data presumably comes from
# this path (confirm in data_preprocessing).
RAW_GAMES_CSV = "../data/raw/games.csv"
base_pipeline.set_params(data_loading__filepath=RAW_GAMES_CSV)

print("Loading and preprocessing data...")

# Three stages run back to back; each consumes the previous stage's output.
# The intermediate frames keep their own names so they can be inspected.
pre_outlier_df = base_pipeline.fit_transform(None)
pre_scaling_df = final_cleaning_pipeline.fit_transform(pre_outlier_df)
df = scaling_pipeline.fit_transform(pre_scaling_df)

available_columns = df.columns.tolist()
print(f"Dataset shape: {df.shape}")
print(f"Available columns: {available_columns}")

## 3. Prepare Features and Target

In [None]:
# Column the models predict.
target_column = "estimated_owners_calculated"
y = df[target_column].copy()

# Columns that must not be used as model inputs: the target itself plus the
# two playtime columns.
columns_to_ignore = [
    target_column,
    "average_playtime_forever",
    "median_playtime_forever",
]
ignored_names = set(columns_to_ignore)

# Every remaining numeric column becomes a feature, in DataFrame column order.
numeric_features = df.select_dtypes(include=[np.number]).columns
feature_columns = [name for name in numeric_features if name not in ignored_names]

X = df.loc[:, feature_columns].copy()

# Stratified train-test split (same split as in the individual notebooks).
# The continuous target is cut into equal-width bins and the bin index is
# used as the stratification label.
n_bins = 5
y_binned = pd.cut(y, bins=n_bins, labels=False)

split_options = dict(test_size=0.2, random_state=42, stratify=y_binned)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

print(f"Training set: {X_train.shape[0]} samples")
print(f"Test set: {X_test.shape[0]} samples")
print(f"Features: {X_train.shape[1]}")

## 4. Load Trained Models

In [None]:
def _load_artifacts(*artifact_paths):
    """Load each joblib file in the order given and return the objects as a list.

    joblib.load unpickles the file contents, so only trusted files should be
    passed here.
    """
    return [joblib.load(path) for path in artifact_paths]


# Each model directory holds three artifacts: the fitted model, the list of
# feature columns saved with it, and a scaling pipeline.
print("Loading Random Forest model...")
rf_model, rf_features, rf_scaler = _load_artifacts(
    "../models/random_forest/model.joblib",
    "../models/random_forest/feature_columns.joblib",
    "../models/random_forest/scaling_pipeline.joblib",
)
print(f"Random Forest loaded successfully. Features: {len(rf_features)}")

print("\nLoading Gradient Boosting model...")
gb_model, gb_features, gb_scaler = _load_artifacts(
    "../models/gradient_boosting/model.joblib",
    "../models/gradient_boosting/feature_columns.joblib",
    "../models/gradient_boosting/scaling_pipeline.joblib",
)
print(f"Gradient Boosting loaded successfully. Features: {len(gb_features)}")

print("\nLoading XGBoost model...")
xgb_model, xgb_features, xgb_scaler = _load_artifacts(
    "../models/xgb_regressor/model.joblib",
    "../models/xgb_regressor/feature_columns.joblib",
    "../models/xgb_regressor/scaling_pipeline.joblib",
)
print(f"XGBoost loaded successfully. Features: {len(xgb_features)}")

print("\nAll models loaded successfully!")

## 5. Model Predictions

In [None]:
print("Generating predictions...")


def _timed_predictions(model, feature_names):
    """Predict on the train and test splits and time the two calls together.

    The inputs are selected by the feature list that was saved alongside the
    model, so every model sees exactly the columns (and column order) it was
    saved with, independent of how this notebook ordered its own columns.

    Args:
        model: Fitted estimator exposing ``predict``.
        feature_names: Column names saved with the model.

    Returns:
        Tuple ``(train_pred, test_pred, elapsed_seconds)``.

    Raises:
        KeyError: If a saved feature name is not a column of ``X_train`` /
            ``X_test``.
    """
    columns = list(feature_names)
    # Column selection happens before the clock starts so that only the
    # predict calls are measured.
    train_inputs = X_train[columns]
    test_inputs = X_test[columns]

    # perf_counter is a monotonic, high-resolution clock intended for
    # measuring short durations; time.time() can jump with system clock changes.
    started = time.perf_counter()
    train_pred = model.predict(train_inputs)
    test_pred = model.predict(test_inputs)
    elapsed = time.perf_counter() - started
    return train_pred, test_pred, elapsed


rf_train_pred, rf_test_pred, rf_pred_time = _timed_predictions(rf_model, rf_features)
print(f"Random Forest prediction time: {rf_pred_time:.3f} seconds")

gb_train_pred, gb_test_pred, gb_pred_time = _timed_predictions(gb_model, gb_features)
print(f"Gradient Boosting prediction time: {gb_pred_time:.3f} seconds")

xgb_train_pred, xgb_test_pred, xgb_pred_time = _timed_predictions(xgb_model, xgb_features)
print(f"XGBoost prediction time: {xgb_pred_time:.3f} seconds")

print("\nAll predictions generated successfully!")

## 6. Calculate Performance Metrics

In [None]:
def calculate_metrics(y_true, y_pred):
    """Score predictions against the true values.

    Args:
        y_true: Observed target values.
        y_pred: Predicted target values, aligned with ``y_true``.

    Returns:
        Tuple ``(r2, mae, rmse)``: the R² score, the mean absolute error and
        the square root of the mean squared error.
    """
    mse = mean_squared_error(y_true, y_pred)
    return (
        r2_score(y_true, y_pred),
        mean_absolute_error(y_true, y_pred),
        np.sqrt(mse),
    )

# Predictions and timing per model, in the order the models are reported.
prediction_sets = {
    'Random Forest': (rf_train_pred, rf_test_pred, rf_pred_time),
    'Gradient Boosting': (gb_train_pred, gb_test_pred, gb_pred_time),
    'XGBoost': (xgb_train_pred, xgb_test_pred, xgb_pred_time),
}

# One record per model holding train/test R², MAE, RMSE and the prediction time.
models_metrics = {}
for model_name, (train_pred, test_pred, pred_time) in prediction_sets.items():
    train_r2, train_mae, train_rmse = calculate_metrics(y_train, train_pred)
    test_r2, test_mae, test_rmse = calculate_metrics(y_test, test_pred)
    models_metrics[model_name] = {
        'train_r2': train_r2, 'test_r2': test_r2,
        'train_mae': train_mae, 'test_mae': test_mae,
        'train_rmse': train_rmse, 'test_rmse': test_rmse,
        'pred_time': pred_time
    }

# Expose the scores under their per-model names as well; later cells use
# some of them (for example the test R² values in plot legends).
score_keys = ('train_r2', 'train_mae', 'train_rmse', 'test_r2', 'test_mae', 'test_rmse')
rf_train_r2, rf_train_mae, rf_train_rmse, rf_test_r2, rf_test_mae, rf_test_rmse = (
    models_metrics['Random Forest'][key] for key in score_keys
)
gb_train_r2, gb_train_mae, gb_train_rmse, gb_test_r2, gb_test_mae, gb_test_rmse = (
    models_metrics['Gradient Boosting'][key] for key in score_keys
)
xgb_train_r2, xgb_train_mae, xgb_train_rmse, xgb_test_r2, xgb_test_mae, xgb_test_rmse = (
    models_metrics['XGBoost'][key] for key in score_keys
)

# Rows are models, columns are metrics. The train-minus-test R² gap is used
# as the overfitting indicator.
comparison_df = pd.DataFrame(models_metrics).T
comparison_df['overfitting'] = comparison_df['train_r2'].sub(comparison_df['test_r2'])

heavy_rule = "=" * 80
light_rule = "-" * 80

print("Model Performance Comparison:")
print(heavy_rule)
print(f"{'Model':<18} {'Train R²':<10} {'Test R²':<10} {'Test MAE':<10} {'Test RMSE':<10} {'Overfitting':<12} {'Pred Time':<12}")
print(light_rule)

for model_name, metrics in models_metrics.items():
    overfitting = comparison_df.loc[model_name, 'overfitting']
    print(f"{model_name:<18} {metrics['train_r2']:<10.3f} {metrics['test_r2']:<10.3f} {metrics['test_mae']:<10.0f} {metrics['test_rmse']:<10.0f} {overfitting:<12.3f} {metrics['pred_time']:<12.3f}")

# Winner per metric: highest is best for R², lowest for everything else.
print("\n" + heavy_rule)
print("Best performing model by metric:")
print(f"Highest Test R²: {comparison_df['test_r2'].idxmax()} ({comparison_df['test_r2'].max():.3f})")
print(f"Lowest Test MAE: {comparison_df['test_mae'].idxmin()} ({comparison_df['test_mae'].min():.0f})")
print(f"Lowest Test RMSE: {comparison_df['test_rmse'].idxmin()} ({comparison_df['test_rmse'].min():.0f})")
print(f"Lowest Overfitting: {comparison_df['overfitting'].idxmin()} ({comparison_df['overfitting'].min():.3f})")
print(f"Fastest Prediction: {comparison_df['pred_time'].idxmin()} ({comparison_df['pred_time'].min():.3f}s)")

## 7. Performance Metrics Visualization

In [None]:
# 1. R² Score Comparison
# Grouped bars: one pair (training, test) per model, in models_metrics order.
models = list(models_metrics)
train_r2_scores = [scores['train_r2'] for scores in models_metrics.values()]
test_r2_scores = [scores['test_r2'] for scores in models_metrics.values()]

fig_r2 = go.Figure(
    data=[
        go.Bar(name='Training R²', x=models, y=train_r2_scores, marker_color='lightblue'),
        go.Bar(name='Test R²', x=models, y=test_r2_scores, marker_color='darkblue'),
    ]
)

r2_layout = dict(
    title='R² Score Comparison - Training vs Test',
    xaxis_title='Models',
    yaxis_title='R² Score',
    barmode='group',
    template='plotly_white',
    height=500,
)
fig_r2.update_layout(**r2_layout)
fig_r2.show()

# 2. Error Metrics Comparison
# Two side-by-side panels showing the test-set MAE and RMSE of every model.
test_mae_scores = [models_metrics[name]['test_mae'] for name in models]
test_rmse_scores = [models_metrics[name]['test_rmse'] for name in models]

fig_errors = make_subplots(
    rows=1,
    cols=2,
    subplot_titles=('Mean Absolute Error (MAE)', 'Root Mean Square Error (RMSE)'),
    specs=[[{"secondary_y": False}, {"secondary_y": False}]],
)

# (subplot column, trace name, values, bar color)
error_panels = [
    (1, 'MAE', test_mae_scores, 'orange'),
    (2, 'RMSE', test_rmse_scores, 'red'),
]
for panel_col, trace_name, scores, bar_color in error_panels:
    fig_errors.add_trace(
        go.Bar(x=models, y=scores, name=trace_name, marker_color=bar_color),
        row=1, col=panel_col,
    )

fig_errors.update_layout(
    title='Error Metrics Comparison (Test Set)',
    template='plotly_white',
    height=500,
    showlegend=False,
)
fig_errors.show()

def _colored_model_bars(values, title, y_label, color_scale):
    """Build a bar chart with one bar per model, colored by its own value."""
    return px.bar(
        x=models,
        y=values,
        title=title,
        labels={'x': 'Models', 'y': y_label},
        template='plotly_white',
        color=values,
        color_continuous_scale=color_scale,
    )


# 3. Overfitting Analysis
# Gap between training and test R² per model; the dashed line marks a gap of zero.
overfitting_scores = [
    models_metrics[name]['train_r2'] - models_metrics[name]['test_r2']
    for name in models
]

fig_overfitting = _colored_model_bars(
    overfitting_scores,
    'Overfitting Analysis (Train R² - Test R²)',
    'Overfitting (Train R² - Test R²)',
    'RdYlBu_r',
)
fig_overfitting.add_hline(y=0, line_dash="dash", line_color="black", annotation_text="No Overfitting")
fig_overfitting.update_layout(height=500, showlegend=False)
fig_overfitting.show()

# 4. Prediction Time Comparison
# Time recorded while generating each model's train and test predictions.
pred_times = [models_metrics[name]['pred_time'] for name in models]

fig_time = _colored_model_bars(
    pred_times,
    'Prediction Time Comparison',
    'Prediction Time (seconds)',
    'Viridis',
)
fig_time.update_layout(height=500, showlegend=False)
fig_time.show()

## 8. Predicted vs Actual Comparison

In [None]:
# One scatter panel per model: actual test targets on x, predictions on y.
panel_columns = [1, 2, 3]

fig_pred_actual = make_subplots(
    rows=1,
    cols=3,
    subplot_titles=('Random Forest', 'Gradient Boosting', 'XGBoost'),
    specs=[[{"secondary_y": False} for _ in panel_columns]],
)

# (predictions, legend entry including the test R², marker color), left to right.
scatter_panels = [
    (rf_test_pred, f'RF (R²={rf_test_r2:.3f})', 'blue'),
    (gb_test_pred, f'GB (R²={gb_test_r2:.3f})', 'green'),
    (xgb_test_pred, f'XGB (R²={xgb_test_r2:.3f})', 'red'),
]
for panel, (predictions, legend_name, marker_color) in zip(panel_columns, scatter_panels):
    fig_pred_actual.add_trace(
        go.Scatter(
            x=y_test,
            y=predictions,
            mode='markers',
            name=legend_name,
            marker=dict(color=marker_color, size=4, opacity=0.6),
        ),
        row=1, col=panel,
    )

# Diagonal y = x over the range of the actual values: points on this line are
# perfect predictions. Only the first panel contributes a legend entry.
min_val = y_test.min()
max_val = y_test.max()
diagonal = [min_val, max_val]

for panel in panel_columns:
    fig_pred_actual.add_trace(
        go.Scatter(
            x=diagonal,
            y=diagonal,
            mode='lines',
            name='Perfect Prediction',
            line=dict(color='black', dash='dash'),
            showlegend=(panel == 1),
        ),
        row=1, col=panel,
    )

fig_pred_actual.update_layout(
    title='Predicted vs Actual Values - Model Comparison',
    template='plotly_white',
    height=500,
    width=1200,
)

for panel in panel_columns:
    fig_pred_actual.update_xaxes(title_text="Actual Values", row=1, col=panel)
    fig_pred_actual.update_yaxes(title_text="Predicted Values", row=1, col=panel)

fig_pred_actual.show()

## 9. Feature Importance Comparison

In [None]:
def _importance_frame(model_name, model, feature_names):
    """Return one model's feature importances as a long-format DataFrame.

    The importances are labelled with the feature names that were saved next
    to the model rather than with this notebook's own column list, so a label
    always refers to the column the model actually used.

    Args:
        model_name: Display name stored in the ``model`` column.
        model: Fitted estimator exposing ``feature_importances_``.
        feature_names: Feature names saved with the model.

    Returns:
        DataFrame with the columns ``feature``, ``importance`` and ``model``.

    Raises:
        ValueError: If the number of saved names differs from the number of
            importances.
    """
    names = list(feature_names)
    importances = model.feature_importances_
    if len(names) != len(importances):
        raise ValueError(
            f"{model_name}: {len(names)} saved feature names but "
            f"{len(importances)} feature importances"
        )
    return pd.DataFrame({
        'feature': names,
        'importance': importances,
        'model': model_name
    })


rf_importance = _importance_frame('Random Forest', rf_model, rf_features)
gb_importance = _importance_frame('Gradient Boosting', gb_model, gb_features)
xgb_importance = _importance_frame('XGBoost', xgb_model, xgb_features)

all_importances = pd.concat([rf_importance, gb_importance, xgb_importance])

# Get top 10 features by average importance. The mean is taken over the
# models that actually have the feature.
avg_importance = all_importances.groupby('feature')['importance'].mean().sort_values(ascending=False)
top_features = avg_importance.head(10).index.tolist()

top_importances = all_importances[all_importances['feature'].isin(top_features)]

fig_importance = px.bar(
    top_importances,
    x='importance',
    y='feature',
    color='model',
    orientation='h',
    title='Top 10 Feature Importances - Model Comparison',
    labels={'importance': 'Feature Importance', 'feature': 'Features'},
    template='plotly_white',
    barmode='group',
    color_discrete_map={
        'Random Forest': 'blue',
        'Gradient Boosting': 'green',
        'XGBoost': 'red'
    }
)

# Reverse the ranking so the most important feature is drawn at the top of
# the horizontal bar chart.
feature_order = avg_importance.head(10).index[::-1]
fig_importance.update_layout(
    height=600,
    yaxis={'categoryorder': 'array', 'categoryarray': feature_order}
)
fig_importance.show()

# Per-model lookup tables (feature name -> importance), built once instead of
# filtering each frame again for every printed feature. If a name occurs more
# than once, the first occurrence is used.
rf_lookup = rf_importance.drop_duplicates('feature').set_index('feature')['importance']
gb_lookup = gb_importance.drop_duplicates('feature').set_index('feature')['importance']
xgb_lookup = xgb_importance.drop_duplicates('feature').set_index('feature')['importance']
missing = float("nan")  # printed as "nan" when a model does not have the feature

print("Top 10 Features by Average Importance:")
print("=" * 50)
for rank, (feature, avg_imp) in enumerate(avg_importance.head(10).items(), start=1):
    rf_imp = rf_lookup.get(feature, missing)
    gb_imp = gb_lookup.get(feature, missing)
    xgb_imp = xgb_lookup.get(feature, missing)

    print(f"{rank:2d}. {feature:<25}")
    print(f"    Average: {avg_imp:.3f}")
    print(f"    RF: {rf_imp:.3f}, GB: {gb_imp:.3f}, XGB: {xgb_imp:.3f}")
    print()