# Steam Games Success Prediction - XGBRegressor Model

## 1. Import Libraries and Setup


In [None]:
import sys
import os

sys.path.append("../src")
import pandas as pd
import plotly.express as px
import numpy as np
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error
import joblib
import time
from data_preprocessing import base_pipeline, final_cleaning_pipeline, scaling_pipeline

# Seed NumPy's global RNG so anything that draws from np.random is repeatable.
# The train/test split and the XGBoost model further down also pass
# random_state=42 explicitly.
np.random.seed(42)

## 2. Load and Preprocess Data


In [None]:
# Set the `filepath` parameter of the pipeline's `data_loading` step
# (`step__param` naming; base_pipeline is defined in data_preprocessing and is
# presumably an sklearn-style Pipeline). The path is relative to the current
# working directory.
base_pipeline.set_params(data_loading__filepath="../data/raw/games.csv")

print("Loading and preprocessing data...")
# No input frame is passed: presumably the data_loading step reads the CSV
# configured above — confirm against data_preprocessing.
pre_outlier_df = base_pipeline.fit_transform(None)
# Each stage consumes the previous stage's output: base -> final cleaning -> scaling.
pre_scaling_df = final_cleaning_pipeline.fit_transform(pre_outlier_df)
# NOTE(review): scaling_pipeline is fit on the full dataset here, before the
# train/test split in section 4. If it learns statistics from the data, the
# test rows influence the transform (leakage) — confirm this is acceptable.
# The frame still contains the target column at this point, so the target may
# be scaled as well — TODO confirm.
df = scaling_pipeline.fit_transform(pre_scaling_df)

# Quick sanity check of what came out of the pipelines.
print(f"Dataset shape: {df.shape}")
print(f"Available columns: {list(df.columns)}")

## 3. Define Target and Features


In [None]:
# Column the regressor is trained to predict.
target_column = "estimated_owners_calculated"
y = df.loc[:, target_column].copy()

# Columns kept out of the feature matrix: the target itself and the two
# playtime columns.
columns_to_ignore = [
    target_column,
    "average_playtime_forever",
    "median_playtime_forever",
]

# Features are every numeric column, in DataFrame order, that is not excluded.
numeric_features = df.select_dtypes(include=[np.number]).columns
keep_mask = ~numeric_features.isin(columns_to_ignore)
feature_columns = numeric_features[keep_mask].tolist()

X = df.loc[:, feature_columns].copy()

# Summary of the target and the selected features.
target_min, target_max = y.min(), y.max()
print(f"Target variable: {target_column}")
print(f"Target range: {target_min:.0f} - {target_max:.0f}")
print(f"Number of features: {len(feature_columns)}")
print(f"Features: {feature_columns}")
print("Target variable statistics:")
for stat_label, stat_value in (
    ("Mean", y.mean()),
    ("Median", y.median()),
    ("Std", y.std()),
):
    print(f"   {stat_label}: {stat_value:.0f}")

# Histogram of the target values.
histogram_options = {
    "nbins": 50,
    "title": "Distribution of Target Variable (Estimated Owners)",
    "labels": {"x": "Estimated Owners (scaled)", "y": "Frequency"},
    "template": "plotly_white",
}
fig = px.histogram(x=y, **histogram_options)
fig.update_layout(showlegend=False, width=800, height=400)
fig.show()

## 4. Stratified Train-Test Split


In [None]:
# A continuous target cannot be stratified directly, so it is first cut into
# `n_bins` equal-width intervals; labels=False yields the integer bin index
# (0 .. n_bins - 1) for every row.
n_bins = 5
y_binned = pd.cut(y, bins=n_bins, labels=False)

print("Bin distributions:")
# value_counts() only lists bins that actually contain samples, so the bin id
# must come from the index. Numbering the rows positionally would mislabel
# every bin that follows an empty one.
bin_counts = y_binned.value_counts().sort_index()
for bin_id, count in bin_counts.items():
    print(f"   Bin {int(bin_id)}: {count} samples")

# Stratifying on the binned target keeps the share of each target range the
# same in both splits.
# NOTE(review): scikit-learn raises a ValueError if any stratum has fewer than
# 2 samples; with equal-width bins on a skewed target that can happen — check
# the counts printed above.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y_binned
)

print(f"Training set: {X_train.shape[0]} samples")
print(f"Test set: {X_test.shape[0]} samples")
print(f"Features: {X_train.shape[1]}")

# Similar mean/std across the two splits indicates the stratification worked.
print("\nTarget statistics:")
print(f"   Train mean: {y_train.mean():.0f}")
print(f"   Test mean: {y_test.mean():.0f}")
print(f"   Train std: {y_train.std():.0f}")
print(f"   Test std: {y_test.std():.0f}")

## 5. Train XGBoost Model


In [None]:
print("Training XGBoost model...")
# perf_counter() is a monotonic, high-resolution clock, so the elapsed time
# cannot be skewed by system clock adjustments the way time.time() can.
start_time = time.perf_counter()

xgb_model = XGBRegressor(
    n_estimators=125,  # Number of trees (boosting rounds)
    # Maximum tree depth.
    # NOTE(review): far deeper than XGBoost's default of 6 — confirm this value
    # came from tuning and check the train/test gap reported in section 6.
    max_depth=35,
    min_child_weight=14,  # Minimum sum of instance weight required in a child node
    learning_rate=0.085,  # Step size shrinkage
    subsample=0.9,  # Subsample ratio of the training instances
    colsample_bytree=0.9,  # Subsample ratio of columns when constructing each tree
    objective="reg:squarederror",  # Regression objective
    random_state=42,  # For reproducibility
    n_jobs=-1,  # Use all CPU cores
)

xgb_model.fit(X_train, y_train)
training_time = time.perf_counter() - start_time

# Echo the configuration actually used, read back from the fitted estimator.
print(f"Training time: {training_time:.2f} seconds")
print(f"Number of trees: {xgb_model.n_estimators}")
print(f"Max depth: {xgb_model.max_depth}")
print(f"Learning rate: {xgb_model.learning_rate}")
print(f"Subsample: {xgb_model.subsample}")
print(f"Colsample bytree: {xgb_model.colsample_bytree}")
print(f"Min child weight: {xgb_model.min_child_weight}")
print(f"Number of features used: {xgb_model.n_features_in_}")

# Persist everything needed to reuse the model: the estimator, the ordered
# feature list it was trained on, and the fitted scaling pipeline.
models_dir = "../models/xgb_regressor"
os.makedirs(models_dir, exist_ok=True)

model_filename = "model.joblib"
model_path = os.path.join(models_dir, model_filename)

joblib.dump(xgb_model, model_path)
print(f"\nModel saved to: {model_path}")

feature_columns_path = os.path.join(models_dir, "feature_columns.joblib")
joblib.dump(feature_columns, feature_columns_path)
print(f"Feature columns saved to: {feature_columns_path}")

scaling_pipeline_path = os.path.join(models_dir, "scaling_pipeline.joblib")
joblib.dump(scaling_pipeline, scaling_pipeline_path)
print(f"Scaling pipeline saved to: {scaling_pipeline_path}")

## 6. Evaluate Model Performance


In [None]:
# Predict on both splits so train and test scores can be compared side by side.
y_train_pred, y_test_pred = (
    xgb_model.predict(split_features) for split_features in (X_train, X_test)
)

# Goodness of fit (R²), mean absolute error and root mean squared error per split.
train_r2, test_r2 = r2_score(y_train, y_train_pred), r2_score(y_test, y_test_pred)
train_mae, test_mae = (
    mean_absolute_error(y_train, y_train_pred),
    mean_absolute_error(y_test, y_test_pred),
)
train_rmse, test_rmse = (
    np.sqrt(mean_squared_error(y_train, y_train_pred)),
    np.sqrt(mean_squared_error(y_test, y_test_pred)),
)

# Assemble the whole report first and emit it with a single print call.
report_lines = [
    "Model Performance Metrics:",
    "=" * 40,
    "R² Score:",
    f"   Training: {train_r2:.3f}",
    f"   Test: {test_r2:.3f}",
    "\nMean Absolute Error:",
    f"   Training: {train_mae:.0f}",
    f"   Test: {test_mae:.0f}",
    "\nRoot Mean Square Error:",
    f"   Training: {train_rmse:.0f}",
    f"   Test: {test_rmse:.0f}",
    # A large positive gap means the model fits the training data much better
    # than unseen data.
    f"\nOverfitting measure (Train R² - Test R²): {train_r2 - test_r2:.3f}",
]
print("\n".join(report_lines))

# Pair each feature name with the importance reported by the fitted model,
# ordered from most to least important.
importance_table = pd.DataFrame(
    {"feature": feature_columns, "importance": xgb_model.feature_importances_}
)
feature_importance = importance_table.sort_values("importance", ascending=False)
top_features = feature_importance.head(10)

print("\nTop 10 Most Important Features:")
print("=" * 40)
ranked_rows = zip(top_features["feature"], top_features["importance"])
for rank, (feature, importance) in enumerate(ranked_rows, start=1):
    print(f"{rank:2d}. {feature:<25} {importance:.3f}")

# Horizontal bar chart of the same ten features; "total ascending" puts the
# largest bar at the top.
fig_importance = px.bar(
    top_features,
    x="importance",
    y="feature",
    orientation="h",
    title="Top 10 Feature Importances - XGBoost",
    labels={"importance": "Importance", "feature": "Features"},
    template="plotly_white",
)
fig_importance.update_layout(height=500, yaxis={"categoryorder": "total ascending"})
fig_importance.show()

# Scatter of test-set predictions against the true values.
pred_actual_df = pd.DataFrame({"actual": y_test, "predicted": y_test_pred})

scatter_options = {
    "x": "actual",
    "y": "predicted",
    "title": f"Predicted vs Actual (Test Set) - R² = {test_r2:.3f}",
    "labels": {"actual": "Actual Values", "predicted": "Predicted Values"},
    "template": "plotly_white",
}
fig_pred_actual = px.scatter(pred_actual_df, **scatter_options)

# Reference diagonal (y = x) spanning the combined range of actual and
# predicted values; points on this line are perfect predictions.
min_val = min(y_test.min(), y_test_pred.min())
max_val = max(y_test.max(), y_test_pred.max())
diagonal_ends = [min_val, max_val]
fig_pred_actual.add_scatter(
    x=diagonal_ends,
    y=diagonal_ends,
    mode="lines",
    name="Perfect Prediction",
    line={"color": "red", "dash": "dash"},
)
fig_pred_actual.update_layout(height=500, width=600)
fig_pred_actual.show()

# Residual = actual minus predicted, so positive values are under-predictions.
residuals = y_test.sub(y_test_pred)
residuals_df = pd.DataFrame({"predicted": y_test_pred, "residuals": residuals})

# Residuals against predictions, with a dashed zero line for reference.
residual_plot_options = {
    "x": "predicted",
    "y": "residuals",
    "title": "Residual Plot - XGBoost",
    "labels": {"predicted": "Predicted Values", "residuals": "Residuals"},
    "template": "plotly_white",
}
fig_residuals = px.scatter(residuals_df, **residual_plot_options)
fig_residuals.add_hline(y=0, line_dash="dash", line_color="red")
fig_residuals.update_layout(height=500, width=600)
fig_residuals.show()

# Overlaid histograms of predicted and actual test-set values. Passing a list
# of two array-likes as `x` produces one trace per entry, in list order.
distribution_options = {
    "nbins": 30,
    "title": "Distribution Comparison: Predicted vs Actual - XGBoost",
    "labels": {"x": "Values", "y": "Frequency"},
    "template": "plotly_white",
    "barmode": "overlay",
    "opacity": 0.7,
}
fig_dist = px.histogram(x=[y_test_pred, y_test], **distribution_options)

# Give the two traces readable legend names, matching the order of `x` above.
fig_dist.data[0].name = "Predicted"
fig_dist.data[1].name = "Actual"

fig_dist.update_layout(height=500, width=600)
fig_dist.show()