# 03 - Modeling
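Train and evaluate classifiers (a logistic regression baseline and a random forest) that predict the binary `Hit` label, then inspect feature importance and SHAP explanations.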

In [None]:
# Imports for modeling
from pathlib import Path
import sys
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, StratifiedKFold, cross_validate
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    classification_report,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
)

# Expose the parent directory of the working directory on sys.path so that
# the project-local `src` package can be imported from this notebook.
project_root = Path('..').resolve()
root_entry = str(project_root)
if root_entry not in sys.path:
    sys.path.append(root_entry)

from src.preprocessing import (
    clean_dataset,
    add_hit_label,
    encode_categoricals,
)

# Global seaborn look for every figure drawn below.
sns.set_theme(style="whitegrid")

In [None]:
# Load data
raw_path = Path('..') / 'data' / 'raw' / 'vg_sales_2024.csv'
df = pd.read_csv(raw_path)

# Clean + label + encode
sales_col = 'total_sales'
label_col = 'Hit'
df = clean_dataset(df)
df = add_hit_label(df, sales_col=sales_col, threshold=1.0, label_col=label_col)

# Choose features: numeric + encoded categoricals
feat_df = encode_categoricals(df, columns=("genre", "platform", "publisher"), drop_first=True)

# Separate X, y
y = feat_df[label_col]

# The label is derived from `sales_col`, so leaving that column in X lets the
# models read the answer straight off the features (target leakage).
# Any other sales column is treated the same way.
# NOTE(review): assumes regional sales columns follow the `*_sales` naming
# after clean_dataset -- confirm against the cleaned schema.
leak_cols = [
    col for col in feat_df.columns
    if col == sales_col or str(col).endswith('_sales')
]
X = feat_df.drop(columns=[label_col, *leak_cols])

# Keep only model-ready columns. 'number' alone excludes bool columns, which
# would drop one-hot indicators if the encoder emits booleans.
# NOTE(review): encode_categoricals' output dtype is not visible here -- confirm.
X = X.select_dtypes(include=['number', 'bool']).copy()

print(f"X shape: {X.shape}, y positive rate: {y.mean():.3f}")

In [None]:
# Hold out 20% of the rows for testing; stratifying on y keeps the share of
# positive labels comparable between the two partitions.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Report partition sizes and the positive rate in each partition.
size_part = f"Train size: {X_train.shape[0]}, Test size: {X_test.shape[0]}, "
rate_part = f"Pos rate (train/test): {y_train.mean():.3f}/{y_test.mean():.3f}"
print(size_part + rate_part)

In [None]:
# Logistic Regression baseline
# max_iter is raised above the library default to give the solver more
# iterations to converge.
log_reg = LogisticRegression(max_iter=2000)
log_reg.fit(X_train, y_train)

pred_lr = log_reg.predict(X_test)

print("Logistic Regression - Classification Report:\n")
# Reuse the classification_report imported at the top of the notebook;
# zero_division=0 matches the scalar metrics computed below.
print(classification_report(y_test, pred_lr, digits=3, zero_division=0))

# Scalar test-set metrics, kept in a dict for the model comparison table.
metrics_lr = {
    'accuracy': accuracy_score(y_test, pred_lr),
    'precision': precision_score(y_test, pred_lr, zero_division=0),
    'recall': recall_score(y_test, pred_lr, zero_division=0),
    'f1': f1_score(y_test, pred_lr, zero_division=0),
}
metrics_lr

In [None]:
# Random Forest classifier: 300 trees, no depth limit, fixed seed, and all
# available cores used for fitting.
rf_params = dict(n_estimators=300, max_depth=None, random_state=42, n_jobs=-1)
rf = RandomForestClassifier(**rf_params)
rf.fit(X_train, y_train)

pred_rf = rf.predict(X_test)

print("Random Forest - Metrics:\n")
# Accuracy takes no zero_division argument, so it is computed separately;
# the remaining metrics share one call signature.
metrics_rf = {'accuracy': accuracy_score(y_test, pred_rf)}
for metric_name, metric_fn in (
    ('precision', precision_score),
    ('recall', recall_score),
    ('f1', f1_score),
):
    metrics_rf[metric_name] = metric_fn(y_test, pred_rf, zero_division=0)
metrics_rf

In [None]:
# Side-by-side test-set metrics: one row per model, one column per metric.
model_labels = ["LogReg", "RandForest"]
model_metrics = [metrics_lr, metrics_rf]
comparison = pd.DataFrame(data=model_metrics, index=model_labels)
print(comparison)
comparison

In [None]:
# 5-fold stratified cross-validation of the Random Forest on the full X, y.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Each scorer is registered under its own name.
scoring = {name: name for name in ('accuracy', 'precision', 'recall', 'f1')}

cv_results = cross_validate(
    rf,
    X,
    y,
    cv=cv,
    scoring=scoring,
    n_jobs=-1,
    return_train_score=False,
)

# Keep only the per-fold test scores (timing entries are discarded).
test_scores = {
    key: scores for key, scores in cv_results.items() if key.startswith('test_')
}
cv_summary = pd.DataFrame(test_scores)

# Mean and standard deviation across folds, one row per metric.
pd.DataFrame({'mean': cv_summary.mean(), 'std': cv_summary.std()})

In [None]:
# Feature importance plot for Random Forest
import numpy as np

importances = rf.feature_importances_
# Feature positions sorted from most to least important.
indices = np.argsort(importances)[::-1]
feat_names = X.columns.values

top_n = 20
# Slicing caps at the number of available features, so `sel` may hold fewer
# than `top_n` entries.
sel = indices[:top_n]
plt.figure(figsize=(8, 8))
# seaborn 0.13 deprecates `palette` without `hue`; mapping the bar labels to
# `hue` keeps the per-bar colouring. `dodge=False` keeps full-width bars on
# older releases, and the redundant legend is removed if one was drawn.
ax = sns.barplot(
    x=importances[sel],
    y=feat_names[sel],
    hue=feat_names[sel],
    palette="mako",
    dodge=False,
)
bar_legend = ax.get_legend()
if bar_legend is not None:
    bar_legend.remove()
# Report the number of bars actually drawn rather than the requested maximum.
plt.title(f"Random Forest Feature Importance (Top {len(sel)})")
plt.xlabel("Importance")
plt.ylabel("Feature")
plt.tight_layout()
plt.show()

## SHAP Explainability

Compute SHAP values for the Random Forest classifier to understand feature impact on the Hit prediction. We focus on the probability of the positive class (Hit = 1).

In [None]:
import shap
shap.initjs()

# Use a TreeExplainer for RandomForestClassifier
explainer = shap.TreeExplainer(rf, feature_names=X.columns)

# SHAP values for the test set. Depending on the installed SHAP version this
# is either a list with one (n_samples, n_features) array per class, or a
# single array shaped (n_samples, n_features, n_classes).
shap_values = explainer.shap_values(X_test)

# Determine the positive class index from the fitted model rather than from
# len(shap_values), which counts samples (not classes) for the array form.
pos_class_idx = 1 if len(rf.classes_) > 1 else 0

# Extract the (n_samples, n_features) slice for the positive class.
if isinstance(shap_values, list):
    shap_values_pos = shap_values[pos_class_idx]
elif getattr(shap_values, "ndim", 2) == 3:
    shap_values_pos = shap_values[:, :, pos_class_idx]
else:
    # Already a single 2-D array of per-feature contributions.
    shap_values_pos = shap_values

# Summary plot (beeswarm) for positive class.
# plot_size=None stops summary_plot from resizing the figure created here.
plt.figure(figsize=(10, 6))
shap.summary_plot(
    shap_values_pos,
    X_test,
    feature_names=X.columns,
    show=False,
    plot_size=None,
)
plt.title("SHAP Summary Plot (Hit=1)")
plt.tight_layout()
plt.show()