# Model Training and Evaluation

This notebook loads the preprocessed shot data, trains several xG models (with and without geometry features), and evaluates them against actual goals and the StatsBomb xG values.

## Imports

In [None]:
import pandas as pd
import numpy as np
from pathlib import Path

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score, brier_score_loss

from xgboost import XGBClassifier

import matplotlib.pyplot as plt


## Load preprocessed shots

In [None]:
# Path to the preprocessed shot table; relative, so it is resolved against the
# notebook's current working directory.
data_path = Path("shots_featured_wc2018.csv")
shots_featured = pd.read_csv(data_path)
# Preview the first rows as a quick sanity check on the loaded columns.
shots_featured.head()


## Define target and feature sets

We keep `shot_statsbomb_xg` and `is_goal` for evaluation. We build two feature matrices:

- **Raw features** (`X_raw`): all features *except* distance and angle (this mimics your early models).
- **Featured geometry** (`X_feat`): all features including distance and angle.

In [None]:
# Target: the `is_goal` column cast to int (expected to be a 0/1 indicator).
y = shots_featured["is_goal"].astype(int)

# Base feature set: every column except the label and the StatsBomb reference
# xG. Both stay available in `shots_featured` for evaluation later on.
X_full = shots_featured.drop(columns=["shot_statsbomb_xg", "is_goal"])

# Raw features: no distance & angle.
# NOTE: only columns that are actually present are dropped, so if the input
# file lacks "distance"/"angle" this is a silent no-op and X_raw == X_feat.
drop_geom = [col for col in ["distance", "angle"] if col in X_full.columns]
X_raw = X_full.drop(columns=drop_geom)

# Featured geometry = all features (independent copy of the base set).
X_feat = X_full.copy()

# Use display() rather than a bare tuple as the last expression: a tuple of
# DataFrames is rendered as one plain-text repr, while display() renders each
# frame as its own rich table.
display(X_raw.head(), X_feat.head())


## Train/test split (shared indices)

We split once using `X_feat` and `y`, then use the resulting indices to build matching train/test sets for `X_raw`.


In [None]:
# Hold out 20% of the shots for testing. The split is stratified on the target
# so that train and test keep the same goal rate as the full data set; with an
# unstratified split the number of goals in the test set varies with the seed,
# which makes AUC/Brier and total-xG comparisons noisier.
X_train_feat, X_test_feat, y_train, y_test = train_test_split(
    X_feat, y, test_size=0.2, random_state=42, stratify=y
)

# Align raw feature splits using the same indices
X_train_raw = X_raw.loc[X_train_feat.index]
X_test_raw = X_raw.loc[X_test_feat.index]

# Guard the label-based alignment: if the index ever contained duplicates,
# .loc would return extra rows and the raw/featured splits would diverge.
assert X_train_raw.index.equals(X_train_feat.index), "raw/featured train rows misaligned"
assert X_test_raw.index.equals(X_test_feat.index), "raw/featured test rows misaligned"

X_train_feat.shape, X_train_raw.shape, X_test_feat.shape, X_test_raw.shape


## 1. Logistic Regression (raw features, no geometry)

This corresponds to your early logistic regression model based on x, y and categorical/boolean features, but without distance and angle.

In [None]:
# Baseline logistic regression on the raw (no distance/angle) features.
# max_iter is raised to 2000 to give the solver more iterations to converge.
lr_raw = LogisticRegression(max_iter=2000).fit(X_train_raw, y_train)

# predict_proba returns one column per class; column 1 is the second class
# (is_goal == 1 when both classes are present in y_train).
lr_raw_pred_test = lr_raw.predict_proba(X_test_raw)[:, 1]
lr_raw_pred_train = lr_raw.predict_proba(X_train_raw)[:, 1]


## 2. XGBoost (raw features, no geometry)

This is the analogue of your strong early XGBoost model that only used raw coordinates and simple features.

In [None]:
# Hyperparameters for the raw-feature gradient-boosted model, collected in one
# place so they are easy to inspect and tweak.
xgb_raw_params = {
    "n_estimators": 300,
    "learning_rate": 0.05,
    "max_depth": 5,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "random_state": 42,
    "eval_metric": "logloss",
}
xgb_raw = XGBClassifier(**xgb_raw_params)

# Train on the raw (no distance/angle) feature matrix.
xgb_raw.fit(X_train_raw, y_train)

# Keep only column 1 of predict_proba, i.e. the probability of the second
# class (is_goal == 1 when both classes are present in y_train).
xgb_raw_pred_test = xgb_raw.predict_proba(X_test_raw)[:, 1]
xgb_raw_pred_train = xgb_raw.predict_proba(X_train_raw)[:, 1]


## 3. Logistic Regression (featured geometry, unscaled)

This corresponds to your `lr_featured` model: distance and angle included, but no scaling.

In [None]:
# Logistic regression on the full feature set (geometry included), fitted on
# the unscaled columns. max_iter is raised to 2000 to allow more solver steps.
lr_featured = LogisticRegression(max_iter=2000).fit(X_train_feat, y_train)

# Column 1 of predict_proba is the probability of the second class
# (is_goal == 1 when both classes are present in y_train).
lr_featured_pred_test = lr_featured.predict_proba(X_test_feat)[:, 1]
lr_featured_pred_train = lr_featured.predict_proba(X_train_feat)[:, 1]


## 4. XGBoost (featured geometry)

This corresponds to your `xgb_featured` model that uses distance and angle.

In [None]:
# Hyperparameters for the geometry-aware gradient-boosted model; the values
# match the raw-feature XGBoost model so only the feature set differs.
xgb_featured_params = {
    "n_estimators": 300,
    "learning_rate": 0.05,
    "max_depth": 5,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "random_state": 42,
    "eval_metric": "logloss",
}
xgb_featured = XGBClassifier(**xgb_featured_params)

# Train on the full feature matrix (distance and angle included).
xgb_featured.fit(X_train_feat, y_train)

# Keep only column 1 of predict_proba, i.e. the probability of the second
# class (is_goal == 1 when both classes are present in y_train).
xgb_featured_pred_test = xgb_featured.predict_proba(X_test_feat)[:, 1]
xgb_featured_pred_train = xgb_featured.predict_proba(X_train_feat)[:, 1]


## 5. Scaled Logistic Regression (featured geometry)

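We standardize all features — fitting the scaler on the training split only and applying the same transformation to the test split, so no test-set statistics leak into training — and then fit logistic regression. This corresponds to your `lr_scaled` model.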

In [None]:
# Learn the per-feature mean and standard deviation from the training split
# only, then apply that same transformation to both splits.
scaler = StandardScaler().fit(X_train_feat)
X_train_scaled = scaler.transform(X_train_feat)
X_test_scaled = scaler.transform(X_test_feat)

# Logistic regression on the standardized features.
lr_scaled = LogisticRegression(max_iter=2000).fit(X_train_scaled, y_train)

# Column 1 of predict_proba is the probability of the second class
# (is_goal == 1 when both classes are present in y_train).
lr_scaled_pred_test = lr_scaled.predict_proba(X_test_scaled)[:, 1]
lr_scaled_pred_train = lr_scaled.predict_proba(X_train_scaled)[:, 1]


## Build comparison dataframe (test set)

We collect StatsBomb xG, actual goals, and predictions from all five models.

In [None]:
# Reference values for the held-out shots, pulled by the test split's index
# labels so they line up row-for-row with the model predictions below.
statsbomb_xg_test = shots_featured.loc[X_test_feat.index, "shot_statsbomb_xg"].values
goals_test = y_test.values

# One column per source: StatsBomb xG, the actual outcome, then each model's
# predicted goal probability on the test set.
comparison_columns = {
    "shot_statsbomb_xg": statsbomb_xg_test,
    "is_goal": goals_test,
    "lr_raw": lr_raw_pred_test,
    "xgb_raw": xgb_raw_pred_test,
    "lr_featured": lr_featured_pred_test,
    "xgb_featured": xgb_featured_pred_test,
    "lr_scaled": lr_scaled_pred_test,
}
comparison_df = pd.DataFrame(comparison_columns)

comparison_df.head()


## Summary statistics

In [None]:
# Per-column summary statistics (count, mean, std, min, quartiles, max) for
# the test-set comparison table.
comparison_df.describe()


## Total xG vs actual goals

In [None]:
# (printed label, comparison_df column) pairs, in reporting order. Summing a
# probability column gives that model's total expected goals on the test set.
total_report_rows = [
    ("Actual goals (test):", "is_goal"),
    ("StatsBomb total xG (test):", "shot_statsbomb_xg"),
    ("LR raw total xG (test):", "lr_raw"),
    ("XGB raw total xG (test):", "xgb_raw"),
    ("LR featured total xG (test):", "lr_featured"),
    ("XGB featured total xG (test):", "xgb_featured"),
    ("LR scaled total xG (test):", "lr_scaled"),
]

for report_label, report_column in total_report_rows:
    print(report_label, comparison_df[report_column].sum())


## Correlation with StatsBomb xG

In [None]:
# Columns holding xG-style probabilities: the StatsBomb reference plus each of
# the five models. `is_goal` is deliberately left out of this matrix.
xg_columns = [
    "shot_statsbomb_xg",
    "lr_raw",
    "xgb_raw",
    "lr_featured",
    "xgb_featured",
    "lr_scaled",
]

# Pairwise correlation (pandas default method) between those columns.
comparison_df.loc[:, xg_columns].corr()


## AUC and Brier scores vs actual goals

In [None]:
# Ground truth and the prediction columns to score, in reporting order.
goal_labels = comparison_df["is_goal"]
scored_columns = ["lr_raw", "xgb_raw", "lr_featured", "xgb_featured", "lr_scaled"]

# ROC AUC per model, unpacked into the individual names in the same order as
# `scored_columns`.
auc_lr_raw, auc_xgb_raw, auc_lr_featured, auc_xgb_featured, auc_lr_scaled = (
    roc_auc_score(goal_labels, comparison_df[scored]) for scored in scored_columns
)

# Brier score per model, unpacked the same way.
brier_lr_raw, brier_xgb_raw, brier_lr_featured, brier_xgb_featured, brier_lr_scaled = (
    brier_score_loss(goal_labels, comparison_df[scored]) for scored in scored_columns
)

# Labels are padded to a common width so the printed scores line up.
score_labels = [
    "LR raw:       ",
    "XGB raw:      ",
    "LR featured:  ",
    "XGB featured: ",
    "LR scaled:    ",
]
auc_values = [auc_lr_raw, auc_xgb_raw, auc_lr_featured, auc_xgb_featured, auc_lr_scaled]
brier_values = [
    brier_lr_raw,
    brier_xgb_raw,
    brier_lr_featured,
    brier_xgb_featured,
    brier_lr_scaled,
]

print("=== AUC ===")
for score_label, score_value in zip(score_labels, auc_values):
    print(score_label, score_value)

print("\n=== Brier ===")
for score_label, score_value in zip(score_labels, brier_values):
    print(score_label, score_value)


## Scatterplots vs StatsBomb xG

In [None]:
# One panel per model, sharing both axes so the panels are directly comparable.
fig, axes = plt.subplots(1, 3, figsize=(18, 5), sharex=True, sharey=True)

# (comparison_df column, panel title) for each of the three panels.
scatter_panels = [
    ("lr_raw", "LR raw vs StatsBomb"),
    ("xgb_raw", "XGB raw vs StatsBomb"),
    ("xgb_featured", "XGB featured vs StatsBomb"),
]

for panel_ax, (model_column, panel_title) in zip(axes, scatter_panels):
    # x = StatsBomb reference xG, y = this model's predicted xG.
    panel_ax.scatter(
        comparison_df["shot_statsbomb_xg"], comparison_df[model_column], alpha=0.3
    )
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel("StatsBomb xG")

# The y axis is shared, so only the leftmost panel gets the label.
axes[0].set_ylabel("Model xG")

plt.tight_layout()
plt.show()


## (Optional) Save models to disk

Uncomment and run this cell if you want to persist the trained models and scaler as `.pkl` files.

In [None]:
# import joblib
# joblib.dump(lr_raw, "lr_raw_wc2018.pkl")
# joblib.dump(xgb_raw, "xgb_raw_wc2018.pkl")
# joblib.dump(lr_featured, "lr_featured_wc2018.pkl")
# joblib.dump(xgb_featured, "xgb_featured_wc2018.pkl")
# joblib.dump(lr_scaled, "lr_scaled_wc2018.pkl")
# joblib.dump(scaler, "scaler_wc2018.pkl")
# print("Saved models and scaler to disk.")
