# 🧠 Building an Expected Goals (xG) Model
This notebook processes football shot data to train an expected goals (xG) model using logistic regression.

In [None]:
# Uncomment and run if dependencies are not installed
# !pip install pandas numpy matplotlib seaborn mplsoccer scikit-learn


In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss, roc_auc_score, brier_score_loss, classification_report, make_scorer
from mplsoccer import Pitch
from matplotlib.colors import LinearSegmentedColormap
from sklearn.exceptions import ConvergenceWarning
import warnings

# Silence scikit-learn's ConvergenceWarning for the rest of the notebook.
# NOTE(review): this also hides genuine non-convergence of the logistic
# regressions fitted below -- consider removing it while debugging.
warnings.filterwarnings("ignore", category=ConvergenceWarning)

# Load data
# The CSV location can be overridden with the XG_DATA_PATH environment
# variable so the notebook also runs outside the original author's machine;
# when the variable is unset the original default path is used.
DEFAULT_DATA_PATH = '~/Documents/GitHub/youtube-videos/data/xg_model.csv'
file_path = os.path.expanduser(os.environ.get('XG_DATA_PATH', DEFAULT_DATA_PATH))
try:
    df = pd.read_csv(file_path)
except FileNotFoundError as err:
    # Re-raise with the fully expanded path so the failure is easy to diagnose.
    raise FileNotFoundError(f"File not found: {file_path}") from err


In [None]:
# Quick look at the raw frame before any cleaning: size, coordinate ranges
# and per-column missing-value counts.
print("Initial shape:", df.shape)
print(df[['x', 'y']].agg(['min', 'max']))
print("Missing values:\n", df.isna().sum())

# Remove own-goal rows FIRST, while the 'OwnGoal' flag still exists.
# (Dropping the column before filtering would make this check a no-op and
# leave own goals in the training data.) Rows where the flag is missing/NaN
# are kept, because NaN != True.
if 'OwnGoal' in df:
    df = df[df['OwnGoal'].ne(True)]

# Drop irrelevant or unwanted columns; errors='ignore' skips any that are
# absent, which also makes this cell safe to re-run.
cols_to_drop = ['DirectFreekick', 'DirectCorner', 'OwnGoal']
df = df.drop(columns=cols_to_drop, errors='ignore')

# Fill NaNs (reassign instead of inplace=True to avoid mutating a filtered view)
df = df.fillna(0)


In [None]:
# Cast to appropriate types.
# Columns are grouped by their target dtype and merged into one mapping.
float_cols = ['x', 'y']
str_cols = ['period', 'Zone']
bool_cols = [
    'is_goal', 'Assisted', 'IndividualPlay', 'RegularPlay',
    'LeftFoot', 'RightFoot', 'FromCorner', 'FirstTouch',
    'Head', 'BigChance', 'SetPiece', 'Volley',
    'FastBreak', 'ThrowinSetPiece', 'Penalty',
    'OneOnOne', 'KeyPass', 'OtherBodyPart',
]

dtype_map = {
    **dict.fromkeys(float_cols, float),
    **dict.fromkeys(str_cols, str),
    **dict.fromkeys(bool_cols, bool),
}

# errors='ignore' asks pandas to suppress casting exceptions rather than raise.
df = df.astype(dtype_map, errors='ignore')


In [None]:
# Straight-line distance from each shot to the point (100, 50), which this
# notebook treats as the goal location. The result is in the same units as
# the x/y columns.
dx = df["x"].sub(100)
dy = df["y"].sub(50)
df["shot_distance"] = np.hypot(dx, dy)


In [None]:
# One-hot encode the categorical columns; drop_first=True omits the first
# level of each so the indicator columns are not perfectly collinear.
categorical_cols = ["period", "Zone"]
df = pd.get_dummies(data=df, columns=categorical_cols, drop_first=True)


In [None]:
# Features are every remaining column except the target.
X = df.drop(columns='is_goal')
y = df['is_goal']

# Hold out 20% of the shots for evaluation. The split is stratified on the
# target so both partitions keep the same goal rate -- goals are typically a
# small minority of shots, and an unstratified split can skew the test-set
# metrics. random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


In [None]:
# Baseline: logistic regression with default regularisation settings.
model = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Column 1 of predict_proba is the probability of the second entry in
# model.classes_, used here as the goal probability (xG).
y_pred_proba = model.predict_proba(X_test)[:, 1]

# Report the three probability-quality metrics on the held-out set.
for label, metric in (
    ("Log Loss", log_loss),
    ("ROC AUC", roc_auc_score),
    ("Brier Score", brier_score_loss),
):
    print(f"{label}:", metric(y_test, y_pred_proba))


In [None]:
# Copy of the held-out features with the predicted goal probability attached.
final_df = X_test.assign(goal_probability=y_pred_proba)

# Colour ramp for the markers: red (low xG) -> yellow -> green (high xG).
colors = ['red', 'yellow', 'green']
cmap = LinearSegmentedColormap.from_list('xg_cmap', colors)

# Draw an Opta-coordinate pitch and plot every test shot on it.
pitch = Pitch(pitch_type='opta')
fig, ax = pitch.draw(figsize=(10, 8))

marker_style = dict(cmap=cmap, edgecolors='black', linewidth=0.5, s=100)
sc = pitch.scatter(
    final_df['x'],
    final_df['y'],
    c=final_df['goal_probability'],
    ax=ax,
    **marker_style,
)

# Slim vertical colour bar next to the pitch, then title and layout.
cbar = fig.colorbar(sc, ax=ax, orientation='vertical', fraction=0.02, pad=0.02)
cbar.set_label('xG Probability')
ax.set_title('Shot Map Colored by xG')
fig.tight_layout()
plt.show()


In [None]:
# Hyper-parameter search space. liblinear is the only solver listed because
# it supports both the L1 and L2 penalties being compared.
param_grid = {
    'penalty': ['l1', 'l2'],
    'C': [0.01, 0.1, 1, 10, 100],
    'solver': ['liblinear'],
    'max_iter': [100, 500, 1000]
}

# Use scikit-learn's built-in ROC AUC scorer. The previous
# make_scorer(roc_auc_score, needs_proba=True) relied on a keyword that was
# deprecated in scikit-learn 1.4 and removed in later releases, where it
# raises a TypeError. ROC AUC depends only on the ranking of the scores, so
# the built-in scorer selects the same model.
scorer = 'roc_auc'
grid_search = GridSearchCV(LogisticRegression(), param_grid, scoring=scorer, cv=3, verbose=1, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Best estimator, refit on the full training set by GridSearchCV.
best_model = grid_search.best_estimator_
# NOTE: this overwrites the baseline model's y_pred_proba; re-running the
# shot-map cell after this one will plot the tuned model's probabilities.
y_pred_proba = best_model.predict_proba(X_test)[:, 1]

print("Best Parameters:", grid_search.best_params_)
print("Best ROC AUC Score (CV):", grid_search.best_score_)
print("Test ROC AUC Score:", roc_auc_score(y_test, y_pred_proba))

# Hard goal / no-goal labels at a 0.5 probability threshold, used only for
# the classification report.
y_pred = (y_pred_proba > 0.5).astype(int)
print("\nClassification Report:\n", classification_report(y_test, y_pred))
