# Lineup → xG Regression (OLS with Player Dummies)

This notebook fits a simple regression model that predicts **team xG** from the **starting XI**.

We create a dummy variable for each player (1 if in XI, else 0) and fit:

xG = α + Σ β_p * 1[p in XI] + ε

Optionally (recommended), we also include team fixed effects to capture baseline team strength:
xG = α + Σ β_p * 1[p in XI] + Σ δ_team * 1[team] + ε

Because player dummies are correlated (players co-occur), we also fit Ridge as a more stable predictive baseline.

Finally, you can enter your proposed optimal XI and get predicted xG.

In [1]:
import ast
import numpy as np
import pandas as pd

from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import Ridge

import statsmodels.api as sm

In [2]:
DATA_PATH = "match_team_lineups_xg.csv"  # change path if needed
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,match_id,team_id,team_name,opponent_id,opponent_name,starting_11_players,team_xg
0,122838,33,FC Bayern München,38,SV Werder Bremen,"['Joshua Kimmich', 'Leroy Sané', 'Harry Kane',...",2.58382
1,122838,38,SV Werder Bremen,33,FC Bayern München,"['Mitchell Weiser', 'Milos Veljkovic', 'Leonar...",0.80446
2,122839,37,RasenBallsport Leipzig,41,Bayer 04 Leverkusen,"['Timo Werner', 'Willi Orbán', 'Benjamin Henri...",1.34721
3,122839,41,Bayer 04 Leverkusen,37,RasenBallsport Leipzig,"['Granit Xhaka', 'Jonathan Tah', 'Lukas Hradec...",1.26938
4,122840,30,VfL Wolfsburg,432,1. FC Heidenheim 1846,"['Yannick Gerhardt', 'Koen Casteels', 'Jonas W...",2.39975


In [3]:
def parse_player_list(x):
    if isinstance(x, list):
        return [str(p).strip() for p in x]
    if pd.isna(x):
        return []
    s = str(x).strip()
    try:
        out = ast.literal_eval(s)
        if isinstance(out, list):
            return [str(p).strip() for p in out]
    except Exception:
        pass
    s = s.strip("[]")
    parts = [p.strip().strip("'").strip('"') for p in s.split(",") if p.strip()]
    return parts

df["xi_list"] = df["starting_11_players"].apply(parse_player_list)
df["xi_len"] = df["xi_list"].apply(len)

print("Rows:", len(df))
print(df["xi_len"].value_counts().head(10))

df = df[(df["xi_len"] == 11) & df["team_xg"].notna()].copy()
print("Rows after filtering:", len(df))
df.head()

Rows: 612
xi_len
11    612
Name: count, dtype: int64
Rows after filtering: 612


Unnamed: 0,match_id,team_id,team_name,opponent_id,opponent_name,starting_11_players,team_xg,xi_list,xi_len
0,122838,33,FC Bayern München,38,SV Werder Bremen,"['Joshua Kimmich', 'Leroy Sané', 'Harry Kane',...",2.58382,"[Joshua Kimmich, Leroy Sané, Harry Kane, Kings...",11
1,122838,38,SV Werder Bremen,33,FC Bayern München,"['Mitchell Weiser', 'Milos Veljkovic', 'Leonar...",0.80446,"[Mitchell Weiser, Milos Veljkovic, Leonardo Bi...",11
2,122839,37,RasenBallsport Leipzig,41,Bayer 04 Leverkusen,"['Timo Werner', 'Willi Orbán', 'Benjamin Henri...",1.34721,"[Timo Werner, Willi Orbán, Benjamin Henrichs, ...",11
3,122839,41,Bayer 04 Leverkusen,37,RasenBallsport Leipzig,"['Granit Xhaka', 'Jonathan Tah', 'Lukas Hradec...",1.26938,"[Granit Xhaka, Jonathan Tah, Lukas Hradecky, J...",11
4,122840,30,VfL Wolfsburg,432,1. FC Heidenheim 1846,"['Yannick Gerhardt', 'Koen Casteels', 'Jonas W...",2.39975,"[Yannick Gerhardt, Koen Casteels, Jonas Wind, ...",11


In [4]:
USE_TEAM_FIXED_EFFECTS = True  # recommended

mlb = MultiLabelBinarizer(sparse_output=False)
X_players = mlb.fit_transform(df["xi_list"])
player_cols = [f"p::{p}" for p in mlb.classes_]
X_players = pd.DataFrame(X_players, columns=player_cols, index=df.index)

if USE_TEAM_FIXED_EFFECTS:
    X_team = pd.get_dummies(df["team_name"], prefix="team", drop_first=True)
    X = pd.concat([X_players, X_team], axis=1)
else:
    X = X_players.copy()

y = df["team_xg"].astype(float)

print("Design matrix shape:", X.shape)
print("Num unique players:", len(player_cols))

Design matrix shape: (612, 438)
Num unique players: 421


In [None]:
import numpy as np
import statsmodels.api as sm

# 1) Force everything numeric and finite
X_num = X.apply(pd.to_numeric, errors="coerce").replace([np.inf, -np.inf], np.nan).fillna(0.0)
y_num = pd.to_numeric(y, errors="coerce").replace([np.inf, -np.inf], np.nan)

mask = y_num.notna()
X_num = X_num.loc[mask]
y_num = y_num.loc[mask]

# 2) Convert to numpy float arrays (this avoids the "dtype object" path in statsmodels)
X_np = sm.add_constant(X_num.to_numpy(dtype=float), has_constant="add")
y_np = y_num.to_numpy(dtype=float)

print("X_np dtype:", X_np.dtype, "shape:", X_np.shape)
print("y_np dtype:", y_np.dtype, "shape:", y_np.shape)

print("y dtype:", y.dtype)
print("X has object columns?", (X.dtypes == "object").any())
print("Object cols:", X.columns[X.dtypes == "object"].tolist()[:20])
print("Any NaN in X?", X.isna().any().any(), " Any inf in X?", np.isinf(X.select_dtypes(include=[np.number]).to_numpy()).any())

# 3) Fit OLS
ols_model = sm.OLS(y_np, X_np).fit(cov_type="HC3")
print(ols_model.summary())

const               0.816625
p::Aaron Seydel    -0.225369
p::Adam Hlozek      0.142294
p::Adrian Beck      0.051980
p::Alassane Pléa    0.812121
dtype: float64


In [None]:

def build_feature_row(team_name: str, xi: list[str]) -> pd.DataFrame:
    xi_set = set([str(p).strip() for p in xi])

    row_players = np.zeros(len(player_cols), dtype=int)
    for j, col in enumerate(player_cols):
        pname = col.replace("p::", "")
        row_players[j] = int(pname in xi_set)

    row = pd.DataFrame([row_players], columns=player_cols)

    if USE_TEAM_FIXED_EFFECTS:
        for c in team_cols:
            row[c] = 0

        # build the dummies in the same way as training (no drop_first here, then align)
        tmp = pd.get_dummies(pd.Series([team_name]), prefix="team")
        for c in tmp.columns:
            if c in row.columns:
                row.loc[0, c] = int(tmp.loc[0, c])

        # If training used drop_first=True, then the baseline team has no column.
        # This alignment handles that automatically (it just won't set anything).
        # 
    row = row.reindex(columns=X.columns, fill_value=0)
    return row





ValueError: Pandas data cast to numpy dtype of object. Check input data with np.asarray(data).

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=7)

ols_tt = sm.OLS(y_train, sm.add_constant(X_train, has_constant="add")).fit()
pred = ols_tt.predict(sm.add_constant(X_test, has_constant="add"))

rmse = mean_squared_error(y_test, pred, squared=False)
r2 = r2_score(y_test, pred)

print(f"OLS Test RMSE: {rmse:.4f}")
print(f"OLS Test R^2 : {r2:.4f}")

In [None]:
ridge_alpha = 1.0  # tune if needed
ridge = Ridge(alpha=ridge_alpha, fit_intercept=True, random_state=7)
ridge.fit(X_train, y_train)

pred_r = ridge.predict(X_test)
rmse_r = mean_squared_error(y_test, pred_r, squared=False)
r2_r = r2_score(y_test, pred_r)

print(f"Ridge(alpha={ridge_alpha}) Test RMSE: {rmse_r:.4f}")
print(f"Ridge(alpha={ridge_alpha}) Test R^2 : {r2_r:.4f}")

In [None]:
PROPOSED_TEAM_NAME = "FC Bayern München"
PROPOSED_XI = [
    # Replace with your optimizer output
    "Manuel Neuer",
    "Alphonso Davies",
    "Dayot Upamecano",
    "Min-jae Kim",
    "Joshua Kimmich",
    "Konrad Laimer",
    "Jamal Musiala",
    "Thomas Müller",
    "Kingsley Coman",
    "Harry Kane",
    "Leroy Sané",
}

assert len(PROPOSED_XI) == 11, "PROPOSED_XI must have exactly 11 players."

def build_feature_row(team_name: str, xi: list[str]) -> pd.DataFrame:
    xi_set = set([str(p).strip() for p in xi])

    row_players = np.zeros(len(player_cols), dtype=int)
    for j, col in enumerate(player_cols):
        pname = col.replace("p::", "")
        row_players[j] = int(pname in xi_set)

    row = pd.DataFrame([row_players], columns=player_cols)

    if USE_TEAM_FIXED_EFFECTS:
        for c in team_cols:
            row[c] = 0

        # build the dummies in the same way as training (no drop_first here, then align)
        tmp = pd.get_dummies(pd.Series([team_name]), prefix="team")
        for c in tmp.columns:
            if c in row.columns:
                row.loc[0, c] = int(tmp.loc[0, c])

        # If training used drop_first=True, then the baseline team has no column.
        # This alignment handles that automatically (it just won't set anything).
        # 
    row = row.reindex(columns=X.columns, fill_value=0)
    return row

x_row = build_feature_row(PROPOSED_TEAM_NAME, PROPOSED_XI)

pred_ols = ols_model.predict(sm.add_constant(x_row, has_constant="add")).iloc[0]

ridge_full = Ridge(alpha=ridge_alpha, fit_intercept=True, random_state=7).fit(X, y)
pred_ridge = ridge_full.predict(x_row)[0]

print("Predicted xG (OLS):  ", float(pred_ols))
print("Predicted xG (Ridge):", float(pred_ridge))

seen = set(mlb.classes_)
unseen = [p for p in PROPOSED_XI if p not in seen]
if unseen:
    print("\nWARNING: Unseen players (ignored in player dummies):")
    for p in unseen:
        print(" -", p)

In [None]:
coef = ols_model.params.drop(labels=["const"], errors="ignore")
player_coef = coef[[c for c in coef.index if c.startswith("p::")]].sort_values(ascending=False)

print("Top +xG players (OLS coefficient):")
display(player_coef.head(20))

print("\nTop -xG players (OLS coefficient):")
display(player_coef.tail(20))