# 02 - Modeling

> Scaffold notebook — fill with data and code.

## Setup

In [None]:
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import brier_score_loss
from sklearn.model_selection import train_test_split


## Feature engineering

In [None]:
# Load the processed shot table that the rest of the notebook works on.
DATA_DIR = Path("../data/processed")
df = pd.read_csv(DATA_DIR / "shots.csv")

df.head(), len(df)
# Binary target: 1 when the shot outcome is "Goal", otherwise 0.
# The vectorized comparison treats missing outcomes as "not a goal" (0),
# exactly like the previous row-wise lambda did.
df["is_goal"] = df["shot.outcome.name"].eq("Goal").astype(int)
df["is_goal"].value_counts(normalize=True)
# StatsBomb coordinate system: x=0 is the defensive end, x=120 the attacking end.
# Coordinates of the goal center: (120, 40).
goal_x, goal_y = 120, 40

# Offsets from each shot location to the goal center along both pitch axes.
dx_to_goal = goal_x - df["x"]
dy_to_goal = df["y"] - goal_y

# Straight-line (Euclidean) distance from the shot to the goal center.
df["distance"] = np.sqrt(dx_to_goal**2 + dy_to_goal**2)

# Angle calculation (simple two-dimensional angle): arctan2 of the absolute
# lateral offset over the longitudinal offset to the goal center.
df["angle"] = np.arctan2(dy_to_goal.abs(), dx_to_goal)

df[["x", "y", "distance", "angle"]].head()


# Pressure flag as 0/1, with missing values counted as "not under pressure".
# Going through the nullable "boolean" dtype avoids calling fillna on an
# object-dtype column, which relies on implicit downcasting that pandas 2.2
# deprecates (FutureWarning).
# NOTE(review): assumes the column holds True/False/missing values — confirm.
df["under_pressure"] = (
    df["under_pressure"].astype("boolean").fillna(False).astype(int)
)

# One-hot encode the body part; missing values get their own "NA" level.
# drop_first=True removes the first level, so it acts as the reference category.
df["body_foot"] = df["shot.body_part.name"].fillna("NA")
df = pd.get_dummies(df, columns=["body_foot"], drop_first=True)

df.head()


# Model inputs: the numeric features plus every body-part dummy column.
base_features = ["distance", "angle", "under_pressure"]
body_part_features = [col for col in df.columns if col.startswith("body_foot_")]
features = base_features + body_part_features

# Design matrix and binary goal target.
X = df[features]
y = df["is_goal"]

X.head(), y.head()


## Baseline model

In [None]:
# Hold out 20% of the rows for evaluation; the split is seeded for
# reproducibility and stratified on the target.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Baseline classifier: logistic regression with a raised iteration cap.
model = LogisticRegression(max_iter=2000)
model.fit(X_train, y_train)



## Predict + Brier Score

In [None]:
# Predicted class probabilities for the held-out rows; column 1 is the
# second class reported by the model (the goal class for a 0/1 target).
test_proba = model.predict_proba(X_test)
probs = test_proba[:, 1]

# Brier score of the predicted probabilities against the held-out labels.
brier = brier_score_loss(y_test, probs)
brier


## Predict your own xG

In [None]:
# Score every row with the fitted model. X also contains the rows used for
# training, so these values are in-sample for that part of the data.
df["xg_model"] = model.predict_proba(X)[:, 1]

out_path = DATA_DIR / "shots_with_xg.csv"
df.to_csv(out_path, index=False)

print("Saved:", out_path)


# Persist the fitted model next to the processed data. The path is built from
# DATA_DIR so it cannot drift from the other outputs.
model_path = DATA_DIR / "xg_model.pkl"
joblib.dump(model, model_path)
