In [19]:
# ================================================
# 📦 1. Load Dependencies and Define Paths
# ================================================
import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import StandardScaler
from interpret.glassbox import ExplainableBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
import joblib
from tqdm import tqdm

# Define paths
DATA_PATH = "../data/"
MODEL_PATH = "../models/saved/"
os.makedirs(MODEL_PATH, exist_ok=True)

In [20]:
# ================================================
# 📂 2. Load CSV Files
# ================================================
players = pd.read_csv(DATA_PATH + "players.csv")
transfers = pd.read_csv(DATA_PATH + "transfers.csv")
player_valuations = pd.read_csv(DATA_PATH + "player_valuations.csv")
appearances = pd.read_csv(DATA_PATH + "appearances.csv")
clubs = pd.read_csv(DATA_PATH + "clubs.csv")
competitions = pd.read_csv(DATA_PATH + "competitions.csv")
games = pd.read_csv(DATA_PATH + "games.csv")


In [21]:
# ================================================
# 🧹 3. Clean and Format Data
# ================================================
# Convert dates
players["date_of_birth"] = pd.to_datetime(players["date_of_birth"], errors="coerce")
players["contract_expiration_date"] = pd.to_datetime(players["contract_expiration_date"], errors="coerce")
transfers["transfer_date"] = pd.to_datetime(transfers["transfer_date"], errors="coerce")
player_valuations["date"] = pd.to_datetime(player_valuations["date"], errors="coerce")
appearances["date"] = pd.to_datetime(appearances["date"], errors="coerce")
games["date"] = pd.to_datetime(games["date"], errors="coerce")

# Add ref_date to each transfer (1 month before the transfer)
transfers["ref_date"] = transfers["transfer_date"] - pd.DateOffset(months=1)

In [22]:
# ================================================
# 🧠 4. Enrich Club Data (Historical Value, Squad Strength)
# ================================================
# Compute club total market value by year (last season available before transfer)
clubs["last_season"] = pd.to_numeric(clubs["last_season"], errors="coerce")
club_value_by_season = clubs.groupby(["club_id", "last_season"])["total_market_value"].mean().reset_index()

# Add league info to club
clubs = clubs.merge(competitions[["competition_id", "country_name"]],
                    left_on="domestic_competition_id",
                    right_on="competition_id", how="left")

In [None]:
# ================================================
# 🧹 5. Build Training Dataset with Player & Club Features
# ================================================
samples = []
print("\nBuilding dataset...")
for _, row in tqdm(transfers.iterrows(), total=len(transfers)):
    pid = row["player_id"]
    to_club = row["to_club_id"]
    ref_date = row["ref_date"]

    p = players[players["player_id"] == pid]
    if p.empty:
        continue
    p = p.iloc[0]

    val_hist = player_valuations[(player_valuations["player_id"] == pid) &
                                 (player_valuations["date"] < ref_date)]
    if val_hist.empty:
        continue

    # Stats
    val_now = val_hist.sort_values("date").iloc[-1]["market_value_in_eur"]
    val_mean = val_hist["market_value_in_eur"].mean()
    val_growth = (val_now - val_hist.iloc[0]["market_value_in_eur"]) / val_hist.iloc[0]["market_value_in_eur"] if val_hist.iloc[0]["market_value_in_eur"] > 0 else 0

    perf = appearances[(appearances["player_id"] == pid) & (appearances["date"] < ref_date)]
    goals = perf["goals"].sum()
    assists = perf["assists"].sum()
    mins = perf["minutes_played"].sum()
    matches = perf.shape[0]

    age = ref_date.year - p["date_of_birth"].year if pd.notnull(p["date_of_birth"]) else None
    height_in_cm = p["height_in_cm"]
    nationality = p["country_of_citizenship"]
    pos = p["position"]

    # Club info
    c = clubs[clubs["club_id"] == to_club]
    if c.empty:
        continue
    c = c.iloc[0]

    squad_val = c["total_market_value"]
    squad_size = c["squad_size"]
    avg_age = c["average_age"]
    foreign_pct = c["foreigners_percentage"]
    nat_players = c["national_team_players"]
    net_transfers = c["net_transfer_record"]
    club_country = c["country_name"]

    samples.append({
        "player_id": pid,
        "to_club_id": to_club,
        "age": age,
        "height_in_cm": height_in_cm,
        "nationality": nationality,
        "position": pos,
        "market_value_now": val_now,
        "market_value_mean": val_mean,
        "market_value_growth": val_growth,
        "goals": goals,
        "assists": assists,
        "minutes": mins,
        "matches": matches,
        "club_market_value": squad_val,
        "club_squad_size": squad_size,
        "club_avg_age": avg_age,
        "club_foreigners_pct": foreign_pct,
        "club_nat_players": nat_players,
        "club_net_transfer_record": net_transfers,
        "club_country": club_country
    })



Building dataset...


  2%|▏         | 1838/79556 [00:19<12:11, 106.23it/s]

In [11]:
# ================================================
# 🪼 6. Preprocess Dataset
# ================================================
data = pd.DataFrame(samples)

# Fill missing numeric values with 0
num_cols = [
    "age", "height_in_cm", "market_value_now", "market_value_mean", "market_value_growth",
    "goals", "assists", "minutes", "matches",
    "club_market_value", "club_squad_size", "club_avg_age",
    "club_foreigners_pct", "club_nat_players", "club_net_transfer_record"
]
data[num_cols] = data[num_cols].fillna(0)

# Fill missing categoricals with 'Unknown'
data["nationality"] = data["nationality"].fillna("Unknown")
data["position"] = data["position"].fillna("Unknown")
data["club_country"] = data["club_country"].fillna("Unknown")

# Remove rare classes with less than 2 samples
data_counts = data["to_club_id"].value_counts()
valid_clubs = data_counts[data_counts >= 2].index
data = data[data["to_club_id"].isin(valid_clubs)]

# Label encode target
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
data["club_target"] = le.fit_transform(data["to_club_id"])

# Select features
features = [
    "age", "height_in_cm", "market_value_now", "market_value_mean", "market_value_growth",
    "goals", "assists", "minutes", "matches",
    "club_market_value", "club_squad_size", "club_avg_age",
    "club_foreigners_pct", "club_nat_players", "club_net_transfer_record"
]

# Convert categorical to numeric via one-hot or label encoding
data = pd.get_dummies(data, columns=["nationality", "position", "club_country"], drop_first=True)

# Split
X = data.drop(columns=["player_id", "to_club_id", "club_target"])
y = data["club_target"]

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42, test_size=0.2)

ValueError: The least populated class in y has only 1 member, which is too few. The minimum number of groups for any class cannot be less than 2.

In [None]:
# ================================================
# 🤖 7. Train EBM Model
# ================================================
print("\nTraining Explainable Boosting Machine (EBM)...")
ebm = ExplainableBoostingClassifier(interactions=10, random_state=42)
ebm.fit(X_train, y_train)


In [None]:
# ================================================
# 📊 8. Evaluate Model
# ================================================
y_pred = ebm.predict(X_test)
print("\nClassification Report:")
print(classification_report(y_test, y_pred))
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))


In [None]:
# ================================================
# 💾 9. Save Model and Encoder
# ================================================
joblib.dump(ebm, MODEL_PATH + "ebm_club_model.pkl")
joblib.dump(le, MODEL_PATH + "ebm_club_label_encoder.pkl")
print("\nEBM model and label encoder saved successfully.")
