In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report

# =========================
#  Load dataset
# =========================
df = pd.read_csv("defenders_5seasons_modelready.csv")


In [27]:


# -------------------------
# Encode tier_league for classification
# -------------------------
df["tier_league_class"] = pd.cut(
    df["tier_league"],
    bins=[-0.1, 0.16, 0.5, 0.8, 1.1],
    labels=[0, 1, 2, 3]
).astype(int)

# =========================
#  Tracking columns & target
# =========================
desired_tracking_cols = ['player', 'team', 'season', 'age', 'role', 'tier_league']
tracking_cols = [c for c in desired_tracking_cols if c in df.columns]

target_col = 'tier_league_class'
position_col = 'role'   

# Clean role values (CB / FB etc.)
df[position_col] = df[position_col].astype(str).str.strip().str.upper()

# =========================
#  Split into 3 datasets: All / CB / FB
# =========================
df_all = df.copy()
df_cb = df[df[position_col] == 'CB'].copy()
df_fb = df[df[position_col] == 'FB'].copy()

print("Rows - All:", len(df_all), "CB:", len(df_cb), "FB:", len(df_fb))

# =========================
#  Helper: prepare dataset 
# =========================
def prepare_dataset(df, tracking_cols, target_col):
    # Target (discrete classes)
    y = df[target_col].copy()

    # Features = everything except tracking + target
    X = df.drop(columns=tracking_cols + [target_col])

    # Tracking info (for later reference / joins)
    tracking_info = df[tracking_cols].copy()

    # Normalise features
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    X_scaled = pd.DataFrame(X_scaled, columns=X.columns, index=df.index)

    return X_scaled, y, tracking_info, scaler

# Prepare all three
X_all, y_all, track_all, scaler_all = prepare_dataset(df_all, tracking_cols, target_col)
X_cb,  y_cb,  track_cb,  scaler_cb  = prepare_dataset(df_cb,  tracking_cols, target_col)
X_fb,  y_fb,  track_fb,  scaler_fb  = prepare_dataset(df_fb,  tracking_cols, target_col)

# =========================
#  Save prepared datasets to CSV
# =========================
df_all_prepared = pd.concat([track_all, X_all, y_all.rename("tier_league_class")], axis=1)
df_cb_prepared  = pd.concat([track_cb,  X_cb,  y_cb.rename("tier_league_class")],  axis=1)
df_fb_prepared  = pd.concat([track_fb,  X_fb,  y_fb.rename("tier_league_class")],  axis=1)

df_all_prepared.to_csv("defenders_all_prepared.csv", index=False)
df_cb_prepared.to_csv("defenders_cb_prepared.csv", index=False)
df_fb_prepared.to_csv("defenders_fb_prepared.csv", index=False)

print("Saved:")
print(" - defenders_all_prepared.csv")
print(" - defenders_cb_prepared.csv")
print(" - defenders_fb_prepared.csv")

# =========================
#  Random Forest and metrics
# =========================
def train_random_forest(X, y, name="Model"):
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, stratify=y, random_state=42
    )

    rf = RandomForestClassifier(
        n_estimators=400,
        max_depth=None,
        min_samples_split=2,
        min_samples_leaf=1,
        class_weight="balanced",
        random_state=42,
        n_jobs=-1
    )

    rf.fit(X_train, y_train)
    y_pred = rf.predict(X_test)

    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average="macro")
    cm = confusion_matrix(y_test, y_pred)

    print(f"\n{name} RESULTS")
    print("-" * 40)
    print("Accuracy:", round(acc, 3))
    print("Macro F1:", round(f1, 3))
    print("\nConfusion Matrix:\n", cm)
    print("\nClassification Report:\n", classification_report(y_test, y_pred))

    return rf, (X_train, X_test, y_train, y_test)

# =========================
# Train models
# =========================
rf_all, (X_train_all, X_test_all, y_train_all, y_test_all) = \
    train_random_forest(X_all, y_all, name="Random Forest - All Defenders")

rf_cb, (X_train_cb, X_test_cb, y_train_cb, y_test_cb) = \
    train_random_forest(X_cb, y_cb, name="Random Forest - Centre Backs")

rf_fb, (X_train_fb, X_test_fb, y_train_fb, y_test_fb) = \
    train_random_forest(X_fb, y_fb, name="Random Forest - Full Backs")


Rows - All: 387 CB: 190 FB: 197
Saved:
 - defenders_all_prepared.csv
 - defenders_cb_prepared.csv
 - defenders_fb_prepared.csv

Random Forest - All Defenders RESULTS
----------------------------------------
Accuracy: 0.449
Macro F1: 0.373

Confusion Matrix:
 [[ 5  2 11  0]
 [ 3  1  5  3]
 [ 2  0 20  5]
 [ 1  0 11  9]]

Classification Report:
               precision    recall  f1-score   support

           0       0.45      0.28      0.34        18
           1       0.33      0.08      0.13        12
           2       0.43      0.74      0.54        27
           3       0.53      0.43      0.47        21

    accuracy                           0.45        78
   macro avg       0.44      0.38      0.37        78
weighted avg       0.45      0.45      0.41        78


Random Forest - Centre Backs RESULTS
----------------------------------------
Accuracy: 0.316
Macro F1: 0.255

Confusion Matrix:
 [[5 0 3 1]
 [4 0 3 0]
 [3 1 5 3]
 [4 0 4 2]]

Classification Report:
               preci

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [29]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier

def select_features(
    df,
    target_col,
    tracking_cols,
    corr_threshold=0.10,
    imp_threshold=0.02,
    redundancy_threshold=0.90,
    random_state=42
):
    """
    Automatic feature selection based on:
      - |correlation with target| >= corr_threshold  OR
      - RF feature importance >= imp_threshold
    and then removing highly-redundant features (|corr| between features >= redundancy_threshold)
    """

    
    numeric_cols = df.select_dtypes(include=["number"]).columns.tolist()

    
    candidate_features = [
        c for c in numeric_cols if c not in tracking_cols + [target_col]
    ]

    # ---- Correlation with target ----
    corr_with_target = df[candidate_features + [target_col]].corr()[target_col].drop(target_col)
    corr_keep = corr_with_target[abs(corr_with_target) >= corr_threshold].index.tolist()

    # ---- Random Forest importance ----
    X = df[candidate_features]
    y = df[target_col]

    rf = RandomForestClassifier(
        n_estimators=400,
        class_weight="balanced",
        random_state=random_state,
        n_jobs=-1
    )
    rf.fit(X, y)

    importances = pd.Series(rf.feature_importances_, index=candidate_features)
    imp_keep = importances[importances >= imp_threshold].index.tolist()

    
    initial_selected = sorted(list(set(corr_keep) | set(imp_keep)))

    # Remove redundant features 
    if len(initial_selected) == 0:
        raise ValueError("No features selected")

    corr_matrix = df[initial_selected].corr().abs()

    # greedy procedure
    selected_final = []
    for col in initial_selected:
        
        if all(corr_matrix.loc[col, kept] < redundancy_threshold for kept in selected_final):
            selected_final.append(col)

    
    selected_final = sorted(selected_final)

    return {
        "selected_features": selected_final,
        "corr_with_target": corr_with_target.sort_values(ascending=False),
        "rf_importances": importances.sort_values(ascending=False)
    }


In [31]:

# feature selection on ALL defenders 
fs_result = select_features(
    df=df,
    target_col=target_col,
    tracking_cols=tracking_cols,
    corr_threshold=0.10,
    imp_threshold=0.02,
    redundancy_threshold=0.90,
    random_state=42
)

selected_features = fs_result["selected_features"]

print("\n=== Final selected features ===")
for f in selected_features:
    print(f)





=== Final selected features (after all rules) ===
aerial_duels_won_per90
aerial_win_pct
assists_per90
att_90
att_pen
avg_pass_distance
carries_per90
carry_progressive_distance
clearances
defensive_actions_in_penalty_area
expected_assists
expected_goals_plus_assists
fouls_committed
goal_creating_actions_per90
goals_plus_assists_per90
has_goals_shot_event
has_goals_shot_on_target_event
interceptions
non_penalty_expected_goals
non_penalty_xg_per_shot
pass_completion_pct
passes_completed_into_penalty_area
passes_into_penalty_area
progressive_carries
progressive_passes
progressive_passes_received
recoveries_per90_extra
red_cards
shot_creating_actions_per90
shots_on_target_pct
shots_per90
tackle_success_pct
tackles_in_defensive_third
tackles_won
touches_per90


In [33]:
print("\nTop correlations with target:\n", fs_result["corr_with_target"].head(20))
print("\nTop RF feature importances:\n", fs_result["rf_importances"].head(20))


Top correlations with target:
 carries_per90                     0.364662
recoveries_per90                  0.358882
carry_progressive_distance        0.354252
completed_passes_per90            0.341681
touches_per90                     0.318845
att_pen                           0.315823
pass_completion_pct               0.256367
progressive_passes                0.203772
goals_plus_assists_per90          0.193972
assists_per90                     0.187246
progressive_carries               0.181344
live_ball_passes                  0.177320
goal_creating_actions_per90       0.172552
shots_per90                       0.165421
progressive_passes_received       0.163220
aerial_win_pct                    0.160419
non_penalty_expected_goals        0.159069
has_goals_shot_on_target_event    0.157196
expected_assists                  0.147811
successful_actions                0.146196
Name: tier_league_class, dtype: float64

Top RF feature importances:
 carries_per90                        0

In [35]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report

position_col = "role"
df[position_col] = df[position_col].astype(str).str.strip().str.upper()

# ========= Split into ALL / CB / FB =========
df_all = df.copy()
df_cb = df[df[position_col] == "CB"].copy()
df_fb = df[df[position_col] == "FB"].copy()

print("Rows - All:", len(df_all), "CB:", len(df_cb), "FB:", len(df_fb))

# ========= prepare dataset =========
def prepare_dataset(df_subset, tracking_cols, target_col, feature_list):
    # restrict features to those present in this subset
    feature_cols = [c for c in feature_list if c in df_subset.columns]

    y = df_subset[target_col].copy()
    X = df_subset[feature_cols].copy()

    tracking_info = df_subset[tracking_cols].copy()

    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    X_scaled = pd.DataFrame(X_scaled, columns=feature_cols, index=df_subset.index)

    return X_scaled, y, tracking_info, scaler

# Prepare all three datasets
X_all, y_all, track_all, scaler_all = prepare_dataset(df_all, tracking_cols, target_col, selected_features)
X_cb,  y_cb,  track_cb,  scaler_cb  = prepare_dataset(df_cb,  tracking_cols, target_col, selected_features)
X_fb,  y_fb,  track_fb,  scaler_fb  = prepare_dataset(df_fb,  tracking_cols, target_col, selected_features)


df_all_prepared = pd.concat([track_all, X_all, y_all.rename("tier_league_class")], axis=1)
df_cb_prepared  = pd.concat([track_cb,  X_cb,  y_cb.rename("tier_league_class")],  axis=1)
df_fb_prepared  = pd.concat([track_fb,  X_fb,  y_fb.rename("tier_league_class")],  axis=1)

df_all_prepared.to_csv("defenders_all_prepared_reduced_features.csv", index=False)
df_cb_prepared.to_csv("defenders_cb_prepared_reduced_features.csv", index=False)
df_fb_prepared.to_csv("defenders_fb_prepared_reduced_features.csv", index=False)

print("\nSaved reduced-feature datasets:")
print(" - defenders_all_prepared_reduced_features.csv")
print(" - defenders_cb_prepared_reduced_features.csv")
print(" - defenders_fb_prepared_reduced_features.csv")

# ========= RF training helper =========
def train_random_forest(X, y, name="Model"):
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, stratify=y, random_state=42
    )

    rf = RandomForestClassifier(
        n_estimators=400,
        max_depth=None,
        min_samples_split=2,
        min_samples_leaf=1,
        class_weight="balanced",
        random_state=42,
        n_jobs=-1
    )

    rf.fit(X_train, y_train)
    y_pred = rf.predict(X_test)

    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average="macro")
    cm = confusion_matrix(y_test, y_pred)

    print(f"\n{name} RESULTS")
    print("-" * 40)
    print("Accuracy:", round(acc, 3))
    print("Macro F1:", round(f1, 3))
    print("\nConfusion Matrix:\n", cm)
    print("\nClassification Report:\n", classification_report(y_test, y_pred))

    return rf, (X_train, X_test, y_train, y_test)

# ========= Train models on reduced-feature datasets =========
rf_all, _ = train_random_forest(X_all, y_all, name="Random Forest - All Defenders (reduced features)")
rf_cb,  _ = train_random_forest(X_cb,  y_cb,  name="Random Forest - Centre Backs (reduced features)")
rf_fb,  _ = train_random_forest(X_fb,  y_fb,  name="Random Forest - Full Backs (reduced features)")


Rows - All: 387 CB: 190 FB: 197

Saved reduced-feature datasets:
 - defenders_all_prepared_reduced_features.csv
 - defenders_cb_prepared_reduced_features.csv
 - defenders_fb_prepared_reduced_features.csv

Random Forest - All Defenders (reduced features) RESULTS
----------------------------------------
Accuracy: 0.474
Macro F1: 0.406

Confusion Matrix:
 [[ 7  1 10  0]
 [ 2  1  7  2]
 [ 1  0 20  6]
 [ 2  0 10  9]]

Classification Report:
               precision    recall  f1-score   support

           0       0.58      0.39      0.47        18
           1       0.50      0.08      0.14        12
           2       0.43      0.74      0.54        27
           3       0.53      0.43      0.47        21

    accuracy                           0.47        78
   macro avg       0.51      0.41      0.41        78
weighted avg       0.50      0.47      0.44        78


Random Forest - Centre Backs (reduced features) RESULTS
----------------------------------------
Accuracy: 0.316
Macro F1: 

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [37]:
pip install imbalanced-learn

Note: you may need to restart the kernel to use updated packages.


In [39]:
from imblearn.over_sampling import SMOTE


In [41]:
def train_random_forest_with_smote(X, y, name="Model"):
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, stratify=y, random_state=42
    )

    # Apply SMOTE only on training data
    sm = SMOTE(random_state=42, k_neighbors=3)
    X_train_res, y_train_res = sm.fit_resample(X_train, y_train)

    rf = RandomForestClassifier(
        n_estimators=400,
        max_depth=None,
        min_samples_split=2,
        min_samples_leaf=1,
        class_weight="balanced",
        random_state=42,
        n_jobs=-1
    )

    rf.fit(X_train_res, y_train_res)
    y_pred = rf.predict(X_test)

    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average="macro")
    cm = confusion_matrix(y_test, y_pred)

    print(f"\n{name} (SMOTE) RESULTS")
    print("-" * 40)
    print("Accuracy:", round(acc, 3))
    print("Macro F1:", round(f1, 3))
    print("\nConfusion Matrix:\n", cm)
    print("\nClassification Report:\n", classification_report(y_test, y_pred))

    return rf


# Train models on all 3 datasets with SMOTE
rf_all_smote = train_random_forest_with_smote(X_all, y_all, name="Random Forest - All Defenders")
rf_cb_smote  = train_random_forest_with_smote(X_cb,  y_cb,  name="Random Forest - Centre Backs")
rf_fb_smote  = train_random_forest_with_smote(X_fb,  y_fb,  name="Random Forest - Full Backs")


Random Forest - All Defenders (SMOTE) RESULTS
----------------------------------------
Accuracy: 0.423
Macro F1: 0.382

Confusion Matrix:
 [[ 6  4  8  0]
 [ 4  2  4  2]
 [ 5  0 16  6]
 [ 3  0  9  9]]

Classification Report:
               precision    recall  f1-score   support

           0       0.33      0.33      0.33        18
           1       0.33      0.17      0.22        12
           2       0.43      0.59      0.50        27
           3       0.53      0.43      0.47        21

    accuracy                           0.42        78
   macro avg       0.41      0.38      0.38        78
weighted avg       0.42      0.42      0.41        78


Random Forest - Centre Backs (SMOTE) RESULTS
----------------------------------------
Accuracy: 0.342
Macro F1: 0.3

Confusion Matrix:
 [[5 1 3 0]
 [5 0 2 0]
 [3 2 5 2]
 [4 1 2 3]]

Classification Report:
               precision    recall  f1-score   support

           0       0.29      0.56      0.38         9
           1       0.00

In [69]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report


# Helper

def load_prepared_dataset(path):
    df = pd.read_csv(path)
    
    # Tracking columns
    tracking_cols = [c for c in ["player", "team", "season", "age", "role", "tier_league"]
                     if c in df.columns]
    
    target_col = "tier_league_class"
    
    y = df[target_col].copy()
    X = df.drop(columns=tracking_cols + [target_col])
    
    return X, y


# Helper

def evaluate_sklearn_model(model, X, y, name="Model"):
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, stratify=y, random_state=42
    )
    
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average="macro")
    cm = confusion_matrix(y_test, y_pred)
    
    print(f"\n{name} RESULTS")
    print("-" * 40)
    print("Accuracy:", round(acc, 3))
    print("Macro F1:", round(f1, 3))
    print("\nConfusion Matrix:\n", cm)
    print("\nClassification Report:\n", classification_report(y_test, y_pred))


In [71]:
from tabpfn import TabPFNClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report

def evaluate_tabpfn(X, y, name="TabPFN"):
    # Train/test split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, stratify=y, random_state=42
    )

    # Convert to numpy
    X_train_np = X_train.to_numpy().astype("float32")
    X_test_np  = X_test.to_numpy().astype("float32")
    y_train_np = y_train.to_numpy()
    y_test_np  = y_test.to_numpy()

    clf = TabPFNClassifier(device="cpu")  

    clf.fit(X_train_np, y_train_np)

    
    y_pred = clf.predict(X_test_np)

    
    acc = accuracy_score(y_test_np, y_pred)
    f1 = f1_score(y_test_np, y_pred, average="macro")
    cm = confusion_matrix(y_test_np, y_pred)

    print(f"\n{name} RESULTS")
    print("-" * 40)
    print("Accuracy:", round(acc, 3))
    print("Macro F1:", round(f1, 3))
    print("\nConfusion Matrix:\n", cm)
    print("\nClassification Report:\n", classification_report(y_test_np, y_pred))



In [73]:
from xgboost import XGBClassifier

def build_xgb_classifier(num_classes=4):
    return XGBClassifier(
        n_estimators=400,
        max_depth=4,
        learning_rate=0.05,
        subsample=0.8,
        colsample_bytree=0.8,
        objective="multi:softprob",
        num_class=num_classes,
        eval_metric="mlogloss",
        tree_method="hist",
        random_state=42
    )


In [75]:
# -------- All defenders --------
X_all, y_all = load_prepared_dataset("defenders_all_prepared_reduced_features.csv")

evaluate_tabpfn(X_all, y_all, name="TabPFN - All Defenders")

xgb_all = build_xgb_classifier(num_classes=len(np.unique(y_all)))
evaluate_sklearn_model(xgb_all, X_all, y_all, name="XGBoost - All Defenders")

# -------- Centre backs --------
X_cb, y_cb = load_prepared_dataset("defenders_cb_prepared_reduced_features.csv")

evaluate_tabpfn(X_cb, y_cb, name="TabPFN - Centre Backs")

xgb_cb = build_xgb_classifier(num_classes=len(np.unique(y_cb)))
evaluate_sklearn_model(xgb_cb, X_cb, y_cb, name="XGBoost - Centre Backs")

# -------- Full backs --------
X_fb, y_fb = load_prepared_dataset("defenders_fb_prepared_reduced_features.csv")

evaluate_tabpfn(X_fb, y_fb, name="TabPFN - Full Backs")

xgb_fb = build_xgb_classifier(num_classes=len(np.unique(y_fb)))
evaluate_sklearn_model(xgb_fb, X_fb, y_fb, name="XGBoost - Full Backs")


Consider using a GPU or the tabpfn-client API: https://github.com/PriorLabs/tabpfn-client



TabPFN - All Defenders RESULTS
----------------------------------------
Accuracy: 0.436
Macro F1: 0.348

Confusion Matrix:
 [[ 8  0 10  0]
 [ 3  0  6  3]
 [ 3  0 18  6]
 [ 2  0 11  8]]

Classification Report:
               precision    recall  f1-score   support

           0       0.50      0.44      0.47        18
           1       0.00      0.00      0.00        12
           2       0.40      0.67      0.50        27
           3       0.47      0.38      0.42        21

    accuracy                           0.44        78
   macro avg       0.34      0.37      0.35        78
weighted avg       0.38      0.44      0.40        78



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



XGBoost - All Defenders RESULTS
----------------------------------------
Accuracy: 0.436
Macro F1: 0.374

Confusion Matrix:
 [[ 4  4  9  1]
 [ 2  2  4  4]
 [ 3  0 19  5]
 [ 3  0  9  9]]

Classification Report:
               precision    recall  f1-score   support

           0       0.33      0.22      0.27        18
           1       0.33      0.17      0.22        12
           2       0.46      0.70      0.56        27
           3       0.47      0.43      0.45        21

    accuracy                           0.44        78
   macro avg       0.40      0.38      0.37        78
weighted avg       0.42      0.44      0.41        78


TabPFN - Centre Backs RESULTS
----------------------------------------
Accuracy: 0.263
Macro F1: 0.212

Confusion Matrix:
 [[5 0 3 1]
 [3 0 4 0]
 [3 1 4 4]
 [2 0 7 1]]

Classification Report:
               precision    recall  f1-score   support

           0       0.38      0.56      0.45         9
           1       0.00      0.00      0.00       

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



XGBoost - Full Backs RESULTS
----------------------------------------
Accuracy: 0.475
Macro F1: 0.442

Confusion Matrix:
 [[3 1 5 0]
 [2 1 2 0]
 [5 0 7 3]
 [0 0 3 8]]

Classification Report:
               precision    recall  f1-score   support

           0       0.30      0.33      0.32         9
           1       0.50      0.20      0.29         5
           2       0.41      0.47      0.44        15
           3       0.73      0.73      0.73        11

    accuracy                           0.47        40
   macro avg       0.48      0.43      0.44        40
weighted avg       0.48      0.47      0.47        40



In [77]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report


In [79]:
pip install lightgbm xgboost mord


Collecting lightgbmNote: you may need to restart the kernel to use updated packages.

  Downloading lightgbm-4.6.0-py3-none-win_amd64.whl.metadata (17 kB)
Collecting mord
  Downloading mord-0.7.tar.gz (8.6 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Downloading lightgbm-4.6.0-py3-none-win_amd64.whl (1.5 MB)
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   ---------------------------------------- 1.5/1.5 MB 6.9 MB/s eta 0:00:00
Building wheels for collected packages: mord
  Building wheel for mord (setup.py): started
  Building wheel for mord (setup.py): finished with status 'done'
  Created wheel for mord: filename=mord-0.7-py3-none-any.whl size=9897 sha256=bb362925db0707d7a4ceba0d44033573ba140b681c01c56825eb8871d73ed158
  Stored in directory: c:\users\jsava\appdata\local\pip\cache\wheels\80\3e\3b\13f1adf346cad0fec675db328e4b0d814795c

In [109]:
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
import mord


In [111]:
def add_3class_labels(path_in, path_out):
    df = pd.read_csv(path_in)
    
    
    if "tier_league_class" not in df.columns:
        raise ValueError(f"'tier_league_class' not found in {path_in}")
    
    # Map 4-class -> 3-class
    mapping = {
        0: 0,  # poor
        1: 1,  # mid
        2: 1,  # mid
        3: 2   # top
    }
    df["tier_3class"] = df["tier_league_class"].map(mapping).astype(int)
    
    
    df.to_csv(path_out, index=False)
    print(f"Saved 3-class file to: {path_out}")
    return df

# All defenders
df_all_3 = add_3class_labels(
    "defenders_all_prepared_reduced_features.csv",
    "defenders_all_3class.csv"
)

# Centre backs
df_cb_3 = add_3class_labels(
    "defenders_cb_prepared_reduced_features.csv",
    "defenders_cb_3class.csv"
)

# Full backs
df_fb_3 = add_3class_labels(
    "defenders_fb_prepared_reduced_features.csv",
    "defenders_fb_3class.csv"
)


Saved 3-class file to: defenders_all_3class.csv
Saved 3-class file to: defenders_cb_3class.csv
Saved 3-class file to: defenders_fb_3class.csv


In [113]:
# Tracking columns
TRACKING_COLS = ["player", "team", "season", "age", "role", "tier_league", "tier_league_class"]

def load_3class_dataset(path, target_col="tier_3class"):
    df = pd.read_csv(path)
    
    # Keep tracking cols only if they exist
    tracking_cols = [c for c in TRACKING_COLS if c in df.columns]
    
    y = df[target_col].copy()
    X = df.drop(columns=tracking_cols + [target_col])
    
    return X, y, tracking_cols

def evaluate_model_clf(model, X, y, name="Model"):
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, stratify=y, random_state=42
    )
    
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average="macro")
    cm = confusion_matrix(y_test, y_pred)
    
    print(f"\n{name} RESULTS")
    print("-" * 40)
    print("Accuracy:", round(acc, 3))
    print("Macro F1:", round(f1, 3))
    print("\nConfusion Matrix:\n", cm)
    print("\nClassification Report:\n", classification_report(y_test, y_pred))
    
    return model, (X_train, X_test, y_train, y_test, y_pred)


In [119]:
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
import warnings


warnings.filterwarnings("ignore")

def evaluate_lgbm_clean(X, y, name="LightGBM"):
    print(f"\n{name} RESULTS")
    print("-" * 40)

    # Split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    # LightGBM
    model = LGBMClassifier(
        objective="multiclass",
        num_class=3,
        n_estimators=400,
        learning_rate=0.05,
        subsample=0.8,
        colsample_bytree=0.8,
        class_weight="balanced",
        random_state=42,
        verbose=-100,               
        importance_type="gain"
    )

    
    model.set_params(**{"callbacks": []})

    # Train
    model.fit(X_train, y_train)

    # Predict
    y_pred = model.predict(X_test)

    # Metrics
    acc = accuracy_score(y_test, y_pred)
    f1  = f1_score(y_test, y_pred, average="macro")
    cm  = confusion_matrix(y_test, y_pred)

    print("Accuracy:", round(acc, 3))
    print("Macro F1:", round(f1, 3))
    print("\nConfusion Matrix:\n", cm)
    print("\nClassification Report:\n", classification_report(y_test, y_pred))



In [121]:
X_all, y_all, _ = load_3class_dataset("defenders_all_3class.csv")
evaluate_lgbm_clean(X_all, y_all, name="LightGBM - All Defenders (3-class)")

X_cb, y_cb, _ = load_3class_dataset("defenders_cb_3class.csv")
evaluate_lgbm_clean(X_cb, y_cb, name="LightGBM - Centre Backs (3-class)")

X_fb, y_fb, _ = load_3class_dataset("defenders_fb_3class.csv")
evaluate_lgbm_clean(X_fb, y_fb, name="LightGBM - Full Backs (3-class)")




LightGBM - All Defenders (3-class) RESULTS
----------------------------------------
Accuracy: 0.538
Macro F1: 0.507

Confusion Matrix:
 [[ 8 10  0]
 [ 5 26  8]
 [ 2 11  8]]

Classification Report:
               precision    recall  f1-score   support

           0       0.53      0.44      0.48        18
           1       0.55      0.67      0.60        39
           2       0.50      0.38      0.43        21

    accuracy                           0.54        78
   macro avg       0.53      0.50      0.51        78
weighted avg       0.53      0.54      0.53        78


LightGBM - Centre Backs (3-class) RESULTS
----------------------------------------
Accuracy: 0.553
Macro F1: 0.528

Confusion Matrix:
 [[ 3  5  1]
 [ 1 12  6]
 [ 0  4  6]]

Classification Report:
               precision    recall  f1-score   support

           0       0.75      0.33      0.46         9
           1       0.57      0.63      0.60        19
           2       0.46      0.60      0.52        10

    

In [89]:
def evaluate_mord_ordinal(X, y, name="Ordinal LogisticAT"):
    # Mord expects numpy arrays
    X_np = X.to_numpy()
    y_np = y.to_numpy()
    
    X_train, X_test, y_train, y_test = train_test_split(
        X_np, y_np, test_size=0.2, stratify=y_np, random_state=42
    )
    
    model = mord.LogisticAT(alpha=1.0)  
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average="macro")
    cm = confusion_matrix(y_test, y_pred)
    
    print(f"\n{name} RESULTS")
    print("-" * 40)
    print("Accuracy:", round(acc, 3))
    print("Macro F1:", round(f1, 3))
    print("\nConfusion Matrix:\n", cm)
    print("\nClassification Report:\n", classification_report(y_test, y_pred))
    
    return model, (X_train, X_test, y_train, y_test, y_pred)


X_all, y_all, _ = load_3class_dataset("defenders_all_3class.csv")
ord_all, _ = evaluate_mord_ordinal(X_all, y_all, name="Ordinal LogisticAT - All Defenders (3-class)")

X_cb, y_cb, _ = load_3class_dataset("defenders_cb_3class.csv")
ord_cb, _ = evaluate_mord_ordinal(X_cb, y_cb, name="Ordinal LogisticAT - Centre Backs (3-class)")

X_fb, y_fb, _ = load_3class_dataset("defenders_fb_3class.csv")
ord_fb, _ = evaluate_mord_ordinal(X_fb, y_fb, name="Ordinal LogisticAT - Full Backs (3-class)")



Ordinal LogisticAT - All Defenders (3-class) RESULTS
----------------------------------------
Accuracy: 0.526
Macro F1: 0.425

Confusion Matrix:
 [[ 3 15  0]
 [ 4 32  3]
 [ 0 15  6]]

Classification Report:
               precision    recall  f1-score   support

           0       0.43      0.17      0.24        18
           1       0.52      0.82      0.63        39
           2       0.67      0.29      0.40        21

    accuracy                           0.53        78
   macro avg       0.54      0.42      0.42        78
weighted avg       0.54      0.53      0.48        78


Ordinal LogisticAT - Centre Backs (3-class) RESULTS
----------------------------------------
Accuracy: 0.579
Macro F1: 0.495

Confusion Matrix:
 [[ 2  6  1]
 [ 1 16  2]
 [ 0  6  4]]

Classification Report:
               precision    recall  f1-score   support

           0       0.67      0.22      0.33         9
           1       0.57      0.84      0.68        19
           2       0.57      0.40      

In [91]:
def build_xgb(num_classes=3):
    model = XGBClassifier(
        objective="multi:softprob",
        num_class=num_classes,
        eval_metric="mlogloss",
        tree_method="hist",
        random_state=42,
        use_label_encoder=False
    )
    return model


In [93]:
xgb_param_dist = {
    "n_estimators": [200, 400, 600],
    "max_depth": [3, 4, 5, 6],
    "learning_rate": [0.01, 0.05, 0.1],
    "subsample": [0.7, 0.8, 0.9, 1.0],
    "colsample_bytree": [0.7, 0.8, 0.9, 1.0],
    "min_child_weight": [1, 3, 5],
    "gamma": [0, 0.5, 1.0]
}


In [95]:
from scipy.stats import randint, uniform

def tune_xgb(X, y, name="XGBoost (tuned)", n_iter=25):
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, stratify=y, random_state=42
    )
    
    num_classes = len(np.unique(y))
    base_model = build_xgb(num_classes=num_classes)
    
    search = RandomizedSearchCV(
        estimator=base_model,
        param_distributions=xgb_param_dist,
        n_iter=n_iter,
        scoring="f1_macro",
        cv=3,
        verbose=1,
        n_jobs=-1,
        random_state=42
    )
    
    search.fit(X_train, y_train)
    best_model = search.best_estimator_
    
    print("\nBest parameters found:")
    print(search.best_params_)
    
    # Evaluate on hold-out test set
    y_pred = best_model.predict(X_test)
    
    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average="macro")
    cm = confusion_matrix(y_test, y_pred)
    
    print(f"\n{name} RESULTS (best model)")
    print("-" * 40)
    print("Accuracy:", round(acc, 3))
    print("Macro F1:", round(f1, 3))
    print("\nConfusion Matrix:\n", cm)
    print("\nClassification Report:\n", classification_report(y_test, y_pred))
    
    return best_model, search


In [97]:
# All defenders
X_all, y_all, _ = load_3class_dataset("defenders_all_3class.csv")
xgb_best_all, xgb_search_all = tune_xgb(X_all, y_all, name="XGBoost - All Defenders (3-class tuned)", n_iter=30)

# Centre backs
X_cb, y_cb, _ = load_3class_dataset("defenders_cb_3class.csv")
xgb_best_cb, xgb_search_cb = tune_xgb(X_cb, y_cb, name="XGBoost - Centre Backs (3-class tuned)", n_iter=20)

# Full backs
X_fb, y_fb, _ = load_3class_dataset("defenders_fb_3class.csv")
xgb_best_fb, xgb_search_fb = tune_xgb(X_fb, y_fb, name="XGBoost - Full Backs (3-class tuned)", n_iter=20)


Fitting 3 folds for each of 30 candidates, totalling 90 fits


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



Best parameters found:
{'subsample': 0.9, 'n_estimators': 200, 'min_child_weight': 1, 'max_depth': 6, 'learning_rate': 0.1, 'gamma': 1.0, 'colsample_bytree': 0.8}

XGBoost - All Defenders (3-class tuned) RESULTS (best model)
----------------------------------------
Accuracy: 0.423
Macro F1: 0.331

Confusion Matrix:
 [[ 2 16  0]
 [ 6 26  7]
 [ 2 14  5]]

Classification Report:
               precision    recall  f1-score   support

           0       0.20      0.11      0.14        18
           1       0.46      0.67      0.55        39
           2       0.42      0.24      0.30        21

    accuracy                           0.42        78
   macro avg       0.36      0.34      0.33        78
weighted avg       0.39      0.42      0.39        78

Fitting 3 folds for each of 20 candidates, totalling 60 fits


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



Best parameters found:
{'subsample': 0.8, 'n_estimators': 600, 'min_child_weight': 3, 'max_depth': 6, 'learning_rate': 0.05, 'gamma': 1.0, 'colsample_bytree': 0.8}

XGBoost - Centre Backs (3-class tuned) RESULTS (best model)
----------------------------------------
Accuracy: 0.711
Macro F1: 0.699

Confusion Matrix:
 [[ 5  4  0]
 [ 1 15  3]
 [ 0  3  7]]

Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.56      0.67         9
           1       0.68      0.79      0.73        19
           2       0.70      0.70      0.70        10

    accuracy                           0.71        38
   macro avg       0.74      0.68      0.70        38
weighted avg       0.72      0.71      0.71        38

Fitting 3 folds for each of 20 candidates, totalling 60 fits


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



Best parameters found:
{'subsample': 0.7, 'n_estimators': 600, 'min_child_weight': 5, 'max_depth': 3, 'learning_rate': 0.1, 'gamma': 1.0, 'colsample_bytree': 0.7}

XGBoost - Full Backs (3-class tuned) RESULTS (best model)
----------------------------------------
Accuracy: 0.525
Macro F1: 0.491

Confusion Matrix:
 [[ 2  7  0]
 [ 4 10  6]
 [ 0  2  9]]

Classification Report:
               precision    recall  f1-score   support

           0       0.33      0.22      0.27         9
           1       0.53      0.50      0.51        20
           2       0.60      0.82      0.69        11

    accuracy                           0.53        40
   macro avg       0.49      0.51      0.49        40
weighted avg       0.50      0.53      0.51        40

