In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, root_mean_squared_error
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
from lightgbm import LGBMRegressor
from sklearn.ensemble import RandomForestRegressor, StackingRegressor



# --- configuration ---
DATA_DIR = Path("../data/processed")
POSITIONS = ["GK", "DEF", "MID", "FWD"]
GW_CUTOFF = 34   # train on gameweeks <= GW_CUTOFF, test on the later ones
TARGET = "gw_points"
# Identifier / bookkeeping columns that are dropped before modelling.
ID_COLS = ["element", "round", "name"]

# Test-set metrics per position: {position: {"rmse": ..., "mae": ..., "r2": ...}}
results = {}

for pos in POSITIONS:
    # 1) load the pre-computed feature table for this position
    df = pd.read_parquet(DATA_DIR / f"features_2025_{pos}.parquet")

    # 2) define X, y
    y = df[TARGET]
    X = df.drop(columns=ID_COLS + [TARGET])

    # 3) time-aware split: everything up to GW_CUTOFF trains, the rest tests
    train_mask = df["round"] <= GW_CUTOFF
    X_train, X_test = X[train_mask], X[~train_mask]
    y_train, y_test = y[train_mask], y[~train_mask]
    if X_train.empty or X_test.empty:
        # Fail with a readable message instead of an opaque estimator error.
        raise ValueError(
            f"{pos}: split at gameweek {GW_CUTOFF} gives "
            f"{len(X_train)} train rows and {len(X_test)} test rows"
        )

    # 4) fit model
    model = GradientBoostingRegressor(
        n_estimators=200, learning_rate=0.05, max_depth=4, random_state=42
    )
    model.fit(X_train, y_train)

    # 5) predict & evaluate on the held-out gameweeks
    y_pred = model.predict(X_test)
    # float(...) keeps the results dict homogeneous (np.sqrt returns np.float64).
    rmse = float(np.sqrt(mean_squared_error(y_test, y_pred)))
    mae  = mean_absolute_error(y_test, y_pred)
    r2   = r2_score(y_test, y_pred)

    results[pos] = {"rmse": rmse, "mae": mae, "r2": r2}
    print(f"{pos} → RMSE: {rmse:.3f}, MAE: {mae:.3f}, R²: {r2:.3f}")


GK → RMSE: 1.516, MAE: 0.680, R²: 0.366
DEF → RMSE: 1.962, MAE: 1.047, R²: 0.172
MID → RMSE: 1.746, MAE: 0.948, R²: 0.351
FWD → RMSE: 2.233, MAE: 1.195, R²: 0.126

Position-wise performance:
GK: {'rmse': np.float64(1.5163308747195625), 'mae': 0.6799764721671734, 'r2': 0.36599179315228236}
DEF: {'rmse': np.float64(1.9619755296035113), 'mae': 1.0469233569358738, 'r2': 0.1724611318032715}
MID: {'rmse': np.float64(1.746071132450463), 'mae': 0.9478689364157643, 'r2': 0.35131150540699885}
FWD: {'rmse': np.float64(2.2327604406753943), 'mae': 1.1948905466947675, 'r2': 0.12602204996912347}


In [13]:
import pandas as pd
import numpy as np
from pathlib import Path

import lightgbm as lgb
from lightgbm import LGBMRegressor

from sklearn.model_selection import TimeSeriesSplit, GridSearchCV, KFold
from sklearn.ensemble import RandomForestRegressor, StackingRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# --- 0) load your position-specific data ---
DATA_DIR  = Path("../data/processed")
GW_CUTOFF = 34   # last gameweek used for training; later gameweeks form the test set

# TimeSeriesSplit (used for the grid search) splits by row position, so the
# rows must be in chronological order. The on-disk order of the parquet file is
# not visible from here, hence the explicit sort; a stable sort keeps the file's
# original order within each gameweek.
df = (
    pd.read_parquet(DATA_DIR / "features_2025_GK.parquet")
      .sort_values("round", kind="stable")
)

target   = "gw_points"
drop_ids = ["element", "round", "name"]   # identifiers, not model inputs
X        = df.drop(columns=drop_ids + [target])
y        = df[target]

# --- 1) train/test split by gameweek ---
train_mask = df["round"] <= GW_CUTOFF
X_train     = X.loc[train_mask]
X_test      = X.loc[~train_mask]
y_train     = y.loc[train_mask]
y_test      = y.loc[~train_mask]

# Cheap invariants: the split must be a partition with both sides populated.
assert len(X_train) + len(X_test) == len(df)
assert len(X_train) > 0 and len(X_test) > 0, (
    f"split at gameweek {GW_CUTOFF} left one side empty"
)

# --- 2) hyperparameter tuning via GridSearchCV ---
param_grid = {
    "n_estimators":  [100, 200],
    "max_depth":     [3, 5, 7],
    "learning_rate": [0.01, 0.05],
    "subsample":     [0.6, 0.8, 1.0],
    # LightGBM only applies `subsample` (bagging_fraction) when bagging is
    # switched on via subsample_freq (bagging_freq) > 0. With the default of 0
    # the three `subsample` values train identical models and triple the cost.
    # Keeping it in the grid means it is carried along in grid.best_params_.
    "subsample_freq": [1],
}

# NOTE: TimeSeriesSplit folds by row position, so X_train is expected to be in
# chronological order for these folds to be forward-looking.
tscv = TimeSeriesSplit(n_splits=5)

# Single-threaded estimator: parallelism comes from GridSearchCV(n_jobs=-1).
# Using all cores at both levels oversubscribes the CPU and slows the search.
lgb_base = LGBMRegressor(random_state=42, n_jobs=1)

grid = GridSearchCV(
    estimator = lgb_base,
    param_grid= param_grid,
    cv        = tscv,
    scoring   = "neg_root_mean_squared_error",
    n_jobs    = -1
)

# note: we do NOT pass `verbose` or `early_stopping_rounds` here
grid.fit(X_train, y_train)

print("Best LGBM params:", grid.best_params_)
# The scorer is negated RMSE, so flip the sign back for reporting.
print("CV RMSE:", -grid.best_score_)

# --- 3) re-fit best model with early stopping via callbacks ---
# Early stopping picks the number of trees from the eval_set. That set must not
# be the test set, otherwise the test metrics below are tuned on the very data
# they are reported on. The last few *training* gameweeks are held out instead.
N_VAL_ROUNDS = 4   # number of final training gameweeks used for early stopping

# Row-aligned with X_train / y_train (same mask applied to the same frame).
train_rounds    = df.loc[train_mask, "round"].to_numpy()
distinct_rounds = np.unique(train_rounds)   # sorted ascending
if len(distinct_rounds) <= N_VAL_ROUNDS:
    raise ValueError(
        f"need more than {N_VAL_ROUNDS} training gameweeks to hold out a "
        f"validation window, got {len(distinct_rounds)}"
    )
is_val = np.isin(train_rounds, distinct_rounds[-N_VAL_ROUNDS:])

X_fit, y_fit = X_train.loc[~is_val], y_train.loc[~is_val]
X_val, y_val = X_train.loc[is_val],  y_train.loc[is_val]

best_lgb = LGBMRegressor(**grid.best_params_, random_state=42, n_jobs=-1)
best_lgb.fit(
    X_fit, y_fit,
    eval_set=[(X_val, y_val)],
    eval_metric="rmse",
    callbacks=[
        lgb.early_stopping(stopping_rounds=20, verbose=False),
        lgb.log_evaluation(0)   # period 0 -> no per-iteration logging
    ]
)

# The test gameweeks are touched only here, for the final report.
y_pred_lgb = best_lgb.predict(X_test)
print("\n--- LGBM Performance on Test ---")
# `squared=False` was deprecated in scikit-learn 1.4 and removed in 1.6;
# taking the square root of the MSE works on every version.
print("RMSE:", np.sqrt(mean_squared_error(y_test, y_pred_lgb)))
print("MAE :", mean_absolute_error(y_test, y_pred_lgb))
print("R²  :", r2_score(y_test, y_pred_lgb))

# --- 4) stacking ensemble (now with a simple KFold) ---
# Base learners: the tuned LightGBM configuration plus a shallow random forest;
# a small LightGBM model combines their out-of-fold predictions.
rf_base = RandomForestRegressor(n_estimators=100, max_depth=5,
                                random_state=42, n_jobs=-1)

# NOTE(review): contiguous (unshuffled) KFold still lets most folds train on
# rows that come after the ones they predict - confirm this is acceptable for
# gameweek-ordered data.
stack = StackingRegressor(
    estimators=[("lgb", best_lgb), ("rf", rf_base)],
    final_estimator=LGBMRegressor(n_estimators=50, random_state=42),
    cv=KFold(n_splits=5, shuffle=False),
    n_jobs=-1
)

stack.fit(X_train, y_train)
y_pred_stack = stack.predict(X_test)

print("\n--- Stacking Ensemble on Test ---")
# `squared=False` was deprecated in scikit-learn 1.4 and removed in 1.6;
# taking the square root of the MSE works on every version.
print("RMSE:", np.sqrt(mean_squared_error(y_test, y_pred_stack)))
print("MAE :", mean_absolute_error(y_test, y_pred_stack))
print("R²  :", r2_score(y_test, y_pred_stack))

# --- 5) feature-importance pruning and re-train ---
# Rank features by the fitted LightGBM model's importances and keep the top 90%.
feat_imp    = pd.Series(best_lgb.feature_importances_, index=X_train.columns)
# max(1, ...) so a very small feature set never rounds down to zero columns.
n_keep      = max(1, int(len(feat_imp) * 0.9))
top_feats   = feat_imp.nlargest(n_keep).index.tolist()

X_tr_pruned = X_train[top_feats]
X_te_pruned = X_test[top_feats]

# Same ensemble layout as the unpruned stack, fitted on the reduced columns.
stack_pr = StackingRegressor(
    estimators=[("lgb", best_lgb), ("rf", rf_base)],
    final_estimator=LGBMRegressor(n_estimators=50, random_state=42),
    cv=KFold(n_splits=5, shuffle=False),
    n_jobs=-1
)

stack_pr.fit(X_tr_pruned, y_train)
y_pred_pr = stack_pr.predict(X_te_pruned)

print("\n--- After Pruning ---")
print("Features retained:", len(top_feats))
# `squared=False` was deprecated in scikit-learn 1.4 and removed in 1.6;
# taking the square root of the MSE works on every version.
print("RMSE:", np.sqrt(mean_squared_error(y_test, y_pred_pr)))
print("MAE :", mean_absolute_error(y_test, y_pred_pr))
print("R²  :", r2_score(y_test, y_pred_pr))

# --- extra diagnostics ---
# Quick sanity checks on the early-stopped LightGBM fit: where it stopped,
# how many trees the booster holds, and whether any feature was used at all.
stopped_at = best_lgb.best_iteration_
print("Best iteration:", stopped_at)

tree_count = best_lgb.booster_.num_trees()
print("Trees in booster:", tree_count)

has_signal = np.any(best_lgb.feature_importances_ > 0)
print("Any non-zero importances?", has_signal)


KeyboardInterrupt: 