In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import LabelEncoder
from lightgbm import LGBMRegressor
import joblib

# Location of the goalkeeper dataset.
csv_path = "/content/sample_data/goalkeepers_with_saves_and_conceded.csv"

# -----------------------------
# Load + basic preprocessing
# -----------------------------
# Read the CSV and, in the same step, discard every row whose target
# (market value) is missing, since such rows cannot be used for
# supervised training or evaluation.
df = pd.read_csv(csv_path).dropna(subset=["market_value_in_eur"])

# Columns holding text categories that must become integers before modelling.
categorical_cols = [
    "name",
    "country_of_citizenship",
    "position",
    "foot",
    "current_club_domestic_competition_id",
    "current_club_name",
]

# One LabelEncoder per column, kept in `encoders` so the mapping from label
# to integer code stays available after this cell has run.
encoders = {col: LabelEncoder() for col in categorical_cols}
for col, encoder in encoders.items():
    # Cast to str first so every value in the column is a plain string
    # before the encoder sees it.
    as_text = df[col].astype(str)
    df[col] = encoder.fit_transform(as_text)

# Target vector and feature matrix (every remaining column is a feature).
y = df["market_value_in_eur"]
X = df.drop(columns=["market_value_in_eur"])

# 80/20 train/test split with a fixed seed so the partition is repeatable.
split_options = {"test_size": 0.2, "random_state": 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# -----------------------------
# Model Training (LightGBM)
# -----------------------------
# Hyper-parameters gathered in one place; max_depth=-1 leaves tree depth
# unconstrained.
lgbm_params = {
    "n_estimators": 500,
    "learning_rate": 0.05,
    "max_depth": -1,
    "random_state": 42,
}

# fit() returns the estimator itself, so construction and training chain.
model = LGBMRegressor(**lgbm_params).fit(X_train, y_train)

# Predictions on the held-out rows.
y_pred = model.predict(X_test)

# -----------------------------
# Evaluation
# -----------------------------
# RMSE is derived from the MSE; MAE and R² come straight from sklearn.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Emit the report one line at a time.
report_lines = (
    "Model Performance:",
    f"RMSE: {rmse:.2f}",
    f"MAE: {mae:.2f}",
    f"R²: {r2:.3f}",
)
for report_line in report_lines:
    print(report_line)

# -----------------------------
# Save Model
# -----------------------------
# The model was trained on the integer codes produced by the fitted
# LabelEncoders in `encoders`. Persist those encoders next to the model:
# without them the label-to-code mapping is lost when the kernel stops and
# the pickled model cannot be applied to new raw rows consistently.
# Paths are held in variables so the log lines always match what was written.
model_path = "/content/goalkeeper_market_value_model.pkl"
encoders_path = "/content/goalkeeper_market_value_encoders.pkl"

joblib.dump(model, model_path)
print(f"✅ Model saved at {model_path}")

joblib.dump(encoders, encoders_path)
print(f"✅ Encoders saved at {encoders_path}")

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000301 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 607
[LightGBM] [Info] Number of data points in the train set: 447, number of used features: 11
[LightGBM] [Info] Start training from score 3105257.270694
Model Performance:
RMSE: 5536620.04
MAE: 2995298.86
R²: 0.538
✅ Model saved at /content/goalkeeper_market_value_model.pkl


In [None]:
!pip install --quiet lightgbm xgboost catboost scikit-learn pandas numpy joblib
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, KFold, cross_val_predict
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.ensemble import RandomForestRegressor, StackingRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
import joblib
import warnings
warnings.filterwarnings("ignore")

# --- Load data and keep only rows that have a target value ---
csv_path = "/content/goalkeepers_with_saves_and_conceded.csv"

# Single method chain: read, drop rows without a market value, then renumber
# the index 0..n-1 so it is contiguous after the drop.
df = (
    pd.read_csv(csv_path)
    .dropna(subset=["market_value_in_eur"])
    .reset_index(drop=True)
)

# --- Feature engineering ---
# No longer used by the ratios below; the name is kept defined in case
# other cells still reference it.
eps = 1e-9

# Number of 90-minute match equivalents played. Missing or zero minutes
# become NaN so that every "per 90" ratio built from it is NaN (undefined)
# instead of a division by ~0.
df["matches_equivalent"] = (df["minutes_played"].fillna(0) / 90.0).replace(0, np.nan)
nineties = df["matches_equivalent"]

# per-90 stats.
# Previously these divided by (minutes / 90 + eps): a row with zero minutes
# but a non-zero count produced a value around 1e9, an extreme outlier that
# then went through StandardScaler. Dividing by the NaN-aware denominator
# leaves such rows as NaN for the median imputer in the numeric pipeline.
df["saves_per_90"] = df["saves"] / nineties
df["goals_conceded_per_90"] = df["goals_conceded"] / nineties
df["cards_per_90"] = (df["yellow_cards"].fillna(0) + df["red_cards"].fillna(0)) / nineties

# save rate = saves / shots faced (saves + goals conceded).
# With no shots faced the rate is undefined; keep it NaN rather than forcing
# it to 0, which would read as "saved nothing".
shots_faced = df["saves"].fillna(0) + df["goals_conceded"].fillna(0)
df["save_rate"] = df["saves"] / shots_faced.replace(0, np.nan)

# simple interactions (missing age / height are filled with the column median)
age_filled = df["age"].fillna(df["age"].median())
height_filled = df["height_in_cm"].fillna(df["height_in_cm"].median())
df["age_squared"] = age_filled ** 2
df["height_age"] = height_filled * age_filled

# Frequency encoding: replace each category by the share of rows that carry
# it. Missing values are grouped under the placeholder "NA". Columns that are
# absent from the frame are skipped.
for col in ("current_club_name", "country_of_citizenship", "name"):
    if col not in df.columns:
        continue
    filled = df[col].fillna("NA")
    share_by_value = filled.value_counts(normalize=True)
    df[f"{col}_freq"] = filled.map(share_by_value)

# Low-cardinality categoricals (ordinal-encoded later), kept only if present.
small_cat_cols = [name for name in ("position", "foot") if name in df.columns]

# Raw and engineered numeric columns we would like to use.
numeric_candidates = [
    "height_in_cm", "minutes_played", "age", "saves", "goals_conceded",
    "yellow_cards", "red_cards",
    "saves_per_90", "goals_conceded_per_90", "cards_per_90",
    "save_rate", "age_squared", "height_age",
]

# Every column whose name ends in "_freq" is treated as a frequency feature.
freq_cols = [name for name in df.columns if name.endswith("_freq")]

# Numeric features = candidates that actually exist, followed by the
# frequency-encoded columns.
numeric_cols = [name for name in numeric_candidates if name in df.columns] + freq_cols

# Final feature list: numeric block first, then the small categoricals.
FEATURES = [*numeric_cols, *small_cat_cols]

# --- Build X, y and the log1p-transformed target ---
y = df.loc[:, "market_value_in_eur"].astype(float).copy()
y_log = np.log1p(y)   # models are trained on this transformed target
X = df.loc[:, FEATURES].copy()

# Train / holdout split. Passing X, y and y_log together keeps the three
# aligned row-for-row in both partitions.
split_options = {"test_size": 0.20, "random_state": 42}
(
    X_train, X_test,
    y_train, y_test,
    y_train_log, y_test_log,
) = train_test_split(X, y, y_log, **split_options)

# --- Preprocessing pipelines ---
# Numeric columns: median-impute, then standardise.
numeric_steps = [
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
]
numeric_transformer = Pipeline(steps=numeric_steps)

# Categorical columns: fill gaps with the literal "NA", then map categories
# to integers; categories unseen during fit are encoded as -1.
categorical_steps = [
    ("imputer", SimpleImputer(strategy="constant", fill_value="NA")),
    ("ord", OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)),
]
categorical_transformer = Pipeline(steps=categorical_steps)

# Route each column group to its transformer; any other column is dropped.
numeric_inputs = [col for col in numeric_cols if col in X.columns]
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_inputs),
        ("cat", categorical_transformer, small_cat_cols),
    ],
    remainder="drop",
)

# Helper to create full pipeline per model
def make_pipeline(model):
    """Return a fresh preprocessing + model Pipeline for ``model``.

    Both the module-level ``preprocessor`` and ``model`` are cloned, so every
    pipeline returned here owns its own unfitted estimators. Without cloning,
    all pipelines would share one ColumnTransformer instance and the caller's
    model instance, and fitting any one pipeline would overwrite the fitted
    state seen through the others.

    Parameters
    ----------
    model : estimator
        A scikit-learn compatible regressor. It is used as a template and is
        not fitted or modified by the returned pipeline.

    Returns
    -------
    sklearn.pipeline.Pipeline
        Unfitted pipeline with steps ``"preproc"`` and ``"model"``.
    """
    # Imported locally because `clone` is not among the notebook's imports.
    from sklearn.base import clone

    return Pipeline([("preproc", clone(preprocessor)), ("model", clone(model))])

# --- Candidate regressors (reasonable default hyperparams) ---
# Built directly inside the registry, in the same order as before.
models = {
    "LightGBM": LGBMRegressor(
        n_estimators=1000, learning_rate=0.05, max_depth=-1, random_state=42
    ),
    "XGBoost": XGBRegressor(
        n_estimators=1000, learning_rate=0.05, max_depth=6,
        tree_method="hist", random_state=42, verbosity=0,
    ),
    "CatBoost": CatBoostRegressor(
        iterations=1000, learning_rate=0.05, depth=6, verbose=0, random_state=42
    ),
    "RandomForest": RandomForestRegressor(
        n_estimators=500, max_depth=12, random_state=42, n_jobs=-1
    ),
}

# Short aliases pointing at the same estimator objects as the registry.
lgb = models["LightGBM"]
xgb = models["XGBoost"]
cat = models["CatBoost"]
rf = models["RandomForest"]

# --- Evaluation helpers (we perform CV on the TRAINING SET) ---
def eval_cv_on_train(pipeline, X_tr, y_tr_original, y_tr_log, cv=5):
    """Score ``pipeline`` with shuffled K-fold CV, reporting on the original scale.

    Out-of-fold predictions are produced against the log-transformed target,
    mapped back with ``expm1`` and compared with the untransformed target.

    Parameters
    ----------
    pipeline : estimator
        Estimator handed to ``cross_val_predict``.
    X_tr : feature matrix of the training partition.
    y_tr_original : target on its original scale, used for the metrics.
    y_tr_log : log1p-transformed target, used for fitting.
    cv : int, default 5
        Number of KFold splits (shuffled, seeded with 42).

    Returns
    -------
    dict
        Keys ``"RMSE"``, ``"MAE"`` and ``"R2"``.
    """
    folds = KFold(n_splits=cv, shuffle=True, random_state=42)

    # Out-of-fold predictions, still on the log scale.
    oof_log = cross_val_predict(
        pipeline, X_tr, y_tr_log, cv=folds, n_jobs=-1, method="predict"
    )
    # Undo the log1p transform before scoring.
    oof_original = np.expm1(oof_log)

    metric_table = (
        ("RMSE", lambda truth, pred: np.sqrt(mean_squared_error(truth, pred))),
        ("MAE", mean_absolute_error),
        ("R2", r2_score),
    )
    return {label: score(y_tr_original, oof_original) for label, score in metric_table}

# Cross-validate every candidate on the training partition only.
print("=== Cross-validated performance (on TRAIN set) ===")
cv_results = {}
for name, model in models.items():
    print(f"\n-> {name} CV ...")
    res = eval_cv_on_train(make_pipeline(model), X_train, y_train, y_train_log, cv=5)
    cv_results[name] = res
    print("RMSE: {RMSE:.2f} | MAE: {MAE:.2f} | R2: {R2:.3f}".format(**res))

# --- Refit each candidate on all of TRAIN, then score it on the holdout ---
print("\n=== Holdout performance (trained on full TRAIN, tested on TEST) ===")
holdout_results, fitted_models = {}, {}
for name, model in models.items():
    print(f"\n-> Training {name} on full TRAIN ...")

    # Train against the log1p target; fit() hands back the pipeline itself.
    pipe = make_pipeline(model).fit(X_train, y_train_log)
    fitted_models[name] = pipe

    # Predict on the log scale, then map back with expm1 before scoring.
    y_pred_log = pipe.predict(X_test)
    y_pred = np.expm1(y_pred_log)

    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    holdout_results[name] = dict(RMSE=rmse, MAE=mae, R2=r2)
    print(f"Holdout -> RMSE: {rmse:.2f} | MAE: {mae:.2f} | R2: {r2:.3f}")

# --- Stacking ensemble built from the three best models by CV RMSE ---
# Rank ascending by RMSE (lower is better) and keep the first three.
ranked_by_rmse = sorted(cv_results.items(), key=lambda entry: entry[1]["RMSE"])
top_by_cv = ranked_by_rmse[:3]

# StackingRegressor takes (name, estimator) pairs; the estimators come from
# the `models` registry.
top_models = [(model_name, models[model_name]) for model_name, _scores in top_by_cv]

print("\n=== Stacking ensemble (top 3 from CV) ===")

# A LightGBM meta-learner combines the base predictions; passthrough=False
# means it sees only those predictions, not the original features.
meta_learner = LGBMRegressor(n_estimators=500, learning_rate=0.05, random_state=42)
stack = StackingRegressor(
    estimators=top_models,
    final_estimator=meta_learner,
    n_jobs=-1,
    passthrough=False,
)

# Same preprocessing front-end as the single-model pipelines.
stack_pipe = Pipeline(steps=[("preproc", preprocessor), ("stack", stack)])

# Cross-validated score of the stack on TRAIN.
stack_cv = eval_cv_on_train(stack_pipe, X_train, y_train, y_train_log, cv=5)
print("Stack CV -> RMSE: {RMSE:.2f} | MAE: {MAE:.2f} | R2: {R2:.3f}".format(**stack_cv))

# Refit on all of TRAIN (log1p target) and predict the holdout in one chain,
# then map predictions back to the original scale.
y_pred_log = stack_pipe.fit(X_train, y_train_log).predict(X_test)
y_pred = np.expm1(y_pred_log)

stack_mse = mean_squared_error(y_test, y_pred)
stack_rmse = np.sqrt(stack_mse)
stack_mae = mean_absolute_error(y_test, y_pred)
stack_r2 = r2_score(y_test, y_pred)
print(f"Stack Holdout -> RMSE: {stack_rmse:.2f} | MAE: {stack_mae:.2f} | R2: {stack_r2:.3f}")

# Persist the fitted stacking pipeline.
# Note: stack_pipe was fit on the log1p-transformed target, so predictions
# from the reloaded pipeline are on the log scale and need np.expm1 to be
# turned back into market values.
stack_model_path = "/content/goalkeeper_market_value_stack.pkl"
joblib.dump(stack_pipe, stack_model_path)
print(f"\n✅ Final stacked model saved to {stack_model_path}")

=== Cross-validated performance (on TRAIN set) ===

-> LightGBM CV ...
RMSE: 5906760.28 | MAE: 2512756.82 | R2: 0.161

-> XGBoost CV ...
RMSE: 5954560.10 | MAE: 2504153.78 | R2: 0.148

-> CatBoost CV ...
RMSE: 5651085.50 | MAE: 2337418.16 | R2: 0.232

-> RandomForest CV ...
RMSE: 5318142.48 | MAE: 2262645.99 | R2: 0.320

=== Holdout performance (trained on full TRAIN, tested on TEST) ===

-> Training LightGBM on full TRAIN ...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000090 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 969
[LightGBM] [Info] Number of data points in the train set: 447, number of used features: 15
[LightGBM] [Info] Start training from score 13.622088
Holdout -> RMSE: 6372165.98 | MAE: 2821530.00 | R2: 0.389

-> Training XGBoost on full TRAIN ...
Holdout -> RMSE: 7330815.61 | MAE: 3113401.38 | R2: 0.191

-> Training CatBoost on full TRAIN ...
Holdout -> RMSE: 7224001.96 | MAE: