<a href="https://colab.research.google.com/github/MZiaAfzal71/Average_Weighted_Path_Vector/blob/main/Data%20Files/Models/XGB_CV_HyperParam_Tuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Fetch the project repository, then make its "Data Files" folder the working
# directory so the relative paths used by the following cells resolve there.
!git clone https://github.com/MZiaAfzal71/Average_Weighted_Path_Vector.git
%cd Average_Weighted_Path_Vector/Data\ Files

Cloning into 'Average_Weighted_Path_Vector'...
remote: Enumerating objects: 718, done.[K
remote: Counting objects: 100% (133/133), done.[K
remote: Compressing objects: 100% (123/123), done.[K
remote: Total 718 (delta 84), reused 8 (delta 8), pack-reused 585 (from 1)[K
Receiving objects: 100% (718/718), 30.79 MiB | 23.96 MiB/s, done.
Resolving deltas: 100% (238/238), done.
/content/Average_Weighted_Path_Vector/Data Files


In [2]:
!pip install osfclient
import shutil
from osfclient.api import OSF
from subprocess import run
import os

# OSF project that hosts the descriptor archives. Replace with your own
# project ID: it is the last path segment of the project URL,
# e.g. "abcd3" in https://osf.io/abcd3/
project_id = "p5ga2"
osf = OSF()
project = osf.project(project_id)
store = project.storage("osfstorage")

# Only the "Descriptors Data" folder is needed; stop at the first match.
desc_folder = []
for fold in store.folders:
    if fold.path.strip("/") == "Descriptors Data":
        desc_folder.append(fold)
        break

if not desc_folder:
    # Make the problem visible here instead of letting later cells skip
    # every experiment because no data file exists.
    print(f"WARNING: folder 'Descriptors Data' not found in OSF project {project_id}")

# Download all files and keep folder structure
for folder in desc_folder:
    for f in folder.files:
        local_path = f.path.strip("/")            # remote path, reused locally
        local_dir = os.path.dirname(local_path)   # "" when the file is at the root
        if local_dir:
            os.makedirs(local_dir, exist_ok=True)
        with open(local_path, "wb") as out:
            f.write_to(out)
        if local_path.endswith(".zip"):
            # Extract next to the archive; fall back to the current directory
            # when the archive has no parent folder.
            extract_dir = local_dir or "."
            # Argument list instead of a shell string: paths containing spaces
            # or quotes are passed through verbatim. "-o" overwrites existing
            # files without prompting, so re-running the cell cannot block.
            result = run(["unzip", "-o", local_path, "-d", extract_dir])
            # Info-ZIP reports 0 for success and 1 for "completed with
            # warnings"; anything above that means the extraction failed.
            if result.returncode > 1:
                raise RuntimeError(
                    f"unzip failed for {local_path} (exit code {result.returncode})"
                )
            print(f"\nUnzipped {local_path} -> {extract_dir}")

Collecting osfclient
  Downloading osfclient-0.0.5-py2.py3-none-any.whl.metadata (5.5 kB)
Downloading osfclient-0.0.5-py2.py3-none-any.whl (39 kB)
Installing collected packages: osfclient
Successfully installed osfclient-0.0.5


100%|██████████| 23.8M/23.8M [00:00<00:00, 43.6Mbytes/s]



Unzipped Descriptors Data/Descriptors Data.zip -> Descriptors Data


In [3]:
# Requirements: xgboost, scikit-learn, numpy, pandas, joblib
import os
import random
import json
import joblib
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.model_selection import KFold, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from xgboost import XGBRegressor

# --------------------------------------------------------------------
# Config
# --------------------------------------------------------------------
RANDOM_STATE = 42
# Seed BOTH global generators. The hyperparameter candidates are drawn with
# Python's `random.choice`, which NumPy's seed does not affect; without the
# second call the random search picks different candidates on every run.
np.random.seed(RANDOM_STATE)
random.seed(RANDOM_STATE)

# -------------------------
# User configuration
# -------------------------
# Folder with the descriptor tables, one file per combination, named
# {property}_{descriptor}.parquet
DATA_DIR = Path("Descriptors Data")
# Destination for the fitted per-fold pipelines and the CSV/JSON result files.
OUT_DIR = Path("xgb_cv_results")
OUT_DIR.mkdir(parents=True, exist_ok=True)

# Filename convention: {property}_{descriptor}.parquet
# Columns are selected by POSITION, not by name: the 6th column (index 5) is
# the target property and the 10th column onward (index 9+) are descriptors.
def load_xy(property_name, descriptor_name, data_dir=DATA_DIR, target_col="target"):
    """
    Load one property/descriptor table and split it into features and target.

    Parameters
    ----------
    property_name, descriptor_name : str
        Combined into the file name ``{property_name}_{descriptor_name}.parquet``.
    data_dir : path-like
        Folder that contains the parquet files.
    target_col : str
        Currently unused: the target is taken by position (see above). Kept so
        existing calls that pass it continue to work.

    Returns
    -------
    X : ndarray
        Values of every column from index 9 onward.
    y : ndarray
        Values of the column at index 5.
    df : DataFrame
        The full table as read from disk.

    Raises
    ------
    FileNotFoundError
        If the expected parquet file does not exist.
    ValueError
        If the table has no descriptor columns (fewer than 10 columns).
    """
    target_idx = 5         # 6th column contains the target property
    first_desc_idx = 9     # descriptor columns start at the 10th column

    fn = Path(data_dir) / f"{property_name}_{descriptor_name}.parquet"
    if not fn.exists():
        raise FileNotFoundError(f"Expected file: {fn}")
    df = pd.read_parquet(fn)

    # Without this check a too-narrow table would yield an empty feature
    # matrix (or a bare IndexError) that only fails much later, inside the CV.
    if df.shape[1] <= first_desc_idx:
        raise ValueError(
            f"{fn} has {df.shape[1]} columns; expected at least {first_desc_idx + 1} "
            f"(target at position {target_idx + 1}, descriptors from position {first_desc_idx + 1})"
        )

    X = df.iloc[:, first_desc_idx:].values
    y = df.iloc[:, target_idx].values
    return X, y, df

# Discrete candidate values for the random search. Every key is an
# XGBRegressor constructor argument; extend the mapping to search more of them.
param_dist_xgb = dict(
    n_estimators=[100, 200, 400, 800],
    max_depth=[3, 5, 7, 9],
    learning_rate=[0.01, 0.03, 0.05, 0.1],
    subsample=[0.6, 0.8, 1.0],
    colsample_bytree=[0.6, 0.8, 1.0],
    reg_alpha=[0.0, 0.5, 1.0],
    reg_lambda=[1.0, 3.0, 5.0],
)

def sample_params(param_dist):
    """Draw one configuration: a random pick from each key's list of options.

    Picks are made with the global `random` module, one per key, in the
    mapping's iteration order.
    """
    chosen = {}
    for name, options in param_dist.items():
        chosen[name] = random.choice(options)
    return chosen

# --------------------------------------------------------------------
# Inner CV evaluator that uses early stopping
# --------------------------------------------------------------------
def evaluate_params_with_inner_cv(X, y, params,
                                  inner_splits=3, random_state=RANDOM_STATE,
                                  early_stopping_rounds=30):
    """
    Score a single hyperparameter configuration with shuffled k-fold CV.

    In every fold a StandardScaler is fitted on the training part only, an
    XGBRegressor is trained with early stopping monitored on the validation
    part (metric: MAE), and the MAE of its predictions on that same validation
    part is recorded.

    Returns a tuple ``(mean_mae, std_mae)`` computed over the folds.
    """
    splitter = KFold(n_splits=inner_splits, shuffle=True, random_state=random_state)
    fold_maes = []

    for fold_no, (fit_rows, holdout_rows) in enumerate(splitter.split(X, y), start=1):
        y_fit = y[fit_rows]
        y_holdout = y[holdout_rows]

        # Scaling statistics come from the training part only, so nothing
        # about the validation part leaks into the transform.
        scaler = StandardScaler()
        X_fit_scaled = scaler.fit_transform(X[fit_rows])
        X_holdout_scaled = scaler.transform(X[holdout_rows])

        # eval_metric and early_stopping_rounds are given to the constructor
        # rather than to fit(); the seed is shifted by the fold number.
        regressor = XGBRegressor(
            objective="reg:squarederror",
            n_jobs=-1,
            random_state=int(random_state + fold_no),
            eval_metric="mae",
            early_stopping_rounds=early_stopping_rounds,
            **params
        )

        # The validation part is the early-stopping monitor.
        regressor.fit(
            X_fit_scaled, y_fit,
            eval_set=[(X_holdout_scaled, y_holdout)],
            verbose=False
        )

        holdout_pred = regressor.predict(X_holdout_scaled)
        fold_maes.append(mean_absolute_error(y_holdout, holdout_pred))

    mean_mae = float(np.mean(fold_maes))
    std_mae = float(np.std(fold_maes))
    return mean_mae, std_mae

# --------------------------------------------------------------------
# Outer loop: nested CV with random search + early stopping
# --------------------------------------------------------------------
def nested_cv_xgb_earlystop(X, y,
                            outer_splits=5, inner_splits=3,
                            n_iter_search=15, random_state=RANDOM_STATE,
                            out_prefix="exp", early_stopping_rounds=30):
    """
    Runs nested CV:
      - outer KFold for evaluation
      - inner KFold for hyperparam search (random sampling) with early stopping

    For every outer fold, `n_iter_search` candidates are drawn from
    `param_dist_xgb` and scored by `evaluate_params_with_inner_cv`; the
    candidate with the lowest mean inner MAE is refitted and evaluated on the
    outer test fold. Candidate `i` is scored with inner splits seeded by
    `random_state + i`, so candidates are not compared on identical splits.

    Parameters:
      - X, y: feature matrix and target vector (indexable by integer arrays)
      - outer_splits / inner_splits: number of folds of the outer / inner KFold
      - n_iter_search: number of random candidates per outer fold (>= 1)
      - random_state: base seed for the splits and the models
      - out_prefix: file-name prefix of the saved per-fold pipelines
      - early_stopping_rounds: early-stopping patience used both in the inner
        search and in the final refit

    Returns:
      - df_folds: per-fold metrics and params
      - summary: aggregated metrics (plain Python floats)

    Raises:
      - ValueError: if n_iter_search < 1
      - RuntimeError: if no candidate yields a finite inner MAE in some fold

    Side effects: writes `{out_prefix}_fold{k}_pipeline.joblib` (scaler +
    model) into OUT_DIR for every outer fold and prints progress.
    """
    if n_iter_search < 1:
        # With zero candidates nothing would be selected and the refit below
        # would fail with an unrelated TypeError.
        raise ValueError(f"n_iter_search must be >= 1, got {n_iter_search}")

    outer_cv = KFold(n_splits=outer_splits, shuffle=True, random_state=random_state)
    results = []
    report_every = max(1, n_iter_search // 5)   # about five progress lines per fold

    for fold_idx, (train_idx, test_idx) in enumerate(outer_cv.split(X, y), start=1):
        print(f"\n--- Outer fold {fold_idx}/{outer_splits} ---")
        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        # Random search over param_dist_xgb
        best_score = float("inf")
        best_params = None
        best_std = None

        for i in range(n_iter_search):
            candidate = sample_params(param_dist_xgb)
            mean_mae, std_mae = evaluate_params_with_inner_cv(
                X_train, y_train, params=candidate,
                inner_splits=inner_splits,
                random_state=random_state + i,
                early_stopping_rounds=early_stopping_rounds
            )
            # lower MAE is better (a NaN score never compares as smaller)
            if mean_mae < best_score:
                best_score = mean_mae
                best_params = candidate
                best_std = std_mae

            if (i + 1) % report_every == 0:
                print(f"  Tried {i+1}/{n_iter_search} candidates, best MAE so far: {best_score:.4f}")

        if best_params is None:
            # Every score was NaN or infinite; fail with a clear message
            # instead of crashing while formatting None below.
            raise RuntimeError(
                f"No candidate produced a finite inner-CV MAE in outer fold {fold_idx}"
            )

        print(f"> Best inner MAE (avg) for fold {fold_idx}: {best_score:.4f} ± {best_std:.4f}")
        print("  Best params:", best_params)

        # Refit the best configuration on 85% of the outer training set; the
        # remaining 15% is held out solely as the early-stopping monitor, so
        # the outer test fold stays untouched until evaluation.
        X_tr_final, X_val_final, y_tr_final, y_val_final = train_test_split(
            X_train, y_train, test_size=0.15, random_state=random_state
        )

        scaler_final = StandardScaler()
        X_tr_f_s = scaler_final.fit_transform(X_tr_final)
        X_val_f_s = scaler_final.transform(X_val_final)
        X_test_s = scaler_final.transform(X_test)

        best_model = XGBRegressor(
            objective="reg:squarederror",
            n_jobs=-1,
            random_state=int(random_state + fold_idx),
            eval_metric="mae",
            early_stopping_rounds=early_stopping_rounds,
            **best_params
        )

        best_model.fit(
            X_tr_f_s, y_tr_final,
            eval_set=[(X_val_f_s, y_val_final)],
            verbose=False
        )

        y_test_pred = best_model.predict(X_test_s)
        mae = mean_absolute_error(y_test, y_test_pred)
        rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
        r2 = r2_score(y_test, y_test_pred)

        # feature importances (aligned with columns in X); best-effort only,
        # a failure here must not discard the fold's metrics
        try:
            fi = best_model.feature_importances_.tolist()
        except Exception:
            fi = None

        # Save scaler + model together
        joblib.dump({"scaler": scaler_final, "model": best_model},
                    OUT_DIR / f"{out_prefix}_fold{fold_idx}_pipeline.joblib")

        results.append({
            "fold": fold_idx,
            "mae": float(mae),
            "rmse": float(rmse),
            "r2": float(r2),
            "best_inner_mae": float(best_score),
            "best_inner_mae_std": float(best_std),
            "best_params": best_params,
            "feature_importances": fi
        })

    df_res = pd.DataFrame(results)
    # Plain floats keep the summary serialisable regardless of the NumPy
    # scalar type pandas returns.
    summary = {
        "mae_mean": float(df_res["mae"].mean()),
        "mae_std": float(df_res["mae"].std()),
        "rmse_mean": float(df_res["rmse"].mean()),
        "r2_mean": float(df_res["r2"].mean())
    }
    return df_res, summary



# -------------------------
# Sweep over all property/descriptor combos
# -------------------------

# Full property list, for reference: ["Log VP", "MP", "BP", "LogBCF", "LogS", "LogP"]
properties = ["LogP"]
descriptors = ["MACCS", "Morgan", "pwav"]

all_experiments = []
# Visit every (property, descriptor) pair, properties in the outer position.
for prop, desc in [(p, d) for p in properties for d in descriptors]:
    print(f">>> Running nested CV for {prop} | {desc}")
    try:
        X, y, raw_df = load_xy(prop, desc)
    except Exception as e:
        # A combination whose table cannot be loaded is reported and skipped.
        print(f"Skipping {prop}_{desc} because: {e}")
        continue

    out_prefix = f"{prop}_{desc}"
    per_fold_df, per_exp_summary = nested_cv_xgb_earlystop(
        X, y,
        outer_splits=5,
        inner_splits=3,
        n_iter_search=50,
        random_state=RANDOM_STATE,
        out_prefix=out_prefix,
    )

    # Persist the per-fold table and the aggregated metrics of this experiment.
    per_fold_df.to_csv(OUT_DIR / f"{out_prefix}_cv_folds.csv", index=False)
    with open(OUT_DIR / f"{out_prefix}_summary.json", "w") as summary_file:
        json.dump(per_exp_summary, summary_file, indent=2)

    # One flat row per experiment for the global table written below.
    all_experiments.append({
        "Property": prop,
        "Descriptor": desc,
        **{key: per_exp_summary[key]
           for key in ("mae_mean", "mae_std", "rmse_mean", "r2_mean")},
    })

# Save global table
pd.DataFrame(all_experiments).to_csv(OUT_DIR / "all_xgb_results_summary.csv", index=False)
print("All experiments finished. Results saved to", OUT_DIR)


>>> Running nested CV for LogP | MACCS

--- Outer fold 1/5 ---
  Tried 10/50 candidates, best MAE so far: 0.4899
  Tried 20/50 candidates, best MAE so far: 0.4861
  Tried 30/50 candidates, best MAE so far: 0.4861
  Tried 40/50 candidates, best MAE so far: 0.4861
  Tried 50/50 candidates, best MAE so far: 0.4861
> Best inner MAE (avg) for fold 1: 0.4861 ± 0.0072
  Best params: {'n_estimators': 800, 'max_depth': 9, 'learning_rate': 0.03, 'subsample': 0.6, 'colsample_bytree': 0.8, 'reg_alpha': 0.5, 'reg_lambda': 1.0}

--- Outer fold 2/5 ---
  Tried 10/50 candidates, best MAE so far: 0.4945
  Tried 20/50 candidates, best MAE so far: 0.4833
  Tried 30/50 candidates, best MAE so far: 0.4833
  Tried 40/50 candidates, best MAE so far: 0.4833
  Tried 50/50 candidates, best MAE so far: 0.4833
> Best inner MAE (avg) for fold 2: 0.4833 ± 0.0008
  Best params: {'n_estimators': 800, 'max_depth': 7, 'learning_rate': 0.1, 'subsample': 1.0, 'colsample_bytree': 1.0, 'reg_alpha': 0.5, 'reg_lambda': 3.0}


### Still in Progress!