In [6]:
import re
import joblib
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, ParameterGrid, cross_val_score
from sklearn.metrics import mean_squared_error, r2_score

from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor

# Seed passed as random_state to train_test_split and to each estimator built in main().
RANDOM_STATE = 123
# Passed as test_size to train_test_split in main().
TEST_SIZE = 0.2
# Passed as cv to cross_val_score inside run_gridsearch().
CV = 3


def _sanitize_columns(df: pd.DataFrame) -> pd.DataFrame:
    cols = df.columns.astype(str)
    cols = [re.sub(r"[\[\]<>]", "", c) for c in cols]
    cols = [re.sub(r"\s+", "_", c).strip("_") for c in cols]
    df = df.copy()
    df.columns = cols
    return df


def load_xy(file_path: str, x_cols=(0, 20), y_col=20):
    """Read an Excel file and split it positionally into features and target.

    Parameters
    ----------
    file_path : str
        Path handed to ``pandas.read_excel``.
    x_cols : sequence
        ``(start, stop)`` positional column range of the features; used as
        ``iloc[:, start:stop]``, so ``stop`` is exclusive.
    y_col : int
        Positional index of the target column.

    Returns
    -------
    tuple
        ``(X, y)`` taken from the frame after its column labels have been
        passed through ``_sanitize_columns``.
    """
    table = _sanitize_columns(pd.read_excel(file_path))
    first, last = x_cols[0], x_cols[1]
    features = table.iloc[:, first:last]
    target = table.iloc[:, y_col]
    return features, target


def _progress_marks(total: int, step_pct: int = 10):
    marks = set()
    for p in range(step_pct, 101, step_pct):
        k = int(np.ceil(total * p / 100))
        marks.add(max(1, k))
    return marks


def run_gridsearch(model, X_train, y_train, param_grid: dict, tag: str):
    """Exhaustively search ``param_grid`` with cross-validation and refit the winner.

    Every combination from ``ParameterGrid(param_grid)`` is scored with
    ``cross_val_score`` (``cv=CV``, ``scoring="r2"``); the combination with the
    highest mean score is then refit on the full training data. Progress is
    printed at the iteration counts returned by ``_progress_marks``.

    Parameters
    ----------
    model
        Template estimator. A fresh instance of ``model.__class__`` is built
        for each candidate from ``model.get_params()`` overlaid with the
        candidate's parameters; ``model`` itself is never fitted.
    X_train, y_train
        Training data passed to ``cross_val_score`` and to the final ``fit``.
    param_grid : dict
        Grid specification accepted by ``ParameterGrid``.
    tag : str
        Label used as a prefix in progress and error messages.

    Returns
    -------
    tuple
        ``(best_estimator, best_params, best_mean_cv_r2)``.

    Raises
    ------
    ValueError
        If no candidate produced a score greater than ``-inf`` (for example
        an empty grid, or every mean CV score came back as NaN).
    """
    candidates = list(ParameterGrid(param_grid))
    total = len(candidates)
    marks = _progress_marks(total, step_pct=10)

    # The template's parameters do not change during the search; read them once.
    base_params = model.get_params()

    def _make_estimator(overrides):
        # Candidate values take precedence over the template's own settings.
        return model.__class__(**{**base_params, **overrides})

    best_score = -np.inf
    best_params = None

    for i, params in enumerate(candidates, start=1):
        score = cross_val_score(
            _make_estimator(params), X_train, y_train,
            cv=CV, scoring="r2", n_jobs=1
        ).mean()

        # A NaN mean never compares greater, so such candidates are skipped.
        if score > best_score:
            best_score = float(score)
            best_params = params

        if i in marks:
            pct = int(round(i / total * 100))
            print(f"[{tag}] {pct}% done ({i}/{total})")

    # Without this guard the refit below fails with an opaque
    # "'NoneType' object is not a mapping" TypeError.
    if best_params is None:
        raise ValueError(
            f"[{tag}] no parameter combination produced a valid CV score "
            f"({total} candidate(s) evaluated)"
        )

    best_est = _make_estimator(best_params)
    best_est.fit(X_train, y_train)
    return best_est, best_params, float(best_score)

def evaluate(model, X_test, y_test):
    """Score a fitted model on held-out data.

    Returns ``(rmse, r2)`` as plain Python floats, where ``rmse`` is the
    square root of ``mean_squared_error`` and ``r2`` comes from ``r2_score``.
    """
    predictions = model.predict(X_test)
    mse = mean_squared_error(y_test, predictions)
    return float(mse ** 0.5), float(r2_score(y_test, predictions))


def fit_single_stage(model, X_train, y_train, X_test, y_test, grid, tag: str):
    """Tune one model, score it on the test split, and persist it.

    Runs ``run_gridsearch`` on the training data, evaluates the refit
    estimator with ``evaluate``, dumps it with joblib to
    ``best_model_<tag>.pkl`` (relative to the working directory), prints a
    one-line summary, and returns a dict describing the run.
    """
    fitted, chosen_params, cv_r2 = run_gridsearch(model, X_train, y_train, grid, tag)
    test_rmse, test_r2 = evaluate(fitted, X_test, y_test)

    out_path = f"best_model_{tag}.pkl"
    joblib.dump(fitted, out_path)

    print(f"{tag}: Test_R2={test_r2:.4f}, Test_RMSE={test_rmse:.4f} | saved: {out_path}")

    # One summary record per model; the caller collects these into a table.
    return {
        "Model": tag,
        "Best_CV_R2": cv_r2,
        "Best_Params": str(chosen_params),
        "Test_RMSE": test_rmse,
        "Test_R2": test_r2,
        "Model_Path": out_path,
    }


def main(file_path: str = "/home/Seungtae/Research/251216_HEAs_revision/HEAs-ML_training.xlsx"):
    """Grid-search RF, GBR and XGB regressors and write a summary workbook.

    Parameters
    ----------
    file_path : str
        Excel file to load. The default is the original author's local path;
        pass a different path to run elsewhere without editing the code.

    The first 20 columns are used as features and column index 20 as the
    target. Each model is tuned via ``fit_single_stage`` and the per-model
    result rows are written to ``gridsearch_summary_single_stage.xlsx``.
    """
    X, y = load_xy(file_path, x_cols=(0, 20), y_col=20)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE
    )

    # 5 * 6 * 4 * 5 * 5 * 2 = 6,000 combinations
    rf_grid = {
        "n_estimators": [100, 200, 300, 500, 700],
        "max_depth": [10, 20, 30, 50, 70, None],
        "min_samples_split": [2, 5, 10, 15],
        "min_samples_leaf": [1, 2, 4, 6, 8],
        "max_features": ["sqrt", "log2", 0.2, 0.5, 0.8],
        "bootstrap": [True, False],
    }

    # 4^6 * 2 = 8,192 combinations
    gbr_grid = {
        "n_estimators": [100, 200, 300, 500],
        "learning_rate": [0.01, 0.05, 0.1, 0.2],
        "max_depth": [3, 4, 5, 6],
        "min_samples_split": [2, 5, 10, 15],
        "min_samples_leaf": [1, 2, 4, 6],
        "subsample": [0.7, 0.8, 0.9, 1.0],
        "max_features": ["sqrt", "log2"],
    }

    # 4 * 4 * 5 * 4 * 5 * 5 * 4 * 4 * 4 = 512,000 combinations
    xgb_grid = {
        "n_estimators": [100, 200, 300, 500],
        "learning_rate": [0.01, 0.05, 0.1, 0.2],
        "max_depth": [3, 4, 5, 6, 8],
        "min_child_weight": [1, 3, 5, 7],
        "subsample": [0.6, 0.7, 0.8, 0.9, 1.0],
        "colsample_bytree": [0.6, 0.7, 0.8, 0.9, 1.0],
        "gamma": [0, 0.1, 0.2, 0.3],
        "reg_alpha": [0, 0.01, 0.1, 1],
        "reg_lambda": [1, 1.5, 2, 3],
    }

    # (template estimator, grid, tag) for each model, processed in this order.
    experiments = [
        (RandomForestRegressor(random_state=RANDOM_STATE, n_jobs=1), rf_grid, "RF"),
        (GradientBoostingRegressor(random_state=RANDOM_STATE), gbr_grid, "GBR"),
        (
            XGBRegressor(objective="reg:squarederror", random_state=RANDOM_STATE, n_jobs=1),
            xgb_grid,
            "XGB",
        ),
    ]

    rows = [
        fit_single_stage(estimator, X_train, y_train, X_test, y_test, grid, tag)
        for estimator, grid, tag in experiments
    ]

    pd.DataFrame(rows).to_excel("gridsearch_summary_single_stage.xlsx", index=False)


# Entry point: run main() only when this code executes as `__main__`.
if __name__ == "__main__":
    main()

[RF] 10% done (600/6000)
[RF] 20% done (1200/6000)
[RF] 30% done (1800/6000)
[RF] 40% done (2400/6000)
[RF] 50% done (3000/6000)
[RF] 60% done (3600/6000)
[RF] 70% done (4200/6000)
[RF] 80% done (4800/6000)
[RF] 90% done (5400/6000)
[RF] 100% done (6000/6000)
RF: Test_R2=0.7865, Test_RMSE=233.2481 | saved: best_model_RF.pkl
[GBR] 10% done (820/8192)
[GBR] 20% done (1639/8192)
[GBR] 30% done (2458/8192)
[GBR] 40% done (3277/8192)
[GBR] 50% done (4096/8192)
[GBR] 60% done (4916/8192)
[GBR] 70% done (5735/8192)
[GBR] 80% done (6554/8192)
[GBR] 90% done (7373/8192)
[GBR] 100% done (8192/8192)
GBR: Test_R2=0.8538, Test_RMSE=192.9876 | saved: best_model_GBR.pkl
[XGB] 10% done (51200/512000)
[XGB] 20% done (102400/512000)
[XGB] 30% done (153600/512000)
[XGB] 40% done (204800/512000)
[XGB] 50% done (256000/512000)
[XGB] 60% done (307200/512000)
[XGB] 70% done (358400/512000)
[XGB] 80% done (409600/512000)
[XGB] 90% done (460800/512000)
[XGB] 100% done (512000/512000)
XGB: Test_R2=0.8173, Test_

In [1]:
import re
import joblib
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_percentage_error, mean_squared_error, r2_score

# Seed passed as random_state to train_test_split in main().
RANDOM_STATE = 123
# Passed as test_size to train_test_split in main().
TEST_SIZE = 0.2


def _sanitize_columns(df: pd.DataFrame) -> pd.DataFrame:
    cols = df.columns.astype(str)
    cols = [re.sub(r"[\[\]<>]", "", c) for c in cols]
    cols = [re.sub(r"\s+", "_", c).strip("_") for c in cols]
    out = df.copy()
    out.columns = cols
    return out


def load_xy(file_path: str, x_cols=(0, 20), y_col=20):
    """Load an Excel file and return ``(X, y)`` selected by column position.

    ``X`` is ``iloc[:, x_cols[0]:x_cols[1]]`` (stop exclusive) and ``y`` is
    the single column at position ``y_col``, both taken after the column
    labels have been passed through ``_sanitize_columns``.
    """
    sheet = pd.read_excel(file_path)
    sheet = _sanitize_columns(sheet)
    feature_slice = slice(x_cols[0], x_cols[1])
    return sheet.iloc[:, feature_slice], sheet.iloc[:, y_col]


def evaluate_metrics(model, X_test, y_test):
    """Compute test-set metrics for a fitted model.

    Returns ``(r2, rmse, mape)`` as plain floats, where ``mape`` is
    ``mean_absolute_percentage_error`` multiplied by 100 (i.e. in percent).
    """
    y_hat = model.predict(X_test)
    mse = mean_squared_error(y_test, y_hat)
    mape_fraction = mean_absolute_percentage_error(y_test, y_hat)
    return (
        float(r2_score(y_test, y_hat)),
        float(np.sqrt(mse)),
        float(mape_fraction * 100.0),
    )


def main(data_path: str = "/home/Seungtae/Research/251216_HEAs_revision/HEAs-ML_training.xlsx"):
    """Re-score the saved RF/GBR/XGB models on the held-out split.

    Parameters
    ----------
    data_path : str
        Excel file to load. The default is the original author's local path;
        pass a different path to run elsewhere without editing the code.

    The test split is regenerated with ``train_test_split`` using
    ``TEST_SIZE`` and ``RANDOM_STATE``. Each pickled model is loaded, scored
    with ``evaluate_metrics``, printed, and the rows are written to
    ``test_metrics_from_saved_models.xlsx``.
    """
    X, y = load_xy(data_path, x_cols=(0, 20), y_col=20)

    # NOTE(review): this only reproduces the training-time test set if the
    # data file, TEST_SIZE and RANDOM_STATE are unchanged since the models
    # were fitted -- confirm before trusting the numbers.
    _, X_test, _, y_test = train_test_split(
        X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE
    )

    model_paths = {
        "RF": "best_model_RF.pkl",
        "GBR": "best_model_GBR.pkl",
        "XGB": "best_model_XGB.pkl",
    }

    rows = []
    for name, path in model_paths.items():
        # joblib.load unpickles arbitrary objects; only load model files that
        # come from a trusted source.
        model = joblib.load(path)
        r2, rmse, mape = evaluate_metrics(model, X_test, y_test)

        rows.append({
            "Model": name,
            "Test_R2": r2,
            "Test_RMSE": rmse,
            "Test_MAPE(%)": mape,
            "Model_Path": path,
        })

        print(f"{name}: Test_R2={r2:.4f}, Test_RMSE={rmse:.4f}, Test_MAPE={mape:.2f}%")

    pd.DataFrame(rows).to_excel("test_metrics_from_saved_models.xlsx", index=False)


# Entry point: run main() only when this code executes as `__main__`.
if __name__ == "__main__":
    main()

RF: Test_R2=0.7865, Test_RMSE=233.2481, Test_MAPE=28.88%
GBR: Test_R2=0.8538, Test_RMSE=192.9876, Test_MAPE=23.62%
XGB: Test_R2=0.8173, Test_RMSE=215.7462, Test_MAPE=27.06%
