# Import libraries

In [30]:
# Turn warnings off to keep notebook tidy
import warnings
warnings.filterwarnings("ignore")

import os
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from xgboost import XGBRegressor
from sklearn.metrics import r2_score,mean_absolute_error, roc_curve, auc
from sklearn.inspection import permutation_importance

import json

# Load data

In [34]:
# Load the composition table; the first CSV column becomes the row index.
csv_path = 'XGB_dataset/Dataset_5refs_11 metals_0.5_PBA_composition.csv'
PBA_com = pd.read_csv(csv_path, header=0, index_col=0)

Metal_list = [
    'Al', 'Sc', 'V', 'Cr', 'Mn', 'Fe(II)', 'Fe(III)', 'Co',
    'Cu', 'Zn', 'In', 'MA-Fe', 'MA-Co',
]

# For every metal: order the rows by that metal's column (largest first) and
# summarise c_2py over the 33 top-ranked rows.
# PBA_com is rebound on each pass, so after the loop it is left sorted by the
# last entry of Metal_list.
for metal in Metal_list:
    PBA_com = PBA_com.sort_values(by=metal, ascending=False)
    top_c2py_stats = PBA_com['c_2py'].head(33).describe()
    print(f"{metal}: mean: {top_c2py_stats['mean']}, std: {top_c2py_stats['std']}")


Al: mean: 0.3251750994545454, std: 0.05791431267802083
Sc: mean: 0.3215722927272727, std: 0.05447818763330244
V: mean: 0.3044110432424242, std: 0.041533513973753565
Cr: mean: 0.31996192803030304, std: 0.045890349403669324
Mn: mean: 0.315770957, std: 0.0658768230772799
Fe(II): mean: 0.3298596122424242, std: 0.04979768116971007
Fe(III): mean: 0.31777318072727273, std: 0.045980335724744845
Co: mean: 0.31601287966666664, std: 0.05008343835462589
Cu: mean: 0.23114596315151514, std: 0.05797845173199152
Zn: mean: 0.3326100976363636, std: 0.08241446304184648
In: mean: 0.3414226542727272, std: 0.0470397172255783
MA-Fe: mean: 0.2885520709393939, std: 0.05514751859026824
MA-Co: mean: 0.3388971347878788, std: 0.07129906907093884


In [37]:
#PBA_com.head()

Unnamed: 0,Al,Sc,V,Cr,Mn,Fe(II),Fe(III),Co,Cu,Zn,In,MA-Fe,MA-Co,c_2py,c_epox,c_uf1,c_uf3,c_uf4
161,0,0,0,50,0,0,0,0,0,50,0,0,100,0.40245,0.0,0.0,0.043919,0.0
192,50,0,0,0,0,50,0,0,0,0,0,0,100,0.370182,0.0,0.0,0.048867,0.0
182,0,50,0,0,0,50,0,0,0,0,0,0,100,0.37895,0.0,0.0,0.053144,0.0
137,0,0,0,0,0,0,0,0,100,0,0,0,100,0.194784,0.34572,0.0,0.053221,0.0
180,0,50,0,0,0,0,0,50,0,0,0,0,100,0.385745,0.0,0.0,0.063379,0.0


# Data splitting

In [38]:
# settings
target_col = None          # e.g., "label" for a stratified split; None -> plain KFold
n_splits = 5
shuffle = True
random_state = 42

# Read without index_col so the frame carries the default 0..n-1 index: the
# fold assignment below uses index labels as positions into fold_col.
df = pd.read_csv(csv_path)

rng = np.random.RandomState(random_state) if shuffle else None
idx = np.arange(len(df))
if shuffle:
    rng.shuffle(idx)

# fold_col[i] = fold whose TEST set contains row i; -1 means "not assigned yet"
fold_col = np.full(len(df), -1, dtype=int)

if target_col:
    # Stratified: spread the rows of every class evenly over the folds
    for cls, sub in df.groupby(target_col):
        # copy=True -> shuffle a private array, never the group's own index buffer
        sub_idx = sub.index.to_numpy(copy=True)
        # shuffle within class
        if shuffle:
            rng.shuffle(sub_idx)
        # chunk into folds
        parts = np.array_split(sub_idx, n_splits)
        for fold_id, part in enumerate(parts):
            fold_col[part] = fold_id
else:
    # Plain KFold by index
    parts = np.array_split(idx, n_splits)
    for fold_id, part in enumerate(parts):
        fold_col[part] = fold_id

# groupby skips rows whose key is missing; such rows would otherwise end up in
# every training set and in no test set.
if (fold_col < 0).any():
    raise ValueError(
        f"{int((fold_col < 0).sum())} row(s) were not assigned to any fold "
        f"(missing values in target_col={target_col!r}?)"
    )

df["fold"] = fold_col

# Option 1: export five CSVs (train/test per fold)
# The helper "fold" column is stripped from every exported file.
for k in range(n_splits):
    test = df[df["fold"] == k].drop(columns=["fold"])
    train = df[df["fold"] != k].drop(columns=["fold"])
    test.to_csv(f"XGB_dataset/fold{k+1}_test.csv", index=False)
    train.to_csv(f"XGB_dataset/fold{k+1}_train.csv", index=False)



In [47]:
# Read the five exported folds back in; position k of each list holds the
# frame loaded from fold{k+1}_train.csv / fold{k+1}_test.csv.
data_loc = 'XGB_dataset/'
train_data = []
test_data = []

for i in range(5):
    fold_train = pd.read_csv(data_loc + f'fold{i+1}_train.csv')
    fold_test = pd.read_csv(data_loc + f'fold{i+1}_test.csv')
    train_data.append(fold_train)
    test_data.append(fold_test)

# Feature selection by r2 or MAE

In [51]:
# ======= CONFIG =======
metric = "r2"       # options: "r2", "rmse", "mae"
random_state = 42

# Reject an unsupported metric before any model is trained.
if metric not in ("r2", "rmse", "mae"):
    raise ValueError("metric must be 'r2', 'rmse', or 'mae'")
higher_is_better = (metric == "r2")   # rmse / mae are errors -> minimise

# ======= STORAGE =======
scores_by_feature_number = []         # best mean CV score after each added feature
scores_by_feature_number_kfold = []   # per-fold scores behind each of those means
chosen_features = []                  # features in the order they were selected

available_features = ['Al', 'Sc', 'V', 'Cr', 'Mn', 'Fe(II)', 'Fe(III)', 'Co',
       'Cu', 'Zn', 'In', 'MA-Fe', 'MA-Co']
number_of_features = len(available_features)

# The search consumes its candidate pool, so it works on a copy:
# available_features keeps the full list for any cell that reads it afterwards.
remaining_features = list(available_features)

# ======= MAIN LOOP =======
# Greedy forward selection: each round tries every remaining feature on top of
# the ones already chosen and keeps the one with the best mean score over folds.
for i in range(number_of_features):
    # For RMSE/MAE we minimize, for R² we maximize
    best_score = -np.inf if higher_is_better else np.inf
    best_feature = None
    best_scores_kfold = None

    for feature in remaining_features:
        features_to_use = chosen_features + [feature]
        fold_scores = []

        for k_fold in range(len(train_data)):
            train = train_data[k_fold]
            test  = test_data[k_fold]

            X_train = train[features_to_use]
            X_test  = test[features_to_use]
            y_train = train['c_2py']
            y_test  = test['c_2py']

            model = XGBRegressor(
                random_state=random_state,
                learning_rate=0.1,
                n_estimators=300,
                max_depth=4,
                subsample=0.8,
                colsample_bytree=0.8,
                n_jobs=-1
            )
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)

            if metric == "r2":
                score = r2_score(y_test, y_pred)
            elif metric == "mae":
                score = mean_absolute_error(y_test, y_pred)
            else:  # "rmse" (metric was validated above)
                score = np.sqrt(np.mean((np.asarray(y_test) - y_pred) ** 2))

            fold_scores.append(float(score))

        mean_score = np.mean(fold_scores)

        # Strict comparison: on a tie the earlier candidate is kept.
        if higher_is_better:
            improved = mean_score > best_score
        else:
            improved = mean_score < best_score
        if improved:
            best_score = mean_score
            best_scores_kfold = fold_scores
            best_feature = feature

    # Happens only when no candidate produced a comparable score (e.g. all NaN).
    if best_feature is None:
        raise ValueError(
            f"no candidate feature produced a usable {metric} score in round {i+1}"
        )

    scores_by_feature_number.append(best_score)
    scores_by_feature_number_kfold.append(best_scores_kfold)
    chosen_features.append(best_feature)
    remaining_features.remove(best_feature)

    print(f"Feature {i+1:2d}: {best_feature}, {metric.upper()}: {best_score:.5f}")



Feature  1: Cu, R2: 0.30614
Feature  2: MA-Fe, R2: 0.34705
Feature  3: Co, R2: 0.47477
Feature  4: Zn, R2: 0.53383
Feature  5: In, R2: 0.57851
Feature  6: Mn, R2: 0.60712
Feature  7: MA-Co, R2: 0.61396
Feature  8: V, R2: 0.60739
Feature  9: Al, R2: 0.62787
Feature 10: Fe(II), R2: 0.61055
Feature 11: Cr, R2: 0.58479
Feature 12: Sc, R2: 0.57629
Feature 13: Fe(III), R2: 0.56518


# Feature selection by permutation importance

In [41]:
def _scoring_name(metric: str) -> str:
    if metric == "r2":
        return "r2"
    if metric == "mae":
        return "neg_mean_absolute_error"
    raise ValueError("metric must be 'r2', or 'mae'")

def permutation_importance_one_split(train, test, features, target="c_2py",
                                     metric="r2", n_repeats=10, random_state=42):
    """Fit an XGBRegressor on one train split and rank features by permutation
    importance measured on the matching test split.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames containing the ``features`` columns and the ``target`` column.
    features : list of str
        Columns used as model inputs.
    target : str
        Column to predict.
    metric : str
        Short metric name, translated by ``_scoring_name``.
    n_repeats : int
        Number of shuffles per feature.
    random_state : int
        Seed used for both the model and the permutations.

    Returns
    -------
    (pandas.DataFrame, XGBRegressor)
        The table has one row per feature, sorted by ``importance_mean``
        (largest first), with columns ``feature``, ``importance_mean`` and
        ``importance_std``. For the error metrics ("mae", "rmse") it also
        carries ``score_drop``. The second element is the fitted model.

    Raises
    ------
    ValueError
        Propagated from ``_scoring_name`` for an unsupported ``metric``.
    """
    X_train, y_train = train[features], train[target]
    X_test,  y_test  = test[features],  test[target]

    model = XGBRegressor(
        random_state=random_state, learning_rate=0.1, n_estimators=300,
        max_depth=4, subsample=0.8, colsample_bytree=0.8, n_jobs=-1, tree_method="hist"
    )
    model.fit(X_train, y_train)

    pi = permutation_importance(
        model, X_test, y_test,
        scoring=_scoring_name(metric),
        n_repeats=n_repeats, random_state=random_state, n_jobs=-1
    )

    table = pd.DataFrame({
        "feature": features,
        "importance_mean": pi.importances_mean,
        "importance_std":  pi.importances_std
    }).sort_values("importance_mean", ascending=False, ignore_index=True)

    # permutation_importance reports (baseline score - permuted score). With the
    # neg_* scorers that difference is the increase in error, so it is already
    # positive for features the model relies on; negating it would flip the
    # ranking. score_drop therefore mirrors importance_mean.
    if metric in {"rmse", "mae"}:
        table["score_drop"] = table["importance_mean"]   # larger drop => more important

    return table, model


In [42]:
def permutation_importance_cv(train_folds, test_folds, features, target="c_2py",
                              metric="r2", n_repeats=10, random_state=42):
    """Average permutation importance over several train/test folds.

    Runs ``permutation_importance_one_split`` once per fold and aggregates the
    per-fold ``importance_mean`` of each feature.

    Parameters
    ----------
    train_folds, test_folds : sequences of pandas.DataFrame
        Matching train/test frames; entry k of each forms fold k.
    features, target, metric, n_repeats, random_state
        Forwarded unchanged to ``permutation_importance_one_split``.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``out`` has one row per feature, sorted by ``importance_mean_cv``
        (largest first), with columns ``feature``, ``importance_mean_cv`` and
        ``importance_std_cv`` (sample standard deviation across folds). For
        the error metrics ("rmse", "mae") it also carries ``score_drop_cv``.
        ``M`` is the feature-by-fold matrix of per-fold importances, with
        columns ``fold_0`` ... ``fold_{k-1}``.

    Raises
    ------
    ValueError
        If the two fold sequences differ in length.
    """
    if len(train_folds) != len(test_folds):
        raise ValueError(
            f"train_folds and test_folds differ in length: "
            f"{len(train_folds)} vs {len(test_folds)}"
        )

    per_fold = []
    for train_k, test_k in zip(train_folds, test_folds):
        df_k, _ = permutation_importance_one_split(
            train_k, test_k, features, target, metric, n_repeats, random_state
        )
        # Index by feature so concat aligns the folds by name, not row order.
        per_fold.append(df_k.set_index("feature")["importance_mean"])

    M = pd.concat(per_fold, axis=1)
    M.columns = [f"fold_{i}" for i in range(len(train_folds))]
    out = pd.DataFrame({
        "feature": M.index,
        "importance_mean_cv": M.mean(axis=1).values,
        "importance_std_cv":  M.std(axis=1, ddof=1).values
    }).sort_values("importance_mean_cv", ascending=False, ignore_index=True)

    # The per-fold importances are (baseline score - permuted score), which is
    # already positive for influential features even with neg_* scorers, so the
    # drop column mirrors the mean instead of negating it.
    if metric in {"rmse", "mae"}:
        out["score_drop_cv"] = out["importance_mean_cv"]

    return out, M  # M has per-fold values if you want to inspect stability


In [43]:
# Cross-validated permutation importance, scored with R², 20 shuffles per feature.
# NOTE(review): `available_features` must still hold the full candidate list when
# this cell runs; confirm no earlier cell has emptied it.
pi_cv, per_fold = permutation_importance_cv(
    train_data,
    test_data,
    available_features,
    metric="r2",
    n_repeats=20,
)
print(pi_cv.head())

  feature  importance_mean_cv  importance_std_cv
0      Zn            0.473366           0.313057
1      In            0.368167           0.244932
2   MA-Co            0.269621           0.231345
3      Cr            0.239557           0.220335
4  Fe(II)            0.233154           0.215503


In [86]:
# Display the cross-validated permutation-importance table held in pi_cv.
pi_cv

Unnamed: 0,feature,importance_mean_cv,importance_std_cv
0,Cu,1.027487,0.467454
1,MA-Fe,0.35067,0.195328
2,Zn,0.210288,0.121038
3,Co,0.173268,0.169127
4,Mn,0.13335,0.048106
5,In,0.064554,0.03966
6,MA-Co,0.019792,0.02604
7,Sc,0.019323,0.051264
8,V,0.015775,0.013145
9,Cr,0.006489,0.012159
