In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from matplotlib.gridspec import GridSpec
from mpl_toolkits.axes_grid1 import make_axes_locatable
import matplotlib.ticker as mticker
import matplotlib.colors as mcolors
import shap
from sklearn.model_selection import KFold, cross_val_score
from sklearn.utils import shuffle
import joblib
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor
import xgboost as xgb

In [None]:
# Load the training dataset. On Google Colab the CSV is uploaded through the
# browser; everywhere else it is read from the repository's data directory.
try:
    # For Colab
    from google.colab import files
    uploaded = files.upload()
    df = pd.read_csv('Composition_dataset_wt%.csv')
except Exception:
    # For local execution. `Exception` (not a bare `except`) keeps the
    # fallback behaviour while letting KeyboardInterrupt/SystemExit propagate.
    df = pd.read_csv('../data/training/Composition_dataset_wt%.csv')

print(f"Loaded {len(df)} samples with {len(df.columns)} features")

In [None]:
# ========================================
# CONFIGURATION
# ========================================
# Workflow switches
RUN_OPTIMIZATION = False         # True: run hyperparameter tuning | False: use pre-optimized parameters
USE_EXTERNAL_VALIDATION = True   # True: run external validation | False: skip validation
SAVE_MODEL = False               # True: save model to .pkl file | False: skip saving

# Optuna budget per model. A value of None means "do not pass this limit";
# the optimization cells drop None entries before calling study.optimize().
OPTUNA_CONFIG = dict(
    RFR=dict(n_trials=100, timeout=None),   # trial limit only
    ETR=dict(n_trials=None, timeout=600),   # 10 min time limit only
    GBR=dict(n_trials=150, timeout=900),    # 150 trials or 15 min
    XGB=dict(n_trials=150, timeout=900),    # 150 trials or 15 min
)

In [None]:
if RUN_OPTIMIZATION:
    # Optuna is only needed when tuning is enabled, so it is installed and
    # imported here instead of in the top import cell.
    # For Google Colab: the line below installs Optuna (comment it out when running locally)
    !pip install -q optuna

    # For local use: install via requirements.txt
    # pip install -r requirements.txt

    import optuna
    import  optuna.visualization as vis

In [None]:
if USE_EXTERNAL_VALIDATION:
    # Configuration: Set target property and feature type
    property_name = "rho"  # "Hc", "Js", or "rho"
    featurization_method = "Composition"  # "Composition", "WenAlloys", or "CBFV"
    output_file = f"predictions_{property_name}_{featurization_method}.xlsx"
    column_name = f"{property_name}_predict"

    # Load experimental dataset
    try:
        # Colab: upload file
        uploaded = files.upload()
        X_predict = pd.read_csv('Dataset_experimental_wt%.csv')
    except Exception:
        # Local: load from directory. `Exception` still covers `files` being
        # unavailable outside Colab, but no longer swallows KeyboardInterrupt.
        X_predict = pd.read_csv('../data/external_validation/Dataset_experimental_wt%.csv')

    print(f"Loaded {len(X_predict)} experimental samples with {len(X_predict.columns)} features")
    display(X_predict.head())

    # Initialize prediction results table (one row per alloy and model).
    # The size follows the loaded dataset so that the per-model assignments
    # further down always match the number of predictions.
    n_alloys = len(X_predict)
    alloy_numbers = list(range(1, n_alloys + 1))
    results = pd.DataFrame({
        "Model": ["ETR"] * n_alloys + ["XGB"] * n_alloys,
        "Alloy_number": alloy_numbers * 2,
        column_name: [np.nan] * (2 * n_alloys)
    })

    print(f"\nPrediction table initialized for {property_name} ({featurization_method})")
    print(results, "\n")

In [None]:
# Exploratory Data Analysis: summary statistics of the loaded training data
df.describe()

In [None]:
# Data Preprocessing
# Keep only the rows that have a target value, then renumber the rows from 0.
df_drop = (
    df
    .dropna(subset=['Resistivity'])
    .reset_index(drop=True)
)

In [None]:
# Feature and Target Extraction
target = df_drop['Resistivity']      # Target: Resistivity (μΩ⋅cm)
features = df_drop.iloc[:, :3]       # First three columns: Si, Al, Fe composition (wt.%)

In [None]:
# Data Shuffling and Splitting
# The shuffled data goes into new names instead of overwriting `features` and
# `target`: re-running this cell then always starts from the same row order
# and reproduces exactly the same train/test split.
features_shuffled, target_shuffled = shuffle(features, target, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(
    features_shuffled, target_shuffled, test_size=0.2, random_state=42
)

print(f"Training samples: {len(X_train)}, Testing samples: {len(X_test)}")

# Optuna Hyperparameter Optimization

In [None]:
# Empty table that collects the test-set metrics; one row is appended per evaluated model
model_results = pd.DataFrame(
    columns=['Model', 'MAE', 'MSE', 'RMSE', 'R²'],
)

# Random Forest

In [None]:
def objective(trial):
    """Score one Optuna trial for the Random Forest regressor.

    Samples a hyperparameter set from ``trial`` and returns the mean R² of a
    5-fold cross-validation on ``X_train`` / ``y_train`` (the value the study
    maximizes).
    """
    # Search space (sampled in a fixed order)
    search_space = dict(
        n_estimators=trial.suggest_int('n_estimators', 300, 500),
        max_depth=trial.suggest_int('max_depth', 9, 30),
        min_samples_split=trial.suggest_int('min_samples_split', 2, 5),
        min_samples_leaf=trial.suggest_int('min_samples_leaf', 1, 2),
        criterion=trial.suggest_categorical(
            'criterion',
            ['squared_error', 'absolute_error', 'friedman_mse', 'poisson'],
        ),
    )

    # 5-fold CV on the training split only; the test split is never touched here
    candidate = RandomForestRegressor(random_state=42, **search_space)
    fold_scores = cross_val_score(candidate, X_train, y_train, cv=5, scoring='r2')
    return fold_scores.mean()

In [None]:
if not RUN_OPTIMIZATION:
    # Use pre-optimized parameters
    print("Using pre-optimized hyperparameters...")
    best_params = {
        'n_estimators': 460,
        'max_depth': 10,
        'min_samples_split': 2,
        'min_samples_leaf': 1,
        'criterion': 'absolute_error'
    }
    print(f"Parameters: {best_params}\n")

else:
    print("Running hyperparameter optimization...")
    study = optuna.create_study(direction='maximize')

    # Only the limits that are actually set (not None) are passed to Optuna
    config = OPTUNA_CONFIG['RFR']
    optimize_kwargs = {name: limit for name, limit in config.items() if limit is not None}

    study.optimize(objective, **optimize_kwargs)

    # Report the best trial
    trial = study.best_trial
    print('\nBest trial results:')
    print(f'R2 score: {trial.value:.4f}\n')

    # Print the parameters as a dict literal that can be pasted back into the notebook
    print('Best parameters:')
    print('best_params = {')
    for key, value in trial.params.items():
        if isinstance(value, str):
            print(f"    '{key}': '{value}',")   # strings need quotes
            continue
        print(f"    '{key}': {value},")
    print('}\n')

    best_params = study.best_params

    # Optuna diagnostics: history, parameter importance, parallel coordinates
    for make_plot in (
        vis.plot_optimization_history,
        vis.plot_param_importances,
        vis.plot_parallel_coordinate,
    ):
        make_plot(study).show()

# Fit the final Random Forest on the full training split
best_model = RandomForestRegressor(random_state=42, **best_params)
best_model.fit(X_train, y_train)

In [None]:
if SAVE_MODEL:
    # Save trained model
    joblib.dump(best_model, '1_composition_Resistivity_rfr.pkl')
    print("✅ Model saved as '1_composition_Resistivity_rfr.pkl'")

    # Best-effort browser download (Colab only). The failure outside Colab is
    # ignored on purpose, but only `Exception` is caught so that an interrupt
    # is not silently swallowed.
    try:
        files.download('1_composition_Resistivity_rfr.pkl')
    except Exception:
        pass

In [None]:
# Predict on test set
y_pred = best_model.predict(X_test)

# Calculate evaluation metrics
test_r2 = r2_score(y_test, y_pred)
test_mae = mean_absolute_error(y_test, y_pred)
test_mse = mean_squared_error(y_test, y_pred)
test_rmse = np.sqrt(test_mse)

# Display metrics in formatted table
print("\nTest Set Evaluation Metrics:")
print("----------------------------------------")
print(f"| {'Metric':<10} | {'Value':>15} |")
print("|------------|----------------|")
print(f"| R²         | {test_r2:>15.4f} |")
print(f"| MAE        | {test_mae:>15.4f} |")
print(f"| MSE        | {test_mse:>15.4f} |")
print(f"| RMSE       | {test_rmse:>15.4f} |")
# Closing rule, so this table matches the ones printed for ETR, GBR and XGB
print("----------------------------------------")

In [None]:
def evaluate_and_store(model, model_name, X_test, y_test, results_df):
    """Evaluate ``model`` on the test set and append its metrics to ``results_df``.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``
    model_name : str
        Label stored in the 'Model' column.
    X_test, y_test : test features and true target values
    results_df : pandas.DataFrame
        Table of previously collected metrics (may be empty).

    Returns
    -------
    pandas.DataFrame
        A new frame holding the rows of ``results_df`` followed by one row with
        the keys 'Model', 'MAE', 'MSE', 'RMSE' and 'R²'. ``results_df`` itself
        is not modified.
    """
    # Predict and calculate metrics
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)  # computed once, reused for RMSE

    metrics = {
        'Model': model_name,
        'MAE': mean_absolute_error(y_test, y_pred),
        'MSE': mse,
        'RMSE': np.sqrt(mse),
        'R²': r2_score(y_test, y_pred)
    }
    new_row = pd.DataFrame([metrics])

    # Concatenating with an empty frame is deprecated in recent pandas versions
    # (it affects the resulting dtypes), so the first row is returned directly
    # while keeping the column layout of `results_df`.
    if len(results_df) == 0:
        extra_columns = [c for c in new_row.columns if c not in results_df.columns]
        return new_row.reindex(columns=list(results_df.columns) + extra_columns)

    return pd.concat([results_df, new_row], ignore_index=True)


# Store RFR evaluation results
model_results = evaluate_and_store(best_model, 'RFR', X_test, y_test, model_results)

# Extra Trees Regressor

In [None]:
def objective(trial):
    """Score one Optuna trial for the Extra Trees regressor.

    Samples a hyperparameter set from ``trial`` and returns the mean R² of a
    5-fold cross-validation on ``X_train`` / ``y_train``.
    """
    # Search space (sampled in a fixed order)
    search_space = dict(
        n_estimators=trial.suggest_int('n_estimators', 100, 500),
        max_depth=trial.suggest_int('max_depth', 8, 30),
        min_samples_split=trial.suggest_int('min_samples_split', 2, 8),
        min_samples_leaf=trial.suggest_int('min_samples_leaf', 1, 2),
        criterion=trial.suggest_categorical(
            'criterion',
            ['squared_error', 'absolute_error', 'friedman_mse', 'poisson'],
        ),
    )

    # 5-fold CV on the training split
    candidate = ExtraTreesRegressor(random_state=42, **search_space)
    fold_scores = cross_val_score(candidate, X_train, y_train, cv=5, scoring='r2')
    return fold_scores.mean()

In [None]:
if not RUN_OPTIMIZATION:
    # Use pre-optimized parameters
    print("Using pre-optimized hyperparameters...")
    best_params = {
        'n_estimators': 304,
        'max_depth': 8,
        'min_samples_split': 2,
        'min_samples_leaf': 1,
        'criterion': 'absolute_error'
    }
    print(f"Parameters: {best_params}\n")

else:
    print("Running hyperparameter optimization...")
    study = optuna.create_study(direction='maximize')

    # Only the limits that are actually set (not None) are passed to Optuna
    config = OPTUNA_CONFIG['ETR']
    optimize_kwargs = {name: limit for name, limit in config.items() if limit is not None}

    study.optimize(objective, **optimize_kwargs)

    # Report the best trial
    trial = study.best_trial
    print('\nBest trial results:')
    print(f'R2 score: {trial.value:.4f}\n')

    # Print the parameters as a dict literal that can be pasted back into the notebook
    print('Best parameters:')
    print('best_params = {')
    for key, value in trial.params.items():
        if isinstance(value, str):
            print(f"    '{key}': '{value}',")   # strings need quotes
            continue
        print(f"    '{key}': {value},")
    print('}\n')

    best_params = study.best_params

    # Optuna diagnostics: history, parameter importance, parallel coordinates
    for make_plot in (
        vis.plot_optimization_history,
        vis.plot_param_importances,
        vis.plot_parallel_coordinate,
    ):
        make_plot(study).show()

# Fit the final Extra Trees model on the full training split
best_model = ExtraTreesRegressor(random_state=42, **best_params)
best_model.fit(X_train, y_train)

In [None]:
if SAVE_MODEL:
    # Save model to .pkl file
    joblib.dump(best_model, '2_composition_Resistivity_etr.pkl')
    print("✅ Model saved as '2_composition_Resistivity_etr.pkl'")

    # Best-effort browser download (Colab only). The failure outside Colab is
    # ignored on purpose, but only `Exception` is caught so that an interrupt
    # is not silently swallowed.
    try:
        files.download('2_composition_Resistivity_etr.pkl')
    except Exception:
        pass

In [None]:
# Score the fitted model on the held-out test split
y_pred = best_model.predict(X_test)

test_mse = mean_squared_error(y_test, y_pred)
test_rmse = np.sqrt(test_mse)
test_mae = mean_absolute_error(y_test, y_pred)
test_r2 = r2_score(y_test, y_pred)

# Fixed-width text table: a single print call, one output line per argument
print(
    "\nTest Set Evaluation Metrics:",
    "----------------------------------------",
    f"| {'Metric':<10} | {'Value':>15} |",
    "|------------|----------------|",
    f"| R²         | {test_r2:>15.4f} |",
    f"| MAE        | {test_mae:>15.4f} |",
    f"| MSE        | {test_mse:>15.4f} |",
    f"| RMSE       | {test_rmse:>15.4f} |",
    "----------------------------------------",
    sep="\n",
)

In [None]:
if USE_EXTERNAL_VALIDATION:
    # Predict the experimental alloys with the model currently in `best_model` (ETR)
    y_pred_etr = best_model.predict(X_predict)

    # Write the predictions into the ETR rows of the shared results table
    is_etr_row = results["Model"] == "ETR"
    results.loc[is_etr_row, column_name] = y_pred_etr

    print(f"ETR predictions saved: {y_pred_etr}")
    print(results)
    print("\n")

In [None]:
# Evaluate ETR model and store results.
# `evaluate_and_store` is defined once, in the Random Forest section above; the
# byte-for-byte duplicate definition that used to live in this cell was removed
# so that there is a single implementation to maintain.
model_results = evaluate_and_store(best_model, 'ETR', X_test, y_test, model_results)

In [None]:
def shifted_coolwarm_cmap(p, n=256):
    """Build a 'coolwarm' variant whose neutral midpoint sits at position ``p``.

    ``p`` is a position on the 0..1 colormap axis. Positions in [0, p] are
    stretched over the lower half of 'coolwarm' and positions in (p, 1] over
    the upper half. ``n`` is the number of colors in the returned
    ``ListedColormap`` (named "coolwarm_shifted").
    """
    base = plt.get_cmap("coolwarm")

    def _to_base_position(x):
        # Piecewise-linear remap; the guards avoid dividing by zero at p == 0 / p == 1
        if x <= p:
            return 0.5 * (x / p) if p > 0 else 0.0
        return 0.5 + 0.5 * ((x - p) / (1 - p)) if p < 1 else 1.0

    colors = [base(_to_base_position(x)) for x in np.linspace(0, 1, n)]
    return mcolors.ListedColormap(colors, name="coolwarm_shifted")



def make_coolwarm_saturated_with_center(values, vcenter=0.0, nbins=6, steps=(1, 2, 2.5, 5, 10)):
    """Return ``(norm, cmap, ticks)`` for coloring SHAP values.

    The norm spans the finite data range, so the extreme values get fully
    saturated colors. When ``vcenter`` lies strictly inside that range the
    colormap is shifted so ``vcenter`` maps to the neutral color and is
    guaranteed to appear among the ticks; otherwise plain 'coolwarm' is used.
    ``nbins`` and ``steps`` are forwarded to ``MaxNLocator``.
    """
    finite = np.asarray(values, dtype=float)
    finite = finite[np.isfinite(finite)]

    # No usable data: fall back to a symmetric default
    if finite.size == 0:
        return (
            mcolors.Normalize(vmin=-1, vmax=1, clip=True),
            plt.get_cmap("coolwarm"),
            np.array([-1, 0, 1], dtype=float),
        )

    lo = float(np.min(finite))
    hi = float(np.max(finite))

    # Normalize over the actual data range
    norm = mcolors.Normalize(vmin=lo, vmax=hi, clip=True)

    # "Nice" tick positions, restricted to the normalized range
    locator = mticker.MaxNLocator(nbins=nbins, steps=list(steps))
    ticks = locator.tick_values(lo, hi)
    inside = (ticks >= norm.vmin) & (ticks <= norm.vmax)
    ticks = ticks[inside]

    # vcenter outside the data range: nothing to shift
    if not (lo < vcenter < hi):
        return norm, plt.get_cmap("coolwarm"), ticks

    # Shift the neutral color to the relative position of vcenter
    cmap = shifted_coolwarm_cmap((vcenter - lo) / (hi - lo), n=256)
    if not np.isclose(ticks, vcenter).any():
        ticks = np.sort(np.append(ticks, vcenter))

    return norm, cmap, ticks

In [None]:
# ============ CALCULATE SHAP VALUES ============
# `best_model` is the Extra Trees model at this point; X_train is passed to the
# explainer as data and is also the set being explained.
explainer = shap.TreeExplainer(best_model, X_train)
shap_values = explainer(X_train)

# Create Explanation object (optional)
explanation = shap.Explanation(
    values=shap_values.values,
    base_values=explainer.expected_value,
    data=X_train.values,
    feature_names=X_train.columns.tolist()
)

# ============ DEFINE FEATURE ORDER ============
desired_order = ['Fe', 'Si', 'Al']

# ============ BUILD GRID-SHAP PLOTS ============
# One panel per feature: feature value (x) vs. measured target (y), colored by
# the SHAP value of that feature.
plt.style.use('default')
fig = plt.figure(figsize=(3.5, 10))
gs = GridSpec(3, 1, figure=fig, hspace=0.3)

for i, feature_name in enumerate(desired_order):
    # Get feature index
    try:
        feature_index = X_train.columns.get_loc(feature_name)
    except KeyError:
        print(f"Error: Feature '{feature_name}' not found in X_train.")
        continue

    ax = fig.add_subplot(gs[i, 0])
    ax.set_facecolor('#f0f0f0')

    # Extract SHAP values for current feature
    shap_col = shap_values[:, feature_index].values

    # Get normalized colormap with zero at neutral
    norm, cmap_local, ticks = make_coolwarm_saturated_with_center(
        shap_col,
        vcenter=0.0,
        nbins=6
    )

    # Scatter plot
    sc = ax.scatter(
        X_train[feature_name],
        y_train,
        c=shap_col,
        cmap=cmap_local,
        norm=norm,
        alpha=0.9,
        s=50,
        edgecolor='black',
        linewidth=0.3,
        marker='o'
    )

    # Add colorbar
    divider = make_axes_locatable(ax)
    cax = divider.append_axes('right', size='4.5%', pad=0.05)
    cbar = fig.colorbar(sc, cax=cax)

    # Set scientific ticks
    cbar.set_ticks(ticks)
    cbar.formatter = mticker.ScalarFormatter(useMathText=False)
    cbar.formatter.set_powerlimits((-100, 100))
    cbar.update_ticks()
    cbar.ax.tick_params(labelsize=11)
    cbar.set_label('SHAP Value', fontsize=12, labelpad=5)

    # Configure axes
    ax.yaxis.set_major_locator(mticker.MaxNLocator(nbins=5, integer=True))
    ax.xaxis.set_major_locator(mticker.MaxNLocator(min_n_ticks=3, nbins=4, integer=True))
    ax.set_xlabel(feature_name, fontsize=12, labelpad=4)
    ax.set_ylabel("Resistivity (μΩ⋅cm)", fontsize=12, labelpad=4)
    ax.tick_params(axis='both', labelsize=11, pad=4)
    ax.grid(True, linestyle=':', alpha=0.35, linewidth=0.45)

# Save figure (via the explicit figure handle rather than pyplot's current figure)
fig.savefig('Composition_Grid-SHAP_rho_ETR.png',
            dpi=600,
            bbox_inches='tight',
            facecolor='white')
plt.show()

# Download in Colab (auto-skip locally). Only `Exception` is caught so that an
# interrupt is not silently swallowed.
try:
    files.download('Composition_Grid-SHAP_rho_ETR.png')
except Exception:
    pass

# Gradient Boosting Regressor

In [None]:
def objective(trial):
    """Score one Optuna trial for the Gradient Boosting regressor.

    Only part of the configuration is sampled from ``trial``; the leaf size,
    loss and split criterion are held constant. Returns the mean R² of a
    5-fold cross-validation on ``X_train`` / ``y_train``.
    """
    # Tuned hyperparameters (sampled in a fixed order)
    sampled = {
        'n_estimators': trial.suggest_int('n_estimators', 420, 500),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'min_samples_split': trial.suggest_int('min_samples_split', 8, 18),
        'subsample': trial.suggest_float('subsample', 0.5, 0.6),
    }
    # Settings that are not part of the search
    fixed = {
        'min_samples_leaf': 2,
        'loss': 'absolute_error',
        'criterion': 'friedman_mse',
    }

    # 5-fold CV on the training split
    candidate = GradientBoostingRegressor(random_state=42, **sampled, **fixed)
    fold_scores = cross_val_score(candidate, X_train, y_train, cv=5, scoring='r2')
    return fold_scores.mean()

In [None]:
if RUN_OPTIMIZATION:
    print("Running hyperparameter optimization...")
    study = optuna.create_study(direction='maximize')

    # Extract config and filter out None values
    config = OPTUNA_CONFIG['GBR']
    optimize_kwargs = {k: v for k, v in config.items() if v is not None}

    study.optimize(objective, **optimize_kwargs)

    # Display results
    print('\nBest trial results:')
    trial = study.best_trial
    print(f'R2 score: {trial.value:.4f}\n')

    # `study.best_params` only holds the values Optuna sampled. The settings the
    # GBR `objective` keeps constant must be re-applied here; otherwise the
    # final model would silently fall back to the scikit-learn defaults and
    # differ from the configuration that was actually cross-validated.
    fixed_params = {
        'min_samples_leaf': 2,
        'loss': 'absolute_error',
        'criterion': 'friedman_mse',
    }
    best_params = {**study.best_params, **fixed_params}

    # Printed as a dict literal that can be pasted back into the notebook
    print('Best parameters:')
    print('best_params = {')
    for key, value in best_params.items():
        print(f"    '{key}': {value!r},")
    print('}\n')

    # Display Optuna optimization visualizations
    vis.plot_optimization_history(study).show()
    vis.plot_param_importances(study).show()
    vis.plot_parallel_coordinate(study).show()

else:
    # Use pre-optimized parameters
    # NOTE(review): this set does not match the search space of `objective`
    # (subsample 0.8 lies outside 0.5-0.6, and the fixed loss/criterion/
    # min_samples_leaf are absent, so scikit-learn defaults apply). Left
    # unchanged to keep previously reported results reproducible - confirm.
    print("Using pre-optimized hyperparameters...")
    best_params = {
        'n_estimators': 442,
        'learning_rate': 0.04,
        'max_depth': 3,
        'min_samples_split': 17,
        'subsample': 0.8
    }
    print(f"Parameters: {best_params}\n")


# Train model with best parameters
best_model = GradientBoostingRegressor(random_state=42, **best_params)
best_model.fit(X_train, y_train)

In [None]:
if SAVE_MODEL:
    # Save model to .pkl file
    joblib.dump(best_model, '3_composition_Resistivity_gbr.pkl')
    print("✅ Model saved as '3_composition_Resistivity_gbr.pkl'")

    # Best-effort browser download (Colab only). The failure outside Colab is
    # ignored on purpose, but only `Exception` is caught so that an interrupt
    # is not silently swallowed.
    try:
        files.download('3_composition_Resistivity_gbr.pkl')
    except Exception:
        pass

In [None]:
# Score the fitted model on the held-out test split
y_pred = best_model.predict(X_test)

test_mse = mean_squared_error(y_test, y_pred)
test_rmse = np.sqrt(test_mse)
test_mae = mean_absolute_error(y_test, y_pred)
test_r2 = r2_score(y_test, y_pred)

# Fixed-width text table: a single print call, one output line per argument
print(
    "\nTest Set Evaluation Metrics:",
    "----------------------------------------",
    f"| {'Metric':<10} | {'Value':>15} |",
    "|------------|----------------|",
    f"| R²         | {test_r2:>15.4f} |",
    f"| MAE        | {test_mae:>15.4f} |",
    f"| MSE        | {test_mse:>15.4f} |",
    f"| RMSE       | {test_rmse:>15.4f} |",
    "----------------------------------------",
    sep="\n",
)

In [None]:
# Evaluate GBR model and store results.
# `evaluate_and_store` is defined once, in the Random Forest section above; the
# byte-for-byte duplicate definition that used to live in this cell was removed
# so that there is a single implementation to maintain.
model_results = evaluate_and_store(best_model, 'GBR', X_test, y_test, model_results)

# XGBoost Regressor

In [None]:
# The XGBoost Optuna objective is defined in the next cell. This cell used to
# hold an identical, undocumented copy of that function, which the next cell
# immediately overwrote; the duplicate was removed.

In [None]:
def objective(trial):
    """Score one Optuna trial for the XGBoost regressor.

    Samples a hyperparameter set from ``trial`` (the learning objective is
    fixed to squared error) and returns the mean R² of a 5-fold
    cross-validation on ``X_train`` / ``y_train``.
    """
    # Search space (sampled in a fixed order); 'objective' is not tuned
    search_space = {
        'objective': 'reg:squarederror',
        'n_estimators': trial.suggest_int('n_estimators', 400, 500),
        'max_depth': trial.suggest_int('max_depth', 6, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.2, 0.3, log=True),
        'subsample': trial.suggest_float('subsample', 0.6, 0.7),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.9, 1.0),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 2),
        'gamma': trial.suggest_float('gamma', 0, 0.3),
        'alpha': trial.suggest_float('alpha', 4, 6),
        'lambda': trial.suggest_float('lambda', 8.5, 10),
    }

    # 5-fold CV on the training split
    candidate = xgb.XGBRegressor(random_state=42, **search_space)
    fold_scores = cross_val_score(candidate, X_train, y_train, cv=5, scoring='r2')
    return fold_scores.mean()

In [None]:
if RUN_OPTIMIZATION:
    print("Running hyperparameter optimization...")
    study = optuna.create_study(direction='maximize')

    # Extract config and filter out None values
    config = OPTUNA_CONFIG['XGB']
    optimize_kwargs = {k: v for k, v in config.items() if v is not None}

    study.optimize(objective, **optimize_kwargs)

    # Display results
    print('\nBest trial results:')
    trial = study.best_trial
    print(f'R2 score: {trial.value:.4f}\n')

    # `study.best_params` only holds the sampled values; the learning objective
    # that the XGB `objective` function fixes is carried over explicitly so the
    # final model uses the same configuration that was cross-validated.
    best_params = {'objective': 'reg:squarederror', **study.best_params}

    # Printed as a dict literal that can be pasted back into the notebook
    print('Best parameters:')
    print('best_params = {')
    for key, value in best_params.items():
        print(f"    '{key}': {value!r},")
    print('}\n')

    # Display Optuna optimization visualizations
    vis.plot_optimization_history(study).show()
    vis.plot_param_importances(study).show()
    vis.plot_parallel_coordinate(study).show()

else:
    # Use pre-optimized parameters
    print("Using pre-optimized hyperparameters...")
    best_params = {
        'objective': 'reg:squarederror',
        'n_estimators': 461,
        'max_depth': 6,
        'learning_rate': 0.2345269441539146,
        'subsample': 0.6524034514469835,
        'colsample_bytree': 0.9817378593570328,
        'min_child_weight': 2,
        'gamma': 0.2998781940594144,
        'alpha': 5.422366535485192,
        'reg_lambda': 9.3158705998517
    }
    print(f"Parameters: {best_params}\n")


# Train model with best parameters
# (the previously computed but never used `mean_target` was dropped)
best_model = xgb.XGBRegressor(random_state=42, **best_params)
best_model.fit(X_train, y_train)

In [None]:
if SAVE_MODEL:
    # Save model to .pkl file
    joblib.dump(best_model, '4_composition_Resistivity_xgb.pkl')
    # The message now reports the real file name, including the .pkl extension
    print("✅ Model saved as '4_composition_Resistivity_xgb.pkl'")

    # Best-effort browser download (Colab only). The failure outside Colab is
    # ignored on purpose, but only `Exception` is caught so that an interrupt
    # is not silently swallowed.
    try:
        files.download('4_composition_Resistivity_xgb.pkl')
    except Exception:
        pass

In [None]:
# Score the fitted model on the held-out test split
y_pred = best_model.predict(X_test)

test_mse = mean_squared_error(y_test, y_pred)
test_rmse = np.sqrt(test_mse)
test_mae = mean_absolute_error(y_test, y_pred)
test_r2 = r2_score(y_test, y_pred)

# Fixed-width text table: a single print call, one output line per argument
print(
    "\nTest Set Evaluation Metrics:",
    "----------------------------------------",
    f"| {'Metric':<10} | {'Value':>15} |",
    "|------------|----------------|",
    f"| R²         | {test_r2:>15.4f} |",
    f"| MAE        | {test_mae:>15.4f} |",
    f"| MSE        | {test_mse:>15.4f} |",
    f"| RMSE       | {test_rmse:>15.4f} |",
    "----------------------------------------",
    sep="\n",
)

In [None]:
if USE_EXTERNAL_VALIDATION:
    # Make predictions on external validation set (XGB)
    y_pred_xgb = best_model.predict(X_predict)

    # Save XGB predictions to results DataFrame
    results.loc[results["Model"] == "XGB", column_name] = y_pred_xgb
    print(f"XGB predictions saved: {y_pred_xgb}")
    print(results)
    print("\n")

    # Add experimental ground truth values (one per alloy, in alloy order)
    y_true = [86.8, 106.9, 146.1, 122.8, 101.2]  # ← UPDATE WITH ACTUAL VALUES

    # Attach the ground truth by alloy number instead of relying on the row
    # order of `results`, and fail loudly if the counts do not match.
    n_alloys = results["Alloy_number"].nunique()
    if len(y_true) != n_alloys:
        raise ValueError(
            f"y_true has {len(y_true)} values but the results table holds {n_alloys} alloys"
        )
    y_true_by_alloy = dict(enumerate(y_true, start=1))
    results["y_true"] = results["Alloy_number"].map(y_true_by_alloy)

    # Calculate errors
    results["error"] = results[column_name] - results["y_true"]
    results["abs_error"] = results["error"].abs()
    results["abs_percent_error_%"] = 100 * results["abs_error"] / results["y_true"]

    # Calculate average metrics per model
    for model_label in ["ETR", "XGB"]:
        mask = results["Model"] == model_label
        mae = results.loc[mask, "abs_error"].mean()
        mape = results.loc[mask, "abs_percent_error_%"].mean()
        print(f"{model_label}: MAE = {mae:.4f}, MAPE = {mape:.2f}%")

    # Save results to Excel
    results.to_excel(output_file, index=False, engine='openpyxl')
    print(f"✓ Results saved to {output_file}")

    # Download file (Colab only); only `Exception` is caught so that an
    # interrupt is not silently swallowed.
    try:
        files.download(output_file)
    except Exception:
        pass

In [None]:
# Evaluate XGB model and store results.
# `evaluate_and_store` is defined once, in the Random Forest section above; the
# duplicate definition that used to live in this cell was removed so that there
# is a single implementation to maintain.
model_results = evaluate_and_store(best_model, 'XGB', X_test, y_test, model_results)

In [None]:
def plot_train_test_relationship(model, X_train, y_train, X_test, y_test, model_name="Model"):
    """Plot actual vs predicted values for train and test sets.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``
    X_train, y_train : training features and true target values
    X_test, y_test : test features and true target values
    model_name : str, default "Model"
        Shown as the figure title (the argument was previously accepted but
        ignored).

    The figure is displayed with ``plt.show()``; nothing is returned.
    """
    # Initialize figure with publication quality settings
    fig, ax = plt.subplots(figsize=(8, 6), dpi=300)

    # Generate predictions for both datasets
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    # Plot training and test data with distinct colors
    ax.scatter(y_train, y_train_pred, alpha=0.8, label='Training', color='#2408b1', s=150)
    ax.scatter(y_test, y_test_pred, alpha=0.8, label='Test', color='#ca4016', s=150)

    # Add ideal prediction line (y = x) spanning all actual and predicted values
    max_val = max(np.max(y_train), np.max(y_test), np.max(y_train_pred), np.max(y_test_pred))
    min_val = min(np.min(y_train), np.min(y_test), np.min(y_train_pred), np.min(y_test_pred))
    ax.plot([min_val, max_val], [min_val, max_val], '--', color='black',
            label='Perfect Prediction', linewidth=4)

    # Display R² scores as text annotations (axes-relative coordinates)
    train_r2 = r2_score(y_train, y_train_pred)
    test_r2 = r2_score(y_test, y_test_pred)
    ax.text(0.05, 0.9, f'Train $R^2$ = {train_r2:.3f}', transform=ax.transAxes,
            color='#2408b1', fontsize=22)
    ax.text(0.05, 0.83, f'Test $R^2$ = {test_r2:.3f}', transform=ax.transAxes,
            color='#ca4016', fontsize=22)

    # Title and axis labels
    ax.set_title(model_name, fontsize=22)
    ax.set_xlabel('Actual Resistivity (μΩ⋅cm)', fontsize=22)
    ax.set_ylabel('Predicted Resistivity (μΩ⋅cm)', fontsize=22)

    # Configure legend
    ax.legend(fontsize=20, framealpha=1)

    # Add grid for better readability
    ax.grid(True, linestyle='--', alpha=0.5)

    # Increase tick label size
    ax.tick_params(axis='both', which='major', labelsize=20)

    # Optimize layout and display
    fig.tight_layout()
    plt.show()

In [None]:
# Actual-vs-predicted plot for the model currently held in `best_model`
plot_train_test_relationship(
    model=best_model,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    model_name="XGBoost",
)

In [None]:
# The colormap helpers `shifted_coolwarm_cmap` and
# `make_coolwarm_saturated_with_center` are defined once, in the Extra Trees
# SHAP section above. The identical second copy that used to live in this cell
# was removed so that there is a single implementation to maintain.

In [None]:
# ============ EXTRACT SHAP FROM XGBOOST (DMatrix) ============
# Wrap the training features in a DMatrix that carries the column names
dmatrix_train = xgb.DMatrix(X_train, feature_names=X_train.columns.tolist())

# Per-sample contributions from XGBoost's native API: every column except the
# last is a feature contribution, the last column is the base value.
booster = best_model.get_booster()
shap_values_raw = booster.predict(dmatrix_train, pred_contribs=True)
shap_values_matrix = shap_values_raw[:, :-1]
expected_value = float(shap_values_raw[0, -1])  # base value taken from the first row

# Package everything as a SHAP Explanation (one base value per sample)
explanation = shap.Explanation(
    values=shap_values_matrix,
    base_values=np.full(len(X_train), expected_value),
    data=X_train.values,
    feature_names=X_train.columns.tolist()
)

print(f"Shape SHAP values: {shap_values_matrix.shape}")
print(f"Expected value: {expected_value}")

In [None]:
# ============ DEFINE FEATURE ORDER ============
desired_order = ['Fe', 'Si', 'Al']

# ============ BUILD GRID-SHAP PLOTS ============
# One panel per feature: feature value (x) vs. measured target (y), colored by
# the SHAP value of that feature.
plt.style.use('default')
fig = plt.figure(figsize=(3.5, 10))
gs = GridSpec(3, 1, figure=fig, hspace=0.3)

for i, feature_name in enumerate(desired_order):
    # Get feature index
    try:
        feature_index = X_train.columns.get_loc(feature_name)
    except KeyError:
        print(f"Error: Feature '{feature_name}' not found in X_train.")
        continue

    ax = fig.add_subplot(gs[i, 0])
    ax.set_facecolor('#f0f0f0')

    # Extract SHAP values for current feature
    shap_col = explanation[:, feature_index].values

    # Get normalized colormap with zero at neutral
    norm, cmap_local, ticks = make_coolwarm_saturated_with_center(
        shap_col,
        vcenter=0.0,
        nbins=6
    )

    # Scatter plot
    sc = ax.scatter(
        X_train[feature_name],
        y_train,
        c=shap_col,
        cmap=cmap_local,
        norm=norm,
        alpha=0.9,
        s=50,
        edgecolor='black',
        linewidth=0.3,
        marker='o'
    )

    # Add colorbar
    divider = make_axes_locatable(ax)
    cax = divider.append_axes('right', size='4.5%', pad=0.05)
    cbar = fig.colorbar(sc, cax=cax)

    # Set scientific ticks
    cbar.set_ticks(ticks)
    cbar.formatter = mticker.ScalarFormatter(useMathText=False)
    cbar.formatter.set_powerlimits((-2, 3))
    cbar.update_ticks()
    cbar.ax.tick_params(labelsize=11)
    cbar.set_label('SHAP Value', fontsize=12, labelpad=5)

    # Configure axes
    ax.yaxis.set_major_locator(mticker.MaxNLocator(nbins=5, integer=True))
    ax.xaxis.set_major_locator(mticker.MaxNLocator(min_n_ticks=3, nbins=4, integer=True))
    ax.set_xlabel(feature_name, fontsize=12, labelpad=4)
    # y axis shows the measured target (y_train), i.e. resistivity; the label
    # previously read "Saturation Polarization (T)", left over from another notebook
    ax.set_ylabel("Resistivity (μΩ⋅cm)", fontsize=12, labelpad=4)
    ax.tick_params(axis='both', labelsize=11, pad=4)
    ax.grid(True, linestyle=':', alpha=0.35, linewidth=0.45)

# Save figure (via the explicit figure handle rather than pyplot's current figure)
fig.savefig('Composition_Grid-SHAP_rho_XGB.png',
            dpi=600,
            bbox_inches='tight',
            facecolor='white')
plt.show()

# Download file (Colab only); only `Exception` is caught so that an interrupt
# is not silently swallowed.
try:
    files.download('Composition_Grid-SHAP_rho_XGB.png')
except Exception:
    pass

# **Results**

In [None]:
# After evaluating all models, add this to display and save results:
def display_model_results(results_df):
    """Display and save model comparison results.

    Prints the metrics table, shows a styled version in which the best value
    of every metric column is highlighted (lowest MAE/MSE/RMSE in yellow,
    highest R² in light green) and writes the table to
    'Composition_rho_metrics.xlsx'.

    ``results_df`` is expected to hold one row per model with a 'Model'
    column. It is NOT modified: the function works on an indexed copy, so the
    cell can be re-run without a KeyError on the already consumed 'Model'
    column. Returns None.
    """
    # Format display
    pd.options.display.float_format = '{:.4f}'.format

    # Index by model name on a copy instead of mutating the caller's frame
    if 'Model' in results_df.columns:
        table = results_df.set_index('Model')
    else:
        table = results_df.copy()

    print("\nFinal Model Comparison:")
    print("="*60)
    print(table)
    print("="*60)

    # Highlight best scores
    best_rule = {
        'MAE': ('min', 'background-color: yellow'),
        'MSE': ('min', 'background-color: yellow'),
        'RMSE': ('min', 'background-color: yellow'),
        'R²': ('max', 'background-color: lightgreen'),
    }

    def highlight_metrics(s):
        # Called once per column by Styler.apply; unknown columns stay unstyled
        if s.name not in best_rule:
            return [''] * len(s)
        kind, css = best_rule[s.name]
        best_value = s.min() if kind == 'min' else s.max()
        return [css if v == best_value else '' for v in s]

    styled_df = table.style.apply(highlight_metrics)
    display(styled_df)

    # Save to Excel
    table.to_excel('Composition_rho_metrics.xlsx')
    print("\nResults saved to 'Composition_rho_metrics.xlsx'")


# Call this after evaluating all models
display_model_results(model_results)

# Download file (Colab only); only `Exception` is caught so that an interrupt
# is not silently swallowed.
try:
    files.download('Composition_rho_metrics.xlsx')
except Exception:
    pass