In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_predict, KFold
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import MinMaxScaler
from xgboost import XGBRegressor
import matplotlib.pyplot as plt
from sklearn.inspection import PartialDependenceDisplay
import joblib
import matplotlib.pyplot as plt
import random
import matplotlib.pyplot as plt
import numpy as np
from sklearn.inspection import PartialDependenceDisplay
from matplotlib.ticker import MaxNLocator

# Seed Python's and NumPy's global generators so the random choices made
# later in this notebook (test-isotherm draw, shuffled CV folds) repeat on a
# fresh Restart & Run All.
random_seed = 42
random.seed(random_seed)
np.random.seed(random_seed)

data = pd.read_excel(r'Original_data')

# Columns at positions 2..16 are the model inputs; position 17 is the target.
feature_names = data.columns[2:17]

# data standardization: rescale every feature column to the range [-1, 1].
# NOTE(review): the scaler is fit on ALL rows, including the rows that are
# later held out as the test set, so the held-out rows influence the fitted
# min/max. Confirm this is intended before reusing scaler.pkl elsewhere.
scaler = MinMaxScaler(feature_range=(-1, 1))
# Replace the columns as a whole (data[cols] = ...) instead of writing through
# `.loc[:, cols]`. The `.loc` form sets values in place and depends on implicit
# dtype upcasting when a feature column is integer-typed, which newer pandas
# versions deprecate.
data[feature_names] = scaler.fit_transform(data[feature_names])
scaler_filename = 'scaler.pkl'
joblib.dump(scaler, scaler_filename)

# input and output
X = data[feature_names].to_numpy()
y = data.iloc[:, 17].to_numpy()

# data splitting based on Isotherm: whole isotherms are assigned to either the
# training set or the test set, so no isotherm contributes rows to both.
unique_isotherms = data['Isotherm'].unique()
if len(unique_isotherms) < 2:
    raise ValueError(
        "Need at least two distinct 'Isotherm' values to build both a "
        f"training set and a test set; found {len(unique_isotherms)}."
    )

train_indices = []
test_indices = []

# 10% for test-set, but never zero: int() truncates, so fewer than ten
# isotherms would otherwise produce an empty test set and a late failure
# during evaluation.
num_isotherms_test = max(1, int(0.1 * len(unique_isotherms)))

test_isotherms = np.random.choice(unique_isotherms, size=num_isotherms_test, replace=False)
# Not referenced again in the visible cells; kept so the name stays available.
remaining_isotherms = np.setdiff1d(unique_isotherms, test_isotherms)

for isotherm in unique_isotherms:
    # Row POSITIONS (not DataFrame index labels): X and y are plain NumPy
    # arrays, so they must be indexed positionally even if `data` ever carries
    # a non-default index.
    indices = np.flatnonzero((data['Isotherm'] == isotherm).to_numpy())

    if isotherm in test_isotherms:
        test_indices.extend(indices)
    else:
        train_indices.extend(indices)

X_test = X[test_indices]
y_test = y[test_indices]
X_train = X[train_indices]
y_train = y[train_indices]

# model construction
model = XGBRegressor(
    n_estimators=100,
    max_depth=6,
    learning_rate=0.1,
    objective='reg:squarederror',
)


def _rmse_mae_r2(y_true, y_hat):
    """Return the (RMSE, MAE, R2) triple comparing ``y_hat`` against ``y_true``."""
    return (
        np.sqrt(mean_squared_error(y_true, y_hat)),
        mean_absolute_error(y_true, y_hat),
        r2_score(y_true, y_hat),
    )


def _print_scores(header, rmse_value, mae_value, r2_value):
    """Print ``header`` followed by the three scores, each to four decimals."""
    print(header)
    print(f"RMSE: {rmse_value:.4f}")
    print(f"MAE: {mae_value:.4f}")
    print(f"R2 Score: {r2_value:.4f}")


# Out-of-fold predictions for the training rows using 5 shuffled folds.
# The "training results" printed below are computed from these
# cross-validated predictions.
kf = KFold(n_splits=5, shuffle=True)
y_pred = cross_val_predict(model, X_train, y_train, cv=kf)

# model evaluation
rmse, mae, r2 = _rmse_mae_r2(y_train, y_pred)
_print_scores("XGBoost training results:", rmse, mae, r2)

# Fit on the complete training set, then score the held-out test rows.
model.fit(X_train, y_train)

y_pred_test = model.predict(X_test)
rmse_test, mae_test, r2_test = _rmse_mae_r2(y_test, y_pred_test)
_print_scores("XGBoost testing results:", rmse_test, mae_test, r2_test)

# Save the model
model_filename = 'PWM_XGBoost.joblib'
joblib.dump(model, model_filename)

In [None]:
# Choose the feature pair for the two-factor PDP.
# Modify the two column names below to plot a different pair.
feature01_index, feature02_index = (
    feature_names.get_loc(column_label)
    for column_label in ("MW (g/mol)", "Average pore size (nm)")
)
feature01_name, feature02_name = (
    feature_names[feature01_index],
    feature_names[feature02_index],
)
# A single (index, index) tuple requests one two-way partial dependence plot.
comb_features = [(feature01_index, feature02_index)]

# Two-factors PDP plotting: font used for all text in the figure.
plt.rcParams['font.family'] = 'Arial'

def plot_partial_dependence(X, grid_resolution=500):
    """Draw and show the two-feature partial dependence plot with a colorbar.

    Uses these notebook-level names, which must exist before calling:
    ``model`` (fitted estimator), ``comb_features`` (feature index pair) and
    ``feature01_name`` / ``feature02_name`` (axis labels).

    Parameters
    ----------
    X : array-like
        Samples handed to ``PartialDependenceDisplay.from_estimator``.
    grid_resolution : int, default=500
        Number of grid points requested per feature. The grid for a feature
        pair is the product of both axes, so the work grows roughly with the
        square of this value.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    pdp_display = PartialDependenceDisplay.from_estimator(
        estimator=model,
        X=X,
        features=comb_features,
        grid_resolution=grid_resolution
    )

    pd_result = pdp_display.pd_results[0]
    feature01_grid = pd_result.grid_values[0]
    feature02_grid = pd_result.grid_values[1]

    # `average` is laid out as (output, feature01 grid, feature02 grid).
    # "ij" indexing makes the raveled coordinates follow the same order as the
    # raveled values; NumPy's default "xy" indexing would pair each value with
    # the coordinates of a different grid point.
    feature01_values, feature02_values = np.meshgrid(
        feature01_grid, feature02_grid, indexing="ij"
    )
    feature01_values = feature01_values.ravel()
    feature02_values = feature02_values.ravel()
    grid_values = pd_result.average.ravel()

    # s=0 gives the markers zero size: this scatter is only the colour-mapped
    # object that the colorbar below is built from.
    sc = plt.scatter(feature01_values, feature02_values,
                     c=grid_values, cmap='viridis', s=0)

    cbar = plt.colorbar(sc, pad=0.15, fraction=0.05)
    cbar.ax.set_ylabel("Partial Dependence", fontsize=20, labelpad=7.5)
    cbar.ax.yaxis.set_label_position('left')
    cbar.ax.yaxis.set_ticks_position('right')
    cbar.ax.tick_params(labelsize=15)

    # Thicken the frame and major ticks of the current axes.
    ax = plt.gca()
    for side in ('top', 'right', 'left', 'bottom'):
        ax.spines[side].set_linewidth(1.5)
    ax.tick_params(axis='both', which='major', width=1.5, length=10)
    ax.xaxis.set_major_locator(MaxNLocator(nbins=6))

    plt.xticks(fontsize=20, rotation=90)
    plt.yticks(fontsize=20)
    plt.xlabel(feature01_name, fontsize=20, labelpad=15)
    plt.ylabel(feature02_name, fontsize=20, labelpad=15)
    plt.legend(["Predicted logKd"], fontsize=12)
    plt.tight_layout()
    plt.show()

plot_partial_dependence(X_train)
