<a href="https://colab.research.google.com/github/MZiaAfzal71/Average_Weighted_Path_Vector/blob/main/Data%20Files/Data%20Statistics/XGB_SHAP_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Fetch the project repository, then make its "Data Files" folder the working
# directory; the cells below use paths relative to that folder
# (e.g. "Descriptors Data", "XGBoost Results").
!git clone https://github.com/MZiaAfzal71/Average_Weighted_Path_Vector.git
%cd Average_Weighted_Path_Vector/Data\ Files

In [None]:
!pip install osfclient
import shutil
from osfclient.api import OSF
from subprocess import run
import os

# Replace with your OSF project ID
project_id = "p5ga2"   # e.g. from https://osf.io/abcd3/
osf = OSF()
project = osf.project(project_id)
store = project.storage("osfstorage")

# Keep only the storage folder whose path is "Descriptors Data"; stop at the
# first match so the remaining folders are not enumerated.
desc_folder = []
for fold in store.folders:
    if fold.path.strip("/") == "Descriptors Data":
        desc_folder.append(fold)
        break


# Download all files and keep folder structure
for folder in desc_folder:
    for f in folder.files:
        local_path = f.path.strip("/")            # keep folders
        local_dir = os.path.dirname(local_path)   # extract dir
        if local_dir:
            # exist_ok=True already tolerates an existing directory
            os.makedirs(local_dir, exist_ok=True)
        with open(local_path, "wb") as out:
            f.write_to(out)

        if not local_path.endswith(".zip"):
            continue

        # Pass the arguments as a list (no shell): file names come from the
        # remote storage, and a quote or other shell metacharacter in a name
        # would otherwise break or alter the command.
        # A file stored at the top level has no directory part, so extract
        # into the current directory instead of passing an empty "-d" target.
        result = run(["unzip", local_path, "-d", local_dir or "."])
        if result.returncode == 0:
            print(f"\nUnzipped {local_path} -> {local_dir}")
        else:
            # Report the failure instead of claiming success.
            print(f"\nunzip exited with code {result.returncode} for {local_path}")

In [None]:
import os
import pandas as pd
import shap
import matplotlib.pyplot as plt
from xgboost import XGBRegressor
from sklearn.metrics import r2_score, mean_absolute_error, root_mean_squared_error

# === Paths & Settings ===
# Inputs are read from "<input_dir>/<prop>_pwav.parquet"; the SHAP figures are
# written to "<output_dir>/SHAP_Analysis".
input_dir = "Descriptors Data"
output_dir = "XGBoost Results"
shap_dir = os.path.join(output_dir, "SHAP_Analysis")
os.makedirs(output_dir, exist_ok=True)
os.makedirs(shap_dir, exist_ok=True)

property_sheets = ["Log VP", "MP", "BP", "LogBCF", "LogS", "LogP"]

# === Main Loop ===
# For every property: fit an XGBoost regressor on the rows marked "Training",
# explain its predictions on the rows marked "Test" with SHAP, and save a bar
# plot and a beeswarm plot of the SHAP values.
for prop in property_sheets:
    # Input file
    input_file = os.path.join(input_dir, f"{prop}_pwav.parquet")
    print(f"\n🔹 Processing: {input_file}")

    # Read data
    df = pd.read_parquet(input_file)
    prop_pred = f"{prop}-Measured"

    # Feature matrix & target: every column from the 10th onwards is used as
    # a feature; the target is the "<prop>-Measured" column.
    X = df.iloc[:, 9:]
    y = df[prop_pred]

    # Train/test split.
    # Select rows with boolean masks instead of feeding index *labels* to the
    # positional .iloc indexer: labels and positions only coincide for a
    # default RangeIndex, so the mask form stays correct for any index the
    # parquet file may carry.
    is_train = df["Training/Test"] == "Training"
    is_test = df["Training/Test"] == "Test"

    X_train, X_valid = X.loc[is_train], X.loc[is_test]
    y_train, y_valid = y.loc[is_train], y.loc[is_test]

    # Define model (fixed seed for reproducible fits)
    model = XGBRegressor(random_state=42)
    model.fit(X_train, y_train)

    # === SHAP Analysis ===
    print("🔍 Running SHAP analysis...")
    explainer = shap.TreeExplainer(model)
    shap_values = explainer(X_valid)

    # Summary plot (bar chart: mean |SHAP| per feature)
    # show=False keeps the figure open so it can be titled and saved.
    shap.summary_plot(shap_values, X_valid, plot_type="bar", show=False)
    plt.title(f"SHAP Feature Importance - {prop} pwav")
    plt.tight_layout()
    shap_file_bar = os.path.join(shap_dir, f"{prop}_pwav_shap_bar.png")
    plt.savefig(shap_file_bar, dpi=300)
    plt.close()

    # Summary plot (beeswarm: detailed per-sample effects)
    shap.summary_plot(shap_values, X_valid, show=False)
    plt.title(f"SHAP Beeswarm - {prop} pwav")
    plt.tight_layout()
    shap_file_bee = os.path.join(shap_dir, f"{prop}_pwav_shap_beeswarm.png")
    plt.savefig(shap_file_bee, dpi=300)
    plt.close()

    print(f"📈 SHAP plots saved:\n   ➤ {shap_file_bar}\n   ➤ {shap_file_bee}")

print("\n🎉 All files processed & SHAP analysis completed!")


In [None]:
import os
import pandas as pd
import shap
import matplotlib.pyplot as plt
import seaborn as sns
from xgboost import XGBRegressor
from matplotlib.colors import PowerNorm

# === Paths ===
# Descriptor files are read from input_dir; every artefact of this cell is
# written below shap_dir.
input_dir, output_dir = "Descriptors Data", "XGBoost Results"
shap_dir = os.path.join(output_dir, "SHAP_Analysis")
# makedirs creates missing parent directories as well, so output_dir is
# created here too when it does not exist yet.
os.makedirs(shap_dir, exist_ok=True)

# Properties to analyse; each one selects a "<prop>_pwav.parquet" input file.
property_sheets = [
    "Log VP",
    "MP",
    "BP",
    "LogBCF",
    "LogS",
    "LogP",
]

# === Helper: compute SHAP importances for one property ===
def compute_shap_importances(prop, input_dir, topN=20):
    """Fit an XGBRegressor for one property and rank its features by mean |SHAP|.

    Parameters
    ----------
    prop : str
        Property name. Selects the input file ``<prop>_pwav.parquet`` inside
        ``input_dir`` and the target column ``<prop>-Measured``.
    input_dir : str
        Directory that holds the parquet files.
    topN : int, default 20
        Number of highest-ranked features to return separately.

    Returns
    -------
    tuple of (pandas.Series, pandas.Series)
        The first series is named ``prop``, is indexed by feature column name
        and holds the mean absolute SHAP value over the rows marked "Test".
        The second series contains its ``topN`` largest entries.
    """
    table = pd.read_parquet(os.path.join(input_dir, f"{prop}_pwav.parquet"))

    # Every column from the 10th onwards is used as a feature.
    features = table.iloc[:, 9:]
    target = table[f"{prop}-Measured"]

    # The "Training/Test" column assigns each row to one side of the split.
    split = table["Training/Test"]
    train_rows = table.loc[split == "Training"].index
    test_rows = table.loc[split == "Test"].index
    held_out = features.loc[test_rows]

    # Fixed seed so repeated runs give the same model.
    regressor = XGBRegressor(random_state=42)
    regressor.fit(features.loc[train_rows], target.loc[train_rows])

    # Explain the predictions on the held-out rows only.
    explanation = shap.TreeExplainer(regressor)(held_out)

    importance = pd.Series(
        explanation.abs.mean(0).values,
        index=held_out.columns,
        name=prop,
    )
    return importance, importance.nlargest(topN)

# === Main analysis ===
# Three views of the per-property results, all keyed by property name:
#   all_shap_importances -> mean |SHAP| of every feature
#   heatmap_data         -> the top-20 features with their values
#   top_features_sets    -> the names of those top-20 features
all_shap_importances, top_features_sets, heatmap_data = {}, {}, {}

for prop in property_sheets:
    print(f"\n🔹 Processing {prop}...")
    mean_shap, top_features = compute_shap_importances(prop, input_dir, topN=20)
    all_shap_importances[prop] = mean_shap
    heatmap_data[prop] = top_features
    top_features_sets[prop] = set(top_features.index)

# === Save combined SHAP importances ===
# One column per property (rows are feature names) plus the row-wise mean
# over the property columns as "Global_Avg".
shap_df = pd.concat(all_shap_importances, axis=1).assign(
    Global_Avg=lambda frame: frame.mean(axis=1)
)

combined_file = os.path.join(shap_dir, "PWAV_SHAP_Combined.xlsx")
shap_df.to_excel(combined_file)
print(f"\n✅ Combined SHAP importance saved to: {combined_file}")

# === Pairwise Jaccard similarity of top-N sets ===
props = list(top_features_sets.keys())
jaccard_matrix = pd.DataFrame(index=props, columns=props, dtype=float)

# Features are compared by the part of their name after the first "_"
# (presumably the leading part is a per-property prefix -- confirm against
# the descriptor files). The stripped sets are built once per property rather
# than inside the pair loop. Taking [-1] keeps a name without "_" as it is
# instead of raising IndexError.
stripped_sets = {
    p: {name.split("_", 1)[-1] for name in top_features_sets[p]}
    for p in props
}

for p1 in props:
    for p2 in props:
        inter = len(stripped_sets[p1] & stripped_sets[p2])
        union = len(stripped_sets[p1] | stripped_sets[p2])
        # Two empty sets have no defined Jaccard index; leave the cell as NaN
        # rather than dividing by zero.
        jaccard_matrix.loc[p1, p2] = inter / union if union else float("nan")

jaccard_file = os.path.join(shap_dir, "PWAV_SHAP_Jaccard_Heatmap.png")

plt.figure(figsize=(8, 6))
sns.heatmap(jaccard_matrix, annot=True, fmt=".2f", cmap="coolwarm")
plt.title("Jaccard Similarity of Top-20 PWAV Features Across Properties")
plt.tight_layout()
plt.savefig(jaccard_file, dpi=300)
plt.close()
print(f"📈 Jaccard heatmap saved: {jaccard_file}")

# === Annotated heatmaps (2 rows × 3 columns) ===
# One single-column heatmap per property; every cell is labelled with
# "<feature> = <mean |SHAP|>".
fig, axes = plt.subplots(2, 3, figsize=(22, 18), sharey=True)
axes = axes.flatten()

# Shared color scaling: limits are taken over the top features of ALL
# properties so that one colorbar is valid for every panel.
all_vals = pd.concat(heatmap_data.values())
vmin, vmax = all_vals.min(), all_vals.max()

for ax, prop in zip(axes, property_sheets):
    top_series = heatmap_data[prop]
    values = top_series.to_frame()

    # The limits must live inside the norm: when a norm is supplied, seaborn
    # does not forward vmin/vmax to matplotlib, so a norm without limits would
    # autoscale to each panel's own data and the shared colorbar would not
    # describe the panels.
    sns.heatmap(
        values, cmap="tab20", cbar=False,
        norm=PowerNorm(gamma=0.5, vmin=vmin, vmax=vmax),
        linewidths=0.3, ax=ax
    )

    # Annotate each cell with feature = value
    # (cell centres sit at x = 0.5 and y = row + 0.5 in heatmap coordinates)
    for y_idx, (feature, shap_val) in enumerate(top_series.items()):
        ax.text(
            0.5, y_idx + 0.5,
            f"{feature} = {shap_val:.2f}",
            ha="center", va="center",
            color="white", fontsize=12, fontweight="bold"
        )

    ax.set_title(prop, fontsize=16)
    ax.set_axis_off()

# Shared colorbar, built from the same colormap and the same limits as the
# panels above.
cbar_ax = fig.add_axes([0.92, 0.25, 0.015, 0.5])
sm = plt.cm.ScalarMappable(cmap="tab20",
                           norm=PowerNorm(gamma=0.5, vmin=vmin, vmax=vmax))
fig.colorbar(sm, cax=cbar_ax, label="Mean |SHAP|")

plt.suptitle("Top-20 PWAV Features per Property (Annotated)", fontsize=22)
# Leave room on the right (right=0.9) for the colorbar axes added above.
fig.subplots_adjust(left=0.05, right=0.9, top=0.94, bottom=0.1,
                    wspace=0.05, hspace=0.1)

heatmap_file = os.path.join(shap_dir, "PWAV_SHAP_Heatmaps_Annotated_2rows.png")
plt.savefig(heatmap_file, dpi=300)
plt.close()
print(f"✅ Annotated heatmaps saved: {heatmap_file}")

# === Contribution Breakdown ===
# The combined workbook is read back without an index column, so the rows of
# df_combined are addressed by position in workbook order.
combined_file = os.path.join(shap_dir, "PWAV_SHAP_Combined.xlsx")
df_combined = pd.read_excel(combined_file)

# Share of the first 64 rows vs. the remaining rows, per property.
# NOTE(review): "Top64" here means the first 64 rows (workbook order) that
# have a value for the property -- the rows are not ranked by importance --
# and the sums are taken over "Global_Avg", not over the property's own
# column. Confirm that both are intended.
contrib_data = {}
for prop in property_sheets:
    rows_with_value = df_combined[prop].dropna().index.tolist()
    global_avg = df_combined["Global_Avg"]

    top64_sum = global_avg.loc[rows_with_value[:64]].sum()
    rest_sum = global_avg.loc[rows_with_value[64:]].sum()
    total = top64_sum + rest_sum

    contrib_data[prop] = {
        "Top64": 100 * top64_sum / total,
        "Others": 100 * rest_sum / total,
    }

# Rows: properties; columns: "Top64" and "Others" (percentages).
df_contrib = pd.DataFrame(contrib_data).T

# --- Stacked bar plot ---
ax = df_contrib.plot(
    kind="bar",
    stacked=True,
    figsize=(10, 6),
    color=["#1f77b4", "#ff7f0e"],
    edgecolor="black",
)
plt.ylabel("Contribution (%)")
plt.title("PWAV SHAP Contributions: Top-64 vs Others")
plt.legend(title="Category")

# Write each percentage in the middle of its bar segment.
for c_idx, col in enumerate(df_contrib.columns):
    # Height at which this column's segments start: the sum of the columns
    # stacked below it.
    segment_bottoms = df_contrib.iloc[:, :c_idx].sum(axis=1)
    for i, (val, bottom) in enumerate(zip(df_contrib[col], segment_bottoms)):
        if not val > 1:
            continue  # segment too small to hold a label
        ax.text(
            i, bottom + val/2, f"{val:.1f}%",
            ha="center", va="center",
            fontsize=10, color="white", fontweight="bold",
            rotation=0,
        )

plt.xticks(rotation=0)
stacked_file = os.path.join(shap_dir, "PWAV_SHAP_StackedBar.png")
plt.savefig(stacked_file, dpi=300, bbox_inches="tight")
plt.close()
print(f"📈 SHAP Stacked Contributions: Top 64 vs Others is saved: {stacked_file}")
