In [1]:
import sys
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Project root
PROJECT_ROOT = Path("..").resolve()
sys.path.append(str(PROJECT_ROOT))

# Custom functions for evaluation/models
from src.models import get_random_forest
from src.evaluation import plot_feature_importance

# Directories
DATA_PROCESSED = PROJECT_ROOT / "data" / "processed"
RESULTS_DIR = PROJECT_ROOT / "results"
RESULTS_FIGURES = RESULTS_DIR / "figures"
RESULTS_METRICS = RESULTS_DIR / "metrics"

RESULTS_FIGURES.mkdir(parents=True, exist_ok=True)


In [2]:
# ===============================
# Load descriptors & assay labels
# ===============================
X_desc = pd.read_csv(DATA_PROCESSED / "tox21_descriptors.csv")
X_desc = X_desc.drop(columns=["smiles","mol_id"], errors="ignore")

# Replace infinite values with NaN and fill median
X_desc = X_desc.replace([np.inf, -np.inf], np.nan)
X_desc = X_desc.fillna(X_desc.median())

# Convert to NumPy for modeling
X = X_desc.values
feature_names = X_desc.columns.tolist()

# Load cleaned assay labels
tox21 = pd.read_csv(DATA_PROCESSED / "tox21_clean.csv")

ASSAY_COLUMNS = [
    'NR-AR', 'NR-AR-LBD', 'NR-AhR', 'NR-Aromatase',
    'NR-ER', 'NR-ER-LBD', 'NR-PPAR-gamma',
    'SR-ARE', 'SR-ATAD5', 'SR-HSE', 'SR-MMP', 'SR-p53'
]

y = tox21[ASSAY_COLUMNS]

print("X shape:", X.shape)
print("y shape:", y.shape)


X shape: (3074, 217)
y shape: (3074, 12)


In [3]:
from sklearn.model_selection import train_test_split

# Dictionary to save trained models and feature importances
rf_models = {}
feature_importances = {}

for target in y.columns:
    print(f"Training model for {target} (for feature importances)")

    X_train, X_test, y_train, y_test = train_test_split(
        X, y[target], test_size=0.2, stratify=y[target], random_state=42
    )

    model = get_random_forest()
    model.fit(X_train, y_train)

    # Save trained model and importances
    rf_models[target] = model
    feature_importances[target] = model.feature_importances_


Training model for NR-AR (for feature importances)
Training model for NR-AR-LBD (for feature importances)
Training model for NR-AhR (for feature importances)
Training model for NR-Aromatase (for feature importances)
Training model for NR-ER (for feature importances)
Training model for NR-ER-LBD (for feature importances)
Training model for NR-PPAR-gamma (for feature importances)
Training model for SR-ARE (for feature importances)
Training model for SR-ATAD5 (for feature importances)
Training model for SR-HSE (for feature importances)
Training model for SR-MMP (for feature importances)
Training model for SR-p53 (for feature importances)


In [4]:
TOP_K = 20

for target in y.columns:
    importances = feature_importances[target]
    sorted_idx = np.argsort(importances)[::-1][:TOP_K]

    plt.figure(figsize=(8,6))
    sns.barplot(
        x=importances[sorted_idx],
        y=np.array(feature_names)[sorted_idx],
        palette="viridis"
    )
    plt.title(f"Top {TOP_K} Feature Importances — {target}")
    plt.xlabel("Importance")
    plt.tight_layout()
    plt.savefig(RESULTS_FIGURES / f"feature_importance_{target}.png", dpi=300)
    plt.close()



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. A

In [None]:
# Save all feature importances into a single CSV for analysis
fi_list = []

for target in y.columns:
    fi_list.extend([
        {"assay": target, "feature": fname, "importance": imp}
        for fname, imp in zip(feature_names, feature_importances[target])
    ])

fi_df = pd.DataFrame(fi_list)
fi_df.to_csv(RESULTS_METRICS / "feature_importances.csv", index=False)
fi_df.head()


In [6]:
# Example: identify the top features common across NR vs SR assays
NR_assays = [col for col in y.columns if col.startswith("NR")]
SR_assays = [col for col in y.columns if col.startswith("SR")]

top_features_NR = fi_df[fi_df.assay.isin(NR_assays)].groupby("feature")["importance"].mean().sort_values(ascending=False).head(10)
top_features_SR = fi_df[fi_df.assay.isin(SR_assays)].groupby("feature")["importance"].mean().sort_values(ascending=False).head(10)

print("Top 10 important features — Nuclear Receptor assays:\n", top_features_NR)
print("\nTop 10 important features — Stress Response assays:\n", top_features_SR)


Top 10 important features — Nuclear Receptor assays:
 feature
BCUT2D_MWLOW        0.019551
BCUT2D_CHGLO        0.015377
qed                 0.015186
BCUT2D_LOGPHI       0.013215
FpDensityMorgan3    0.012698
MinPartialCharge    0.012623
Chi4n               0.011757
BCUT2D_MRHI         0.011738
MaxEStateIndex      0.011398
PEOE_VSA7           0.011397
Name: importance, dtype: float64

Top 10 important features — Stress Response assays:
 feature
VSA_EState4         0.018908
BCUT2D_MRHI         0.014261
BCUT2D_MWHI         0.013846
MolLogP             0.013559
BCUT2D_LOGPHI       0.013356
PEOE_VSA7           0.013210
qed                 0.012938
Kappa3              0.012700
MinPartialCharge    0.012418
BCUT2D_CHGLO        0.012093
Name: importance, dtype: float64
