In [2]:
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import AllChem
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')


In [3]:
# Load Ames dataset from Excel
file_path = 'Ames_Mutagenicity_sorted.xlsx'
data = pd.read_excel(file_path)
data.head()


Unnamed: 0,smiles_r,toxicity,source_rank,source,data,inchikey,inchikey14,protonated_smiles_r
0,c1ccc2c(c1)cc1ccc3cccc4ccc2c1c34,1,Ames_Derived,Ames Mutagenicity Dataset,Name: BENZO[A]PYRENE | CAS: 50-32-8 | Mutage...,FMMWHPNWAFZXNH-UHFFFAOYSA-N,FMMWHPNWAFZXNH,[H]c1c([H])c([H])c2c(c1[H])c([H])c1c([H])c([H]...
1,CC1(C)COC1=O,1,Ames_Derived,Ames Mutagenicity Dataset,Name: PIVALOLACTONE | CAS: 1955-45-9 | Mutag...,ULKFLOVGORAZDI-UHFFFAOYSA-N,ULKFLOVGORAZDI,[H]C([H])([H])C1(C([H])([H])[H])C(=O)OC1([H])[H]
2,CN(C)c1ccc(Cc2ccc(N(C)C)cc2)cc1,0,Ames_Derived,Ames Mutagenicity Dataset,"Name: 4,4'-METHYLENEBIS(N,N-DIMETHYLBENZENAMI...",JNRLEMMIVRBKJE-UHFFFAOYSA-N,JNRLEMMIVRBKJE,[H]c1c([H])c(C([H])([H])c2c([H])c([H])c(N(C([H...
3,CCN(N=O)c1ccccc1,0,Ames_Derived,Ames Mutagenicity Dataset,Name: N-NITROSO-N-ETHYLANILINE | CAS: 612-64...,WXRXVZXYLBWKRG-UHFFFAOYSA-N,WXRXVZXYLBWKRG,[H]c1c([H])c([H])c(N(N=O)C([H])([H])C([H])([H]...
4,CN(C)c1ccc(C(=O)c2ccc(N(C)C)cc2)cc1,0,Ames_Derived,Ames Mutagenicity Dataset,Name: MICHLER'S KETONE | CAS: 90-94-8 | Muta...,VVBLNCFGVYUYGU-UHFFFAOYSA-N,VVBLNCFGVYUYGU,[H]c1c([H])c(N(C([H])([H])[H])C([H])([H])[H])c...


In [4]:
# Extract necessary columns
smiles = data['smiles_r']
target = data['toxicity']

# Convert target to binary (assuming 1 positive, 0 negative)
# If dataset contains -1 as negative, map appropriately
target = target.replace(-1, 0)


In [5]:
# Generate Morgan fingerprints
def mol_to_fp(smiles, radius=2, n_bits=2048):
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return None
    fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=n_bits)
    arr = np.zeros((n_bits,), dtype=int)
    from rdkit import DataStructs
    DataStructs.ConvertToNumpyArray(fp, arr)
    return arr

fps = []
for s in tqdm(smiles):
    fp_arr = mol_to_fp(s)
    if fp_arr is not None:
        fps.append(fp_arr)
    else:
        fps.append(np.zeros(2048))  # Handle invalid SMILES

X = np.array(fps)


100%|██████████| 5536/5536 [00:00<00:00, 6315.59it/s]


In [6]:
# Split data and balance classes
X_train, X_test, y_train, y_test = train_test_split(
    X, target, test_size=0.2, random_state=42, stratify=target)

# Apply SMOTE
sm = SMOTE(random_state=42)
X_train_bal, y_train_bal = sm.fit_resample(X_train, y_train)

print(f'Training set size before balancing: {len(y_train)}')
print(f'Training set size after balancing: {len(y_train_bal)}')


Training set size before balancing: 4428
Training set size after balancing: 4964


In [7]:
# Train models
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train_bal, y_train_bal)

xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
xgb.fit(X_train_bal, y_train_bal)

lgbm = LGBMClassifier(random_state=42)
lgbm.fit(X_train_bal, y_train_bal)


[LightGBM] [Info] Number of positive: 2482, number of negative: 2482
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.008549 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2228
[LightGBM] [Info] Number of data points in the train set: 4964, number of used features: 1114
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


0,1,2
,boosting_type,'gbdt'
,num_leaves,31
,max_depth,-1
,learning_rate,0.1
,n_estimators,100
,subsample_for_bin,200000
,objective,
,class_weight,
,min_split_gain,0.0
,min_child_weight,0.001


In [12]:
# generate_rdkit_descriptors_csv.py
# ✅ Adds MolWt, LogP, TPSA, HDonors, HAcceptors, RotBonds, HeavyAtoms
# ✅ Automatically handles SMILES and InChI columns
# ✅ Saves updated dataset with descriptors

import pandas as pd
import numpy as np
from tqdm import tqdm
from rdkit import Chem
from rdkit.Chem import Descriptors
from rdkit.Chem.rdMolDescriptors import CalcTPSA

# -------------------- CONFIG --------------------
INPUT_PATH = "Ames_Mutagenicity_sorted.xlsx"
OUTPUT_PATH = "Ames_Mutagenicity_with_descs_only.csv"

# -------------------- LOAD DATA --------------------
df = pd.read_excel(INPUT_PATH)
smiles_col = next((c for c in df.columns if "smile" in c.lower()), None)
inchi_col = next((c for c in df.columns if "inchi" in c.lower()), None)

if smiles_col is None:
    raise RuntimeError("❌ Could not find a SMILES column in your Excel file!")

print(f"✅ Detected columns -> SMILES: {smiles_col}, InChI: {inchi_col}")

# -------------------- SAFE MOLECULE PARSING --------------------
def mol_from_any(row):
    mol = None
    try:
        mol = Chem.MolFromSmiles(str(row[smiles_col]))
    except:
        pass
    if mol is None and inchi_col and pd.notna(row[inchi_col]):
        try:
            mol = Chem.MolFromInchi(str(row[inchi_col]))
        except:
            pass
    return mol

# -------------------- DESCRIPTOR CALCULATION --------------------
def compute_rdkit_descs(mol):
    """Compute main RDKit physicochemical descriptors."""
    return {
        "MolWt": Descriptors.ExactMolWt(mol),
        "MolLogP": Descriptors.MolLogP(mol),
        "TPSA": CalcTPSA(mol),
        "NumHDonors": Descriptors.NumHDonors(mol),
        "NumHAcceptors": Descriptors.NumHAcceptors(mol),
        "NumRotatableBonds": Descriptors.NumRotatableBonds(mol),
        "HeavyAtomCount": mol.GetNumHeavyAtoms(),
    }

desc_rows = []
valid_mask = []

for _, row in tqdm(df.iterrows(), total=len(df), desc="Computing descriptors"):
    mol = mol_from_any(row)
    if mol:
        desc_rows.append(compute_rdkit_descs(mol))
        valid_mask.append(1)
    else:
        desc_rows.append({
            "MolWt": np.nan, "MolLogP": np.nan, "TPSA": np.nan,
            "NumHDonors": np.nan, "NumHAcceptors": np.nan,
            "NumRotatableBonds": np.nan, "HeavyAtomCount": np.nan
        })
        valid_mask.append(0)

# -------------------- COMBINE + SAVE --------------------
desc_df = pd.DataFrame(desc_rows)
df["Parsed_OK"] = valid_mask
final_df = pd.concat([df.reset_index(drop=True), desc_df.reset_index(drop=True)], axis=1)

final_df.to_csv(OUTPUT_PATH, index=False)
print(f"\n✅ Descriptors computed and file saved → {OUTPUT_PATH}")
print(f"✅ Total molecules parsed successfully: {sum(valid_mask)}/{len(df)}")


✅ Detected columns -> SMILES: smiles_r, InChI: inchikey


Computing descriptors: 100%|██████████| 5536/5536 [00:02<00:00, 2174.65it/s]


✅ Descriptors computed and file saved → Ames_Mutagenicity_with_descs_only.csv
✅ Total molecules parsed successfully: 5536/5536





In [9]:
# improved_toxicity_pipeline_v1.py
# ✅ Uses SMILES + InChI + RDKit descriptors + fingerprints + SMOTE + Optuna-tuned stacking ensemble

import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import joblib
from tqdm import tqdm

from rdkit import Chem
from rdkit.Chem import Descriptors, AllChem
from rdkit.Chem.rdMolDescriptors import CalcTPSA
from rdkit import DataStructs

from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, classification_report
from sklearn.preprocessing import RobustScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.utils.class_weight import compute_class_weight
from imblearn.over_sampling import SMOTE

import lightgbm as lgb
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
import optuna

# -------------------- CONFIG --------------------
FILE_PATH = "Ames_Mutagenicity_sorted.xlsx"
RANDOM_STATE = 42
FP_SIZE = 2048
RADIUS = 2
N_SPLITS = 5
OPTUNA_TRIALS = 40

# -------------------- LOAD DATA --------------------
data = pd.read_excel(FILE_PATH)
print(f"✅ Loaded {len(data)} samples")

# Detect SMILES and label
smiles_col = next((c for c in data.columns if "smile" in c.lower()), None)
label_col = next((c for c in data.columns if any(x in c.lower() for x in ["tox", "label", "class", "mutagenicity"])), None)
inchi_col = next((c for c in data.columns if "inchi" in c.lower()), None)

if smiles_col is None or label_col is None:
    raise RuntimeError("❌ Could not find SMILES or toxicity label column. Please rename appropriately.")

print(f"Using columns: SMILES={smiles_col}, LABEL={label_col}, InChI={inchi_col}")

# Convert label
data[label_col] = data[label_col].replace(-1, 0).astype(int)

# -------------------- RDKit helpers --------------------
def mol_from_smiles(s):
    try:
        return Chem.MolFromSmiles(str(s))
    except:
        return None

def mol_from_inchi(i):
    try:
        return Chem.MolFromInchi(str(i))
    except:
        return None

def compute_rdkit_descs(mol):
    """Compute core molecular descriptors."""
    return [
        Descriptors.ExactMolWt(mol),
        Descriptors.MolLogP(mol),
        CalcTPSA(mol),
        Descriptors.NumHDonors(mol),
        Descriptors.NumHAcceptors(mol),
        Descriptors.NumRotatableBonds(mol)
    ]

def mol_to_fp(mol, radius=RADIUS, n_bits=FP_SIZE):
    fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=n_bits)
    arr = np.zeros((n_bits,), dtype=int)
    DataStructs.ConvertToNumpyArray(fp, arr)
    return arr

# -------------------- Generate features --------------------
fps, descs, valid_idx = [], [], []

for i, row in tqdm(data.iterrows(), total=len(data), desc="Generating descriptors"):
    mol = mol_from_smiles(row[smiles_col])
    if mol is None and inchi_col and pd.notna(row[inchi_col]):
        mol = mol_from_inchi(row[inchi_col])
    if mol is None:
        continue
    descs.append(compute_rdkit_descs(mol))
    fps.append(mol_to_fp(mol))
    valid_idx.append(i)

df = data.iloc[valid_idx].reset_index(drop=True)
X_fp = np.vstack(fps)
X_desc = np.array(descs, dtype=float)
y = df[label_col].values

# Descriptor scaling
scaler = RobustScaler()
X_desc_scaled = scaler.fit_transform(X_desc)
X = np.hstack([X_fp, X_desc_scaled])

print(f"✅ Features: {X.shape} | Valid molecules: {len(valid_idx)} | Pos/Neg: {np.bincount(y)}")

# -------------------- Split + SMOTE --------------------
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=RANDOM_STATE)
sm = SMOTE(random_state=RANDOM_STATE)
X_train_bal, y_train_bal = sm.fit_resample(X_train, y_train)

print(f"Balanced training size: {len(y_train_bal)}")

# -------------------- Base Models --------------------
rf = RandomForestClassifier(
    n_estimators=600, max_depth=14, random_state=RANDOM_STATE, n_jobs=-1, class_weight='balanced_subsample'
)
xgb = XGBClassifier(
    n_estimators=800, learning_rate=0.03, max_depth=8, subsample=0.9, colsample_bytree=0.8,
    reg_lambda=1.0, random_state=RANDOM_STATE, eval_metric="logloss", tree_method="hist"
)
lgbm = lgb.LGBMClassifier(
    n_estimators=900, learning_rate=0.03, num_leaves=64, subsample=0.9, colsample_bytree=0.8,
    random_state=RANDOM_STATE, class_weight="balanced"
)
cat = CatBoostClassifier(
    iterations=900, learning_rate=0.03, depth=8, random_seed=RANDOM_STATE,
    l2_leaf_reg=5, verbose=0, class_weights=[1, 1]
)

# -------------------- Generate OOF for Stacking --------------------
cv = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_STATE)
oof_preds = np.zeros((X_train_bal.shape[0], 4))
test_preds = np.zeros((X_test.shape[0], 4))

for i, (name, est) in enumerate([("rf", rf), ("xgb", xgb), ("lgb", lgbm), ("cat", cat)]):
    print(f"Training base model: {name}")
    oof = np.zeros(X_train_bal.shape[0])
    test_fold = np.zeros((X_test.shape[0], N_SPLITS))
    for fold, (tr_idx, val_idx) in enumerate(cv.split(X_train_bal, y_train_bal)):
        X_tr, X_val = X_train_bal[tr_idx], X_train_bal[val_idx]
        y_tr, y_val = y_train_bal[tr_idx], y_train_bal[val_idx]
        e = est.__class__(**est.get_params())
        e.fit(X_tr, y_tr)
        oof[val_idx] = e.predict_proba(X_val)[:, 1]
        test_fold[:, fold] = e.predict_proba(X_test)[:, 1]
    oof_preds[:, i] = oof
    test_preds[:, i] = test_fold.mean(axis=1)

# -------------------- Optuna-tuned Meta Model --------------------
def objective(trial):
    params = {
        "C": trial.suggest_float("C", 0.001, 10.0, log=True),
        "solver": trial.suggest_categorical("solver", ["liblinear", "lbfgs"]),
    }
    meta = LogisticRegression(max_iter=2000, class_weight="balanced", random_state=RANDOM_STATE, **params)
    f1s = []
    for tr_idx, val_idx in cv.split(oof_preds, y_train_bal):
        meta.fit(oof_preds[tr_idx], y_train_bal[tr_idx])
        preds = (meta.predict_proba(oof_preds[val_idx])[:, 1] >= 0.5).astype(int)
        f1s.append(f1_score(y_train_bal[val_idx], preds))
    return np.mean(f1s)

study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=OPTUNA_TRIALS)
best_params = study.best_params
print("Best Meta Params:", best_params)

meta = LogisticRegression(max_iter=2000, class_weight="balanced", random_state=RANDOM_STATE, **best_params)
meta.fit(oof_preds, y_train_bal)
meta_preds = meta.predict_proba(test_preds)[:, 1]

# -------------------- Threshold Optimization --------------------
from sklearn.metrics import precision_recall_curve
p, r, th = precision_recall_curve(y_test, meta_preds)
f1s = 2 * p * r / (p + r + 1e-12)
best_thr = float(th[np.nanargmax(f1s)]) if len(th) > 0 else 0.5
y_pred = (meta_preds >= best_thr).astype(int)

# -------------------- Results --------------------
print("\n📊 FINAL PERFORMANCE")
print("ROC-AUC:", roc_auc_score(y_test, meta_preds))
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))
print("F1 Score:", f1_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

# -------------------- Save Model --------------------
artifact = {
    "rf": rf, "xgb": xgb, "lgb": lgbm, "cat": cat,
    "meta": meta, "scaler": scaler, "threshold": best_thr,
    "features": ["SMILES", "InChI", "MorganFP", "Descriptors"]
}
joblib.dump(artifact, "best_toxicity_ensemble_v1.pkl")
print("\n✅ Model saved → best_toxicity_ensemble_v1.pkl")


✅ Loaded 5536 samples
Using columns: SMILES=smiles_r, LABEL=toxicity, InChI=inchikey


Generating descriptors: 100%|██████████| 5536/5536 [00:03<00:00, 1742.37it/s]


✅ Features: (5536, 2054) | Valid molecules: 5536 | Pos/Neg: [2433 3103]
Balanced training size: 4964
Training base model: rf
Training base model: xgb
Training base model: lgb
[LightGBM] [Info] Number of positive: 1985, number of negative: 1986
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.009517 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5492
[LightGBM] [Info] Number of data points in the train set: 3971, number of used features: 1077
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000
[LightGBM] [Info] Number of positive: 1985, number of negative: 1986
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.008007 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5269
[LightGBM] [Info] Number of data points in the train set: 3971, numbe

[I 2025-10-24 15:22:26,825] A new study created in memory with name: no-name-1dd886d7-7658-43ae-990b-6b37dd79bc3a
[I 2025-10-24 15:22:26,860] Trial 0 finished with value: 0.8510945823362028 and parameters: {'C': 0.27688174445477237, 'solver': 'lbfgs'}. Best is trial 0 with value: 0.8510945823362028.
[I 2025-10-24 15:22:26,882] Trial 1 finished with value: 0.8521199153510761 and parameters: {'C': 2.4160111914098206, 'solver': 'lbfgs'}. Best is trial 1 with value: 0.8521199153510761.
[I 2025-10-24 15:22:26,899] Trial 2 finished with value: 0.8522822787352261 and parameters: {'C': 5.926716420472112, 'solver': 'liblinear'}. Best is trial 2 with value: 0.8522822787352261.
[I 2025-10-24 15:22:26,916] Trial 3 finished with value: 0.8499503247156579 and parameters: {'C': 0.002512178581230851, 'solver': 'lbfgs'}. Best is trial 2 with value: 0.8522822787352261.
[I 2025-10-24 15:22:26,932] Trial 4 finished with value: 0.8519723382244054 and parameters: {'C': 2.220306441008762, 'solver': 'liblinea

Best Meta Params: {'C': 4.791748371055561, 'solver': 'liblinear'}

📊 FINAL PERFORMANCE
ROC-AUC: 0.9158210080449167
Accuracy: 0.8384476534296029
Precision: 0.8318318318318318
Recall: 0.8921095008051529
F1 Score: 0.8609168609168609
              precision    recall  f1-score   support

           0       0.85      0.77      0.81       487
           1       0.83      0.89      0.86       621

    accuracy                           0.84      1108
   macro avg       0.84      0.83      0.83      1108
weighted avg       0.84      0.84      0.84      1108


✅ Model saved → best_toxicity_ensemble_v1.pkl


In [11]:
# -------------------- CONFIG --------------------
FILE_PATH = "Ames_Mutagenicity_sorted.xlsx"
RADIUS = 2
N_BITS = 2048
RANDOM_STATE = 42

# -------------------- LOAD DATA --------------------
data = pd.read_excel(FILE_PATH)
smiles_col = next((c for c in data.columns if "smile" in c.lower()), None)
label_col = next((c for c in data.columns if any(x in c.lower() for x in ["tox", "label", "mutagenicity"])), None)
inchi_col = next((c for c in data.columns if "inchi" in c.lower()), None)

if smiles_col is None or label_col is None:
    raise RuntimeError("❌ Could not detect SMILES or toxicity label column.")

print(f"✅ Using columns: SMILES={smiles_col}, LABEL={label_col}, InChI={inchi_col}")

# Ensure labels are binary (0/1)
data[label_col] = data[label_col].replace(-1, 0).astype(int)

# -------------------- FEATURE EXTRACTION --------------------
from rdkit.Chem import Descriptors
from rdkit.Chem.rdMolDescriptors import CalcTPSA
from rdkit import DataStructs

def mol_from_any(row):
    mol = Chem.MolFromSmiles(str(row[smiles_col]))
    if mol is None and inchi_col and pd.notna(row[inchi_col]):
        mol = Chem.MolFromInchi(str(row[inchi_col]))
    return mol

def mol_to_fp(mol, radius=RADIUS, n_bits=N_BITS):
    fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=n_bits)
    arr = np.zeros((n_bits,), dtype=int)
    DataStructs.ConvertToNumpyArray(fp, arr)
    return arr

def compute_descriptors(mol):
    return [
        Descriptors.ExactMolWt(mol),
        Descriptors.MolLogP(mol),
        CalcTPSA(mol),
        Descriptors.NumHDonors(mol),
        Descriptors.NumHAcceptors(mol),
        Descriptors.NumRotatableBonds(mol)
    ]

fps, descs, valid_idx = [], [], []
for i, row in tqdm(data.iterrows(), total=len(data), desc="Generating fingerprints"):
    mol = mol_from_any(row)
    if mol is None:
        continue
    fps.append(mol_to_fp(mol))
    descs.append(compute_descriptors(mol))
    valid_idx.append(i)

df = data.iloc[valid_idx].reset_index(drop=True)
X_fp = np.vstack(fps)
X_desc = np.array(descs, dtype=float)
y = df[label_col].values

print(f"✅ Molecules parsed: {len(valid_idx)} | Feature shape: {X_fp.shape}")

# Combine FP + descriptors
X = np.hstack([X_fp, X_desc])

# -------------------- SPLIT + BALANCE --------------------
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=RANDOM_STATE)
sm = SMOTE(random_state=RANDOM_STATE)
X_train_bal, y_train_bal = sm.fit_resample(X_train, y_train)
print(f"Training size after SMOTE: {len(y_train_bal)}")

# -------------------- MODELS --------------------
rf = RandomForestClassifier(
    n_estimators=600, max_depth=14, random_state=RANDOM_STATE, class_weight="balanced_subsample", n_jobs=-1
)
xgb = XGBClassifier(
    n_estimators=800, learning_rate=0.03, max_depth=8, subsample=0.9, colsample_bytree=0.8,
    eval_metric="logloss", random_state=RANDOM_STATE, tree_method="hist"
)
lgbm = LGBMClassifier(
    n_estimators=800, learning_rate=0.03, num_leaves=64, subsample=0.9, colsample_bytree=0.8,
    random_state=RANDOM_STATE, class_weight="balanced"
)

# -------------------- TRAIN --------------------
rf.fit(X_train_bal, y_train_bal)
xgb.fit(X_train_bal, y_train_bal)
lgbm.fit(X_train_bal, y_train_bal)

# -------------------- EVALUATION --------------------
def evaluate(model, X_test, y_test, name):
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred)
    rec = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    print(f"{name}:  Accuracy={acc:.3f} | Precision={prec:.3f} | Recall={rec:.3f} | F1={f1:.3f}")
    return f1

print("\n📊 Model Performance Summary:")
f1_rf = evaluate(rf, X_test, y_test, "Random Forest")
f1_xgb = evaluate(xgb, X_test, y_test, "XGBoost")
f1_lgbm = evaluate(lgbm, X_test, y_test, "LightGBM")

# -------------------- ENSEMBLE PREDICTION --------------------
rf_probs = rf.predict_proba(X_test)[:, 1]
xgb_probs = xgb.predict_proba(X_test)[:, 1]
lgb_probs = lgbm.predict_proba(X_test)[:, 1]
avg_probs = (rf_probs + xgb_probs + lgb_probs) / 3
y_ens = (avg_probs >= 0.5).astype(int)

acc = accuracy_score(y_test, y_ens)
prec = precision_score(y_test, y_ens)
rec = recall_score(y_test, y_ens)
f1 = f1_score(y_test, y_ens)
print("\n🧠 Ensemble Model:")
print(f"Accuracy={acc:.3f} | Precision={prec:.3f} | Recall={rec:.3f} | F1={f1:.3f}")


✅ Using columns: SMILES=smiles_r, LABEL=toxicity, InChI=inchikey


Generating fingerprints: 100%|██████████| 5536/5536 [00:02<00:00, 1876.11it/s]


✅ Molecules parsed: 5536 | Feature shape: (5536, 2048)
Training size after SMOTE: 4964
[LightGBM] [Info] Number of positive: 2482, number of negative: 2482
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.009891 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10070
[LightGBM] [Info] Number of data points in the train set: 4964, number of used features: 1326
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000

📊 Model Performance Summary:
Random Forest:  Accuracy=0.785 | Precision=0.868 | Recall=0.728 | F1=0.792
XGBoost:  Accuracy=0.831 | Precision=0.864 | Recall=0.829 | F1=0.846
LightGBM:  Accuracy=0.846 | Precision=0.868 | Recall=0.855 | F1=0.861

🧠 Ensemble Model:
Accuracy=0.842 | Precision=0.874 | Recall=0.839 | F1=0.856
