In [7]:
from imblearn.over_sampling import SMOTE
import pandas as pd
from xgboost import XGBClassifier
from tdc import Evaluator
import joblib
from sklearn.metrics import classification_report

In [8]:
# Load the three pre-featurized splits (named tox21_* on disk) and prepare
# the model inputs.
def _load_complete_rows(csv_path):
    """Read one featurized split from ``csv_path`` and drop every row that contains a NaN."""
    return pd.read_csv(csv_path).dropna()


def _features_and_labels(frame):
    """Return ``(X, y)``: the columns matching ``^feature.*`` and the ``Y`` column."""
    return frame.filter(regex='^feature.*'), frame['Y']


train_data = _load_complete_rows('../../data/ECE/tox21_train_featurized.csv')
valid_data = _load_complete_rows('../../data/ECE/tox21_valid_featurized.csv')
test_data = _load_complete_rows('../../data/ECE/tox21_test_featurized.csv')

# Feature matrix / label vector for each split.
X_train, y_train = _features_and_labels(train_data)
X_valid, y_valid = _features_and_labels(valid_data)
X_test, y_test = _features_and_labels(test_data)

# Oversample with SMOTE on the training split only; the validation and test
# splits are left exactly as loaded.
smote = SMOTE(random_state=42)
X_res, y_res = smote.fit_resample(X_train, y_train)

In [None]:
# Train the XGBoost classifier on the SMOTE-resampled training data.
#
# NOTE(review): X_res / y_res come out of SMOTE.fit_resample, so the training
# set is presumably already class-balanced (SMOTE default strategy — confirm).
# scale_pos_weight=10 then up-weights the positive class a second time; verify
# that this double compensation is intentional (e.g. to favour recall).
xgb_params = dict(
    n_estimators=300,
    max_depth=7,             # balance model complexity
    learning_rate=0.05,      # slower learning
    gamma=0.5,               # regularization
    subsample=0.8,           # row subsampling to reduce overfitting
    colsample_bytree=0.8,
    scale_pos_weight=10,     # extra weight on the positive class (see note above)
    random_state=42,
)
model = XGBClassifier(**xgb_params)

# The validation split is supplied only as an evaluation set; no early
# stopping is configured in this call.
model.fit(X_res, y_res, eval_set=[(X_valid, y_valid)], verbose=True)

In [10]:
# Hard class predictions for the test split.
y_test_pred = model.predict(X_test)

# Build the text report comparing test labels with the predictions, then show it.
test_report = classification_report(y_test, y_test_pred)
print(test_report)

              precision    recall  f1-score   support

         0.0       0.95      0.90      0.92       981
         1.0       0.57      0.73      0.64       181

    accuracy                           0.87      1162
   macro avg       0.76      0.82      0.78      1162
weighted avg       0.89      0.87      0.88      1162



In [11]:
from typing import Dict, Any
def evaluate_model(y_true, y_pred_proba, threshold: float = 0.5) -> Dict[str, float]:
    """Score predictions with a fixed set of TDC ``Evaluator`` metrics.

    The metrics are evaluated, printed and stored in this order:
    ROC-AUC, PR-AUC, Accuracy, Precision, Recall, F1.

    Args:
        y_true: Ground-truth labels, passed through to each evaluator unchanged.
        y_pred_proba: Predicted scores, passed through to each evaluator unchanged.
        threshold: Forwarded as the ``threshold`` keyword to the Accuracy,
            Precision, Recall and F1 evaluators only. ROC-AUC and PR-AUC are
            called without it.

    Returns:
        A dict mapping each metric name to the value its evaluator returned.
        Each score is also printed as ``"<name>: <score to 4 decimals>"``.
    """
    # Metrics called without a threshold.
    ranking_metrics = ('ROC-AUC', 'PR-AUC')
    # Metrics that receive the threshold keyword.
    thresholded_metrics = ('Accuracy', 'Precision', 'Recall', 'F1')

    plan = [(label, {}) for label in ranking_metrics]
    plan += [(label, {'threshold': threshold}) for label in thresholded_metrics]

    results = {}
    for label, extra_kwargs in plan:
        score = Evaluator(name=label)(y_true, y_pred_proba, **extra_kwargs)
        results[label] = score
        print(f"{label}: {score:.4f}")

    return results

# Ground truth for the test split.
y_true = y_test

# Column 1 of predict_proba is used as the positive-class score
# (assumes the classifier's classes are ordered [0, 1] — confirm).
y_pred_proba = model.predict_proba(X_test)[:, 1]

# Compute and print the metric suite at the default 0.5 threshold.
evaluation_results = evaluate_model(y_true=y_true, y_pred_proba=y_pred_proba)

ROC-AUC: 0.8897
PR-AUC: 0.6754
Accuracy: 0.8718
Precision: 0.5684
Recall: 0.7348
F1: 0.6410


In [10]:
#Save model using joblib
import joblib

# The file is written relative to the notebook's working directory.
# NOTE(review): a joblib dump is a pickle of the Python object — only load it
# from trusted sources, and presumably with compatible library versions.
model_filename = 'ECE_trained_model.joblib'  # ECE = Ersilia Compound Embeddings
joblib.dump(model, model_filename)

['ECE_trained_model.joblib']