In [9]:
from imblearn.over_sampling import SMOTE
import pandas as pd
from flaml import AutoML
from tdc import Evaluator
from sklearn.metrics import classification_report
import joblib

In [2]:
#Get data
columns_to_drop = ['Drug_ID', 'Drug', 'Y', 'key', 'input']
train_data = pd.read_csv('../../data/DrugTax/train_drugTax_featurized.csv').dropna()
valid_data = pd.read_csv('../../data/DrugTax/valid_drugTax_featurized.csv').dropna()
test_data = pd.read_csv('../../data/DrugTax/test_drugTax_featurized.csv').dropna()

#get splits
X_train, y_train = train_data.drop(columns=columns_to_drop).filter(regex='^(?!char_[.,=#@+\\-\\[\\(\\\\\/])'), train_data['Y']
X_test, y_test = test_data.drop(columns=columns_to_drop).filter(regex='^(?!char_[.,=#@+\\-\\[\\(\\\\\/])'), test_data['Y']
X_valid, y_valid = valid_data.drop(columns=columns_to_drop).filter(regex='^(?!char_[.,=#@+\\-\\[\\(\\\\\/])'), valid_data['Y']

#Use smote to oversample minority class
smote = SMOTE(random_state=42)
X_res, y_res = smote.fit_resample(X_train, y_train)


In [3]:
#Train model using Flaml AutoML

model_config = {
    'task' : 'classification',  # classification 
    'time_budget' : 200,    # time budget in seconds
    'metric' : 'f1', # main metric to be optimized
    'estimator_list' : ['lgbm'] ,
    'eval_method': 'cv',  
    'n_splits': 5,
}

model = AutoML()
model.fit(X_res, y_res, **model_config) 

[flaml.automl.logger: 04-06 21:06:01] {1728} INFO - task = classification
[flaml.automl.logger: 04-06 21:06:01] {1739} INFO - Evaluation method: cv
[flaml.automl.logger: 04-06 21:06:01] {1838} INFO - Minimizing error metric: 1-f1
[flaml.automl.logger: 04-06 21:06:01] {1955} INFO - List of ML learners in AutoML Run: ['lgbm']
[flaml.automl.logger: 04-06 21:06:01] {2258} INFO - iteration 0, current learner lgbm
[flaml.automl.logger: 04-06 21:06:01] {2393} INFO - Estimated sufficient time budget=1907s. Estimated necessary time budget=2s.
[flaml.automl.logger: 04-06 21:06:01] {2442} INFO -  at 0.6s,	estimator lgbm's best error=0.2905,	best estimator lgbm's best error=0.2905
[flaml.automl.logger: 04-06 21:06:01] {2258} INFO - iteration 1, current learner lgbm
[flaml.automl.logger: 04-06 21:06:01] {2442} INFO -  at 0.7s,	estimator lgbm's best error=0.2836,	best estimator lgbm's best error=0.2836
[flaml.automl.logger: 04-06 21:06:01] {2258} INFO - iteration 2, current learner lgbm
[flaml.autom

In [4]:
#view the best configuration for each ML model
model.best_config_per_estimator

{'lgbm': {'n_estimators': 57,
  'num_leaves': 198,
  'min_child_samples': 6,
  'learning_rate': 0.08779515636942332,
  'log_max_bin': 9,
  'colsample_bytree': 0.7087614338457834,
  'reg_alpha': 0.001346442339014509,
  'reg_lambda': 0.00839141933486936}}

In [6]:
# Predict test data
y_pred = model.predict(X_test)

# Display metrics
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         0.0       0.89      0.94      0.92       981
         1.0       0.54      0.40      0.46       181

    accuracy                           0.85      1162
   macro avg       0.72      0.67      0.69      1162
weighted avg       0.84      0.85      0.85      1162



In [7]:
#Evaluate model performance
from typing import Dict, Any
def evaluate_model(y_true, y_pred_proba, threshold: float = 0.5) -> Dict[str, float]:
    metrics = {
        'ROC-AUC': {'name': 'ROC-AUC', 'kwargs': {}},
        'PR-AUC': {'name': 'PR-AUC', 'kwargs': {}},
        'Accuracy': {'name': 'Accuracy', 'kwargs': {'threshold': threshold}},
        'Precision': {'name': 'Precision', 'kwargs': {'threshold': threshold}},
        'Recall': {'name': 'Recall', 'kwargs': {'threshold': threshold}},
        'F1': {'name': 'F1', 'kwargs': {'threshold': threshold}}
    }
    
    results = {}
    for metric_name, config in metrics.items():
        evaluator = Evaluator(name=config['name'])
        score = evaluator(y_true, y_pred_proba, **config['kwargs'])
        results[metric_name] = score
        print(f"{metric_name}: {score:.4f}")
    
    return results

y_proba = model.predict_proba(X_test)[:, 1]
y_true = y_test

evaluation_results = evaluate_model(y_true, y_proba)

ROC-AUC: 0.8246
PR-AUC: 0.5050
Accuracy: 0.8546
Precision: 0.5448
Recall: 0.4033
F1: 0.4635


In [10]:
#Save model

model = model
model_filename = 'Drugtax_trained_model.joblib'
joblib.dump(model, model_filename)

['Drugtax_trained_model.joblib']