In [None]:
import os
from qsprpred.models import SklearnModel, OptunaOptimization, CrossValAssessor
from sklearn.ensemble import RandomForestClassifier
from qsprpred.data import QSPRDataset, RandomSplit, BootstrapSplit
from sklearn.model_selection import StratifiedShuffleSplit
from qsprpred.data.descriptors.fingerprints import MorganFP

# Ensure output directory exists
os.makedirs("../output/models", exist_ok=True)

# Read receptor name; the encoding is given explicitly so the read does not
# depend on the platform's default text encoding.
with open("../output/data/receptor.txt", "r", encoding="utf-8") as file:
    receptor = file.read().strip()

# Load dataset from the receptor-specific TSV file. The target property
# "pchembl_value_Mean" is configured as a SINGLECLASS task with threshold 6.5.
dataset = QSPRDataset.fromTableFile(
    filename=f"../output/data/{receptor}_Dataset.tsv",
    store_dir="../output/data",
    name=f"Splitting{receptor}Dataset",
    target_props=[{"name": "pchembl_value_Mean", "task": "SINGLECLASS", "th": [6.5]}],
    random_state=42,
)

# Add Morgan fingerprints
dataset.addDescriptors([MorganFP(radius=3, nBits=2048)])
# A bare expression is only echoed when it is the last statement of a notebook
# cell, so print the descriptor table shape explicitly.
print(dataset.getDescriptors().shape)

# Check a potential dataset split
def print_split(ds):
    """Print the shapes of the train and test features, then the test index.

    Args:
        ds: Object whose ``getFeatures()`` returns a ``(train, test)`` pair;
            both items must expose ``.shape`` and the test item ``.index``.
    """
    train_features, test_features = ds.getFeatures()
    for feature_table in (train_features, test_features):
        print(feature_table.shape)
    print(test_features.index)

# Sanity-check the current train/test feature tables.
# NOTE(review): nothing in this cell applies a train/test split to the dataset
# before this call, so the test set printed here may be empty.
print_split(dataset)

# Define different splits to compare
splits = {
    "random_split": RandomSplit(0.2),
    # StratifiedShuffleSplit's first positional parameter is n_splits, not the
    # test fraction, so the 20% hold-out must be passed as test_size. A single
    # stratified 80/20 split is used (helpful if the dataset is imbalanced).
    "stratified_split": StratifiedShuffleSplit(
        n_splits=1, test_size=0.2, random_state=dataset.randomState
    ),
    "bootstrap_split": BootstrapSplit(split=RandomSplit(0.2), n_bootstraps=5),
}

# Search space for hyperparameter tuning
search_space_rf = {"n_estimators": ["int", 10, 100, 250]}

# Store the best optimization score obtained with each splitting strategy
results = {}

for split_name, split_method in splits.items():
    print(f"\nTraining model using {split_name}...")

    # Define the model
    model = SklearnModel(
        base_dir=f"../output/models/{split_name}/",  # Separate directory per split
        alg=RandomForestClassifier,
        name=f"{split_name}_{receptor}_Model",
        random_state=dataset.randomState,
    )

    # Define optimizer with current split method
    optimizer = OptunaOptimization(
        param_grid=search_space_rf,
        n_trials=1,
        model_assessor=CrossValAssessor(
            scoring="roc_auc",
            split=split_method,  # Use the current split method
        ),
    )

    # Train & optimize the model
    optimizer.optimize(model, dataset)

    # Store model performance. The score is read from the optimizer rather
    # than the model: SklearnModel has no `assessor` attribute.
    # NOTE(review): assumes `bestScore` holds the best trial's value after
    # `optimize()` returns -- confirm against the installed QSPRpred version.
    results[split_name] = optimizer.bestScore
    print(f"{split_name} performance: {results[split_name]}")

# Compare results across different splits
print("\nComparison of Splitting Strategies:")
for split, score in results.items():
    print(f"{split}: {score}")


[I 2025-03-06 10:24:21,567] A new study created in memory with name: no-name-74bdbf50-859a-4553-81c9-29750a7cc134


(12440, 2048)
(0, 2048)
Index([], dtype='object', name='QSPRID')

Training model using random_split...


[I 2025-03-06 10:24:35,925] Trial 0 finished with value: 0.6471652540894981 and parameters: {'n_estimators': 44}. Best is trial 0 with value: 0.6471652540894981.


AttributeError: 'SklearnModel' object has no attribute 'assessor'