In [2]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import AllChem
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score
import optuna
import pickle

In [4]:
# Load dataset
# Load data from CSV
df = pd.read_csv('Selected_fingerprints.csv')

### 5. Define and Optimize the Model with Optuna

X = df.drop(['IC50_label', 'smiles'], axis=1)
y = df['IC50_label']

In [6]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [8]:
from sklearn.metrics import accuracy_score

In [12]:
import optuna
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
import pickle  # Ensure pickle is imported

def objective_dt(trial):
    param = {
        'criterion': trial.suggest_categorical('criterion', ['gini', 'entropy', 'log_loss']),
        'max_depth': trial.suggest_int('max_depth', 1, 50),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 20),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 20),
        # Removed 'auto' from max_features
        'max_features': trial.suggest_categorical('max_features', ['sqrt', 'log2', None])
    }
    
    # Initialize the model with suggested hyperparameters
    model = DecisionTreeClassifier(**param, random_state=42)
    
    # Fit the model (Ensure X_train, y_train, X_test, y_test are defined)
    model.fit(X_train, y_train)
    
    # Predict and evaluate accuracy
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    
    return accuracy

# Create and optimize the study
study_dt = optuna.create_study(direction='maximize')
study_dt.optimize(objective_dt, n_trials=200)

# Retrieve the best parameters
best_params_dt = study_dt.best_params
print("Best Parameters:", best_params_dt)

# Initialize and train the best model
best_model_dt = DecisionTreeClassifier(**best_params_dt, random_state=42)
best_model_dt.fit(X_train, y_train)

# Save the model
with open('best_model_dt.pkl', 'wb') as f:
    pickle.dump(best_model_dt, f)

[I 2024-09-22 23:33:40,461] A new study created in memory with name: no-name-fee5009e-8e5e-46d1-acef-87f649487643
[I 2024-09-22 23:33:41,058] Trial 0 finished with value: 0.677502001601281 and parameters: {'criterion': 'log_loss', 'max_depth': 35, 'min_samples_split': 13, 'min_samples_leaf': 13, 'max_features': 'log2'}. Best is trial 0 with value: 0.677502001601281.
[I 2024-09-22 23:33:41,648] Trial 1 finished with value: 0.5951961569255404 and parameters: {'criterion': 'log_loss', 'max_depth': 6, 'min_samples_split': 9, 'min_samples_leaf': 12, 'max_features': 'sqrt'}. Best is trial 0 with value: 0.677502001601281.
[I 2024-09-22 23:33:42,173] Trial 2 finished with value: 0.655404323458767 and parameters: {'criterion': 'gini', 'max_depth': 28, 'min_samples_split': 19, 'min_samples_leaf': 12, 'max_features': 'log2'}. Best is trial 0 with value: 0.677502001601281.
[I 2024-09-22 23:33:42,800] Trial 3 finished with value: 0.676861489191353 and parameters: {'criterion': 'entropy', 'max_depth

Best Parameters: {'criterion': 'gini', 'max_depth': 40, 'min_samples_split': 9, 'min_samples_leaf': 1, 'max_features': None}
