In [1]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import AllChem
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score
import optuna
import pickle

In [2]:
# Load dataset
# Load data from CSV
df = pd.read_csv('Selected_fingerprints.csv')

### 5. Define and Optimize the Model with Optuna

X = df.drop(['IC50_label', 'smiles'], axis=1)
y = df['IC50_label']

In [5]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [7]:
from sklearn.metrics import accuracy_score

In [9]:
### 5. Define and Optimize the Model with Optuna

### 4. *Logistic Regression*
import pickle
from sklearn.linear_model import LogisticRegression

def objective_logistic(trial):
    param = {
        'C': trial.suggest_loguniform('C', 0.01, 10),
        'solver': trial.suggest_categorical('solver', ['newton-cg', 'lbfgs', 'liblinear'])
    }
    model = LogisticRegression(**param, max_iter=500)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    return accuracy

study_logistic = optuna.create_study(direction='maximize')
study_logistic.optimize(objective_logistic, n_trials=1000)

# Retrieve the best model and save it
best_params_logistic = study_logistic.best_params
best_model_logistic = LogisticRegression(**best_params_logistic, max_iter=1000)
best_model_logistic.fit(X_train, y_train)

[I 2024-09-22 22:29:05,333] A new study created in memory with name: no-name-81149580-e5ef-4e0a-861d-60d9de74725c
  'C': trial.suggest_loguniform('C', 0.01, 10),
[I 2024-09-22 22:29:17,826] Trial 0 finished with value: 0.7641313050440353 and parameters: {'C': 0.40528965268964134, 'solver': 'lbfgs'}. Best is trial 0 with value: 0.7641313050440353.
  'C': trial.suggest_loguniform('C', 0.01, 10),
[I 2024-09-22 22:29:26,575] Trial 1 finished with value: 0.7642914331465173 and parameters: {'C': 0.12117089176261614, 'solver': 'lbfgs'}. Best is trial 1 with value: 0.7642914331465173.
  'C': trial.suggest_loguniform('C', 0.01, 10),
[I 2024-09-22 22:29:38,009] Trial 2 finished with value: 0.7606084867894315 and parameters: {'C': 0.04384984644482565, 'solver': 'newton-cg'}. Best is trial 1 with value: 0.7642914331465173.
  'C': trial.suggest_loguniform('C', 0.01, 10),
[I 2024-09-22 22:29:51,060] Trial 3 finished with value: 0.7625300240192153 and parameters: {'C': 3.9387955047227607, 'solver': '

In [11]:
print("Best parameters for Logistic Regression:", best_params_logistic)
print("Best accuracy for Logistic Regression:", study_logistic.best_value)

Best parameters for Logistic Regression: {'C': 0.17496662701821752, 'solver': 'liblinear'}
Best accuracy for Logistic Regression: 0.766853482786229
