In [4]:
!pip install pandas rdkit scikit-learn optuna



In [1]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import AllChem
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score
import optuna
import pickle

In [3]:
# Load dataset
# Load data from CSV
df = pd.read_csv('Selected_fingerprints.csv')

### 5. Define and Optimize the Model with Optuna

X = df.drop(['IC50_label', 'smiles'], axis=1)
y = df['IC50_label']

In [5]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [7]:
from sklearn.metrics import accuracy_score

In [13]:
### 5. Define and Optimize the Model with Optuna

### 1. *XGBoost*
import pickle
from xgboost import XGBClassifier

def objective_xgb(trial):
    param = {
        'max_depth': trial.suggest_int('max_depth', 2, 10),
        'n_estimators': trial.suggest_int('n_estimators', 50, 500),
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.1),
        'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.5, 1.0)
    }
    model = XGBClassifier(**param)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    return accuracy

study_xgb = optuna.create_study(direction='maximize')
study_xgb.optimize(objective_xgb, n_trials=1000)

# Retrieve the best model and save it
best_params_xgb = study_xgb.best_params
best_model_xgb = XGBClassifier(**best_params_xgb)
best_model_xgb.fit(X_train, y_train)



[I 2024-09-20 23:05:18,776] A new study created in memory with name: no-name-c64e5844-56eb-401c-80c4-6f499949bcc8
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.1),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.5, 1.0)
[I 2024-09-20 23:05:34,827] Trial 0 finished with value: 0.7831865492393915 and parameters: {'max_depth': 5, 'n_estimators': 197, 'learning_rate': 0.08408485811924608, 'colsample_bytree': 0.8775361934084667}. Best is trial 0 with value: 0.7831865492393915.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.1),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.5, 1.0)
[I 2024-09-20 23:05:47,126] Trial 1 finished with value: 0.7662129703763011 and parameters: {'max_depth': 6, 'n_estimators': 147, 'learning_rate': 0.054250554342262246, 'colsample_bytree': 0.9442539152951449}. Best is trial 0 with value: 0.7831865492393915.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.1),
  'c

In [11]:
with open('best_model_xgb.pkl', 'wb') as f:
    pickle.dump(best_model_xgb, f)



In [15]:
print("Best parameters for XGBoost:", best_params_xgb)
print("Best accuracy for XGBoost:", study_xgb.best_value)

Best parameters for XGBoost: {'max_depth': 10, 'n_estimators': 470, 'learning_rate': 0.09989610810226225, 'colsample_bytree': 0.7775642812655569}
Best accuracy for XGBoost: 0.8473979183346677
