In [None]:
import rdkit
# PyTDC is the name of the pip distribution; the package it installs is
# imported as `tdc` (the next cell already uses `from tdc.single_pred import Tox`).
import tdc

In [None]:
# Load the 'hERG_Karim' toxicity dataset through TDC's single-prediction API.
from tdc.single_pred import Tox
data = Tox(name = 'hERG_Karim')
# `split` is consumed later as a mapping of split name -> DataFrame
# (the 'train' and 'test' keys and the 'Drug' / 'Y' columns are used downstream).
# NOTE(review): get_split() is called with its defaults here — confirm the
# default split method/seed/fractions are the ones intended for this study.
split = data.get_split()

In [None]:
# Clean Data

import pandas as pd
from rdkit import Chem

# helper function to check SMILES strings from the dataset w/ RDKit
def validate_smiles(smiles):
    """Return True when ``smiles`` is a non-blank string RDKit can parse.

    Parameters
    ----------
    smiles : object
        Candidate SMILES value. Anything that is not a ``str`` (None, NaN,
        numbers, ...) or that is empty / whitespace-only is reported invalid
        without being handed to RDKit.

    Returns
    -------
    bool
        True if ``Chem.MolFromSmiles`` returns a molecule object, else False.
    """
    # Reject values that cannot be a structure up front.
    # NOTE(review): RDKit appears to parse an empty string into an atom-less
    # molecule rather than None, which the old check counted as valid — confirm.
    if not isinstance(smiles, str) or not smiles.strip():
        return False

    try:
        # converts SMILES to a molecule object (None when parsing fails)
        mol = Chem.MolFromSmiles(smiles)
    except Exception:
        # Best-effort validation: any parser error means "not valid".
        # `Exception` (not a bare except) lets KeyboardInterrupt / SystemExit
        # propagate so a long-running .apply() can still be interrupted.
        return False

    return mol is not None

# helper function to clean the dataset splits
def clean_data(split):
    """Clean every DataFrame in a split dictionary.

    For each split the steps are, in order:

    1. drop rows that have no 'Drug' (SMILES) or no 'Y' (label);
    2. fill gaps in any remaining column (median for numeric columns,
       the string 'Unknown' otherwise);
    3. strip whitespace from 'Drug' and cast 'Y' to int;
    4. drop exact duplicate rows;
    5. keep only rows whose 'Drug' passes ``validate_smiles``.

    Parameters
    ----------
    split : dict
        Maps a split name to a pandas DataFrame containing at least the
        columns 'Drug' and 'Y'.

    Returns
    -------
    dict
        Same keys as ``split``; each value is a new, cleaned DataFrame with
        the same columns. The input frames are not modified.
    """
    cleaned_split = {}

    for key, df in split.items():
        # explicit copy so the column assignments below can never write
        # back into the caller's frame
        df = df.copy()

        # A row without a structure or without a label cannot be used for
        # supervised training. Imputing a missing label with the column
        # median would invent a target value, so such rows are dropped.
        df = df.dropna(subset=['Drug', 'Y'])

        # fill gaps that remain in the other columns
        for col in df.columns:
            if not df[col].isna().any():
                continue
            is_number = (
                pd.api.types.is_numeric_dtype(df[col])
                and not pd.api.types.is_bool_dtype(df[col])
            )
            if is_number:
                df[col] = df[col].fillna(df[col].median())
            else:
                df[col] = df[col].fillna('Unknown')

        # normalise the text / label columns
        df['Drug'] = df['Drug'].astype(str).str.strip()
        df['Y'] = df['Y'].astype(int)

        # de-duplicate only after normalisation so rows that differed just by
        # surrounding whitespace collapse into one
        df = df.drop_duplicates()

        # keep only rows whose SMILES RDKit can parse; a boolean mask avoids
        # adding and then removing a temporary column
        is_valid = df['Drug'].apply(validate_smiles).astype(bool)
        cleaned_split[key] = df[is_valid]

    return cleaned_split

# Run the cleaning helper on the loaded splits and print the result as a
# quick visual check. `cleaned_split` is the dict returned by clean_data
# (one cleaned DataFrame per key of `split`) and is reused by later cells.
cleaned_split = clean_data(split)
print(cleaned_split)

In [None]:
# set up test and train set + morgan fingerprints

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from rdkit.Chem import AllChem

# create Morgan fingerprints for SMILES using RDKit
def generate_fingerprints(smiles, radius=2, n_bits=2048):
    """Turn a SMILES string into a Morgan fingerprint bit list.

    Parameters
    ----------
    smiles : object
        SMILES text. Non-string values and strings RDKit cannot parse both
        produce the all-zero fallback vector.
    radius : int, default 2
        Morgan radius passed to RDKit (2 = environments up to two bonds away).
    n_bits : int, default 2048
        Length of the fingerprint. The original author noted that 1024 bits
        also works but lowered the F1 score.

    Returns
    -------
    list of int
        ``n_bits`` values, each 0 or 1. All zeros when no molecule could be
        built, so every row has the same length.
    """
    # create the molecule; non-strings are treated like unparsable SMILES
    mol = Chem.MolFromSmiles(smiles) if isinstance(smiles, str) else None
    if mol is None:
        # fallback length is tied to n_bits so it can never disagree with
        # the length of real fingerprints
        return [0] * n_bits

    # NOTE(review): recent RDKit releases flag this call as deprecated in
    # favour of rdFingerprintGenerator — confirm against the installed version.
    fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=n_bits)

    # turn the bit vector into a plain list of 0/1 ints
    return list(fp)

# FEATURE ENGINEERING on the train and test splits
def _featurize_split(df):
    """Build model inputs for one cleaned split.

    Returns a tuple ``(fingerprints, X, y)``:
    ``fingerprints`` is the list of per-molecule bit lists produced by
    ``generate_fingerprints``, ``X`` is the same data as a DataFrame (one
    column per bit) and ``y`` is the 'Y' label column
    (blocker = 1, not a blocker = 0).
    """
    fingerprints = df['Drug'].apply(generate_fingerprints).to_list()
    # reuse the split's own index so X and y stay row-aligned by label even
    # though cleaning removed rows and left gaps in the index
    features = pd.DataFrame(fingerprints, index=df.index)
    return fingerprints, features, df['Y']


# training split
train = cleaned_split['train']
train_fingerprints, X_train, y_train = _featurize_split(train)

# test split: identical processing, so both matrices have the same columns
test = cleaned_split['test']
test_fingerprints, X_test, y_test = _featurize_split(test)

# First GridSearch then RandomSearch did not execute

In [None]:
# Avoid Running
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import classification_report, accuracy_score
import numpy as np

# Random forest baseline; the fixed seed keeps tree construction repeatable.
rf_model = RandomForestClassifier(random_state=42)

# Intentionally small search space so the search stays affordable.
param_dist = dict(
    n_estimators=[250, 500, 750],   # no very large forests
    max_depth=[None, 10, 20],       # None lets trees grow fully
    min_samples_split=[2, 5],
    min_samples_leaf=[1, 2],
    bootstrap=[True, False],
)

# 20 sampled configurations x 3 CV folds, scored on accuracy, using all cores.
random_search = RandomizedSearchCV(
    rf_model,
    param_dist,
    n_iter=20,
    cv=3,
    scoring='accuracy',
    random_state=42,
    n_jobs=-1,
    verbose=2,
)

# Fit on the full training set. If this is too slow, a slice such as
# X_train[:3000] / y_train[:3000] can be substituted here.
random_search.fit(X_train, y_train)

# Winning configuration and the estimator refitted with it.
best_rf_model = random_search.best_estimator_
best_params = random_search.best_params_

print(f"Best Parameters: {best_params}")

# Score the refitted model on the held-out test split.
y_pred = best_rf_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(classification_report(y_test, y_pred))
print(f"Accuracy: {accuracy}")

In [None]:
# Avoid Running
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import classification_report, accuracy_score
import numpy as np

# Search space for the gradient boosting model.
param_dist = {
    'n_estimators': [100, 300, 500],
    # three evenly spaced rates: 0.01, 0.055, 0.1
    'learning_rate': np.linspace(0.01, 0.1, 3),
    'max_depth': [3, 5, 7],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Base estimator; the fixed seed keeps boosting repeatable between runs.
gb_model = GradientBoostingClassifier(random_state=42)

# Randomized search over the space above.
random_search = RandomizedSearchCV(
    estimator=gb_model,
    param_distributions=param_dist,
    n_iter=20,  # number of parameter settings sampled
    scoring='accuracy',  # metric to optimize
    cv=3,  # number of cross-validation folds
    # Run the candidate fits in parallel on all cores, matching the random
    # forest search. With both seeds fixed this only affects wall-clock time,
    # not which configuration wins.
    n_jobs=-1,
    random_state=42,
)

# perform the random search
random_search.fit(X_train, y_train)

# report the winning configuration and its mean cross-validated accuracy
print("Best Parameters:", random_search.best_params_)
print("Best Cross-Validation Accuracy:", random_search.best_score_)

# evaluate the refitted best model on the held-out test split
best_gb_model = random_search.best_estimator_
y_pred_gb = best_gb_model.predict(X_test)
print("Gradient Boosting Classifier Performance (after Randomized Search):")
print(classification_report(y_test, y_pred_gb))
accuracy_gb = accuracy_score(y_test, y_pred_gb)
print(f"Test Accuracy: {accuracy_gb}\n")