In [None]:
# TODO: add a setup cell for this notebook's dependencies (it imports tdc, rdkit, pandas, scikit-learn and scipy)

In [None]:
from tdc.single_pred import Tox
# Load the 'hERG_Karim' toxicity dataset through TDC's Tox loader.
DATASET_NAME = 'hERG_Karim'
data = Tox(name=DATASET_NAME)

# get_split() is called with its defaults; later cells iterate the result
# with .items() and look up the 'train' and 'test' entries.
split = data.get_split()

In [None]:
# Clean Data

import pandas as pd
from rdkit import Chem

# helper function to check SMILE strings from dataset w/ RDKit
def validate_smiles(smiles):
    """Return True when `smiles` is a non-blank string that RDKit can parse.

    Parameters
    ----------
    smiles : object
        Candidate SMILES value from the dataset. Anything that is not a
        string (None, NaN, numbers, ...) is treated as invalid.

    Returns
    -------
    bool
        True if ``Chem.MolFromSmiles`` returned a molecule object,
        False for non-string / blank input or when parsing fails.
    """
    # Reject non-strings and blank strings up front instead of relying on an
    # exception from RDKit.
    # NOTE(review): RDKit is understood to return an empty molecule (not None)
    # for '' -- confirm; a blank entry is not a usable structure either way.
    if not isinstance(smiles, str) or not smiles.strip():
        return False

    try:
        # converts SMILES to a molecule object (None when it cannot be parsed)
        mol = Chem.MolFromSmiles(smiles)
    except Exception:
        # `except Exception` instead of a bare `except:` so that
        # KeyboardInterrupt / SystemExit still propagate out of long runs.
        return False

    return mol is not None

# helper function to clean the dataset splits
def clean_data(split):
    """Clean every DataFrame of a dataset split.

    For each frame the following steps are applied, in order:

    1. Rows with a missing ``Drug`` or ``Y`` are dropped. A missing label is
       never imputed, because a median-filled label is a fabricated target.
    2. ``Drug`` is cast to ``str`` and stripped; ``Y`` is cast to ``int``.
    3. Exact duplicate rows are removed (after stripping, so rows that differ
       only by surrounding whitespace collapse too).
    4. Remaining missing values are filled: the column median for numeric
       columns, the string ``'Unknown'`` for everything else.
    5. Rows whose ``Drug`` fails ``validate_smiles`` are removed and the index
       is reset to 0..n-1.

    Parameters
    ----------
    split : dict
        Mapping of split name -> pandas DataFrame containing at least the
        columns ``Drug`` and ``Y``.

    Returns
    -------
    dict
        Mapping with the same keys and the cleaned DataFrames. The input
        frames are not modified.

    Raises
    ------
    KeyError
        If a frame lacks the ``Drug`` or ``Y`` column.
    """
    cleaned_split = {}

    for key, df in split.items():
        # dropna/assign/drop_duplicates all return new frames, and the final
        # copy() detaches the result so the column writes below neither touch
        # the caller's data nor trigger SettingWithCopy warnings.
        df = (
            df.dropna(subset=['Drug', 'Y'])
            .assign(
                Drug=lambda frame: frame['Drug'].astype(str).str.strip(),
                Y=lambda frame: frame['Y'].astype(int),
            )
            .drop_duplicates()
            .copy()
        )

        # fill what is still missing in the other columns
        for col in df.columns:
            if not df[col].isna().any():
                continue
            is_number = (
                pd.api.types.is_numeric_dtype(df[col])
                and not pd.api.types.is_bool_dtype(df[col])
            )
            if is_number:
                df[col] = df[col].fillna(df[col].median())
            else:
                df[col] = df[col].fillna('Unknown')

        # keep only rows whose SMILES passes validate_smiles; astype(bool)
        # keeps the mask a boolean indexer even when the frame is empty
        valid_mask = df['Drug'].apply(validate_smiles).astype(bool)
        df = df.loc[valid_mask].reset_index(drop=True)

        # store the cleaned frame under its split name
        cleaned_split[key] = df

    return cleaned_split

# Clean every split, then show a compact per-split summary (row/column counts
# plus the first rows) instead of printing the raw dict of whole DataFrames.
cleaned_split = clean_data(split)
for split_name, split_df in cleaned_split.items():
    print(f"{split_name}: {split_df.shape[0]} rows x {split_df.shape[1]} columns")
    print(split_df.head())

In [None]:
# I was just checking the balance - Jackson
def check_balance(cleaned_split, lower=0.4, upper=0.6):
    """Print the ``Y`` class proportions of every split and a balance verdict.

    Parameters
    ----------
    cleaned_split : dict
        Mapping of split name -> DataFrame with a ``Y`` label column.
    lower : float, optional
        A split is "unbalanced" when its rarest class proportion is below
        this value (default 0.4).
    upper : float, optional
        A split is "unbalanced" when its most common class proportion is
        above this value (default 0.6).

    Returns
    -------
    None
        Results are printed: the split name, the normalized value counts and
        one of "balanced", "unbalanced" or "empty split".
    """
    for key, df in cleaned_split.items():
        print(f"Split: {key}")
        y_distribution = df['Y'].value_counts(normalize=True)
        print(y_distribution)

        if y_distribution.empty:
            # min()/max() of an empty Series are NaN and every comparison with
            # NaN is False, which would otherwise report "balanced" here.
            print("empty split")
            continue

        if (y_distribution.min() < lower) or (y_distribution.max() > upper):
            print("unbalanced")
        else:
            print("balanced")

# Print the label proportions and the balance verdict for each cleaned split.
check_balance(cleaned_split)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from rdkit.Chem import AllChem

# create morgan fingerprints for SMILES using RDKit
def generate_fingerprints(smiles, radius=2, n_bits=2048):
    """Turn a SMILES string into a Morgan fingerprint given as a list of bits.

    Parameters
    ----------
    smiles : object
        SMILES string of the molecule. Non-string values are treated like an
        unparseable SMILES.
    radius : int, optional
        Morgan radius passed to RDKit (default 2, i.e. neighbourhoods up to
        two bonds away).
    n_bits : int, optional
        Length of the bit vector (default 2048; the authors note that 1024
        also works but lowered the F1 score).

    Returns
    -------
    list of int
        ``n_bits`` values of 0/1. All zeros when `smiles` is not a string or
        RDKit cannot parse it.
    """
    # Only strings are handed to RDKit; anything else falls through to the
    # all-zero fingerprint instead of raising inside MolFromSmiles.
    mol = Chem.MolFromSmiles(smiles) if isinstance(smiles, str) else None
    if mol is None:
        # no bits set when the molecule could not be built
        return [0] * n_bits

    # NOTE(review): newer RDKit releases reportedly deprecate this call in
    # favour of rdFingerprintGenerator.GetMorganGenerator -- confirm before
    # upgrading RDKit.
    fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=n_bits)

    # turn the bit vector into a plain list of 0/1 values
    return list(fp)

# FEATURE ENGINEERING on the training split
train = cleaned_split['train']
# one Morgan fingerprint (list of bits) per molecule in the training split
train_fingerprints = train['Drug'].apply(generate_fingerprints).to_list()
# Build the feature matrix on the same index as `train`: without index=...,
# X_train would get a fresh 0..n-1 index while y_train keeps the index left
# over from cleaning, so any pandas-level alignment would mismatch rows.
X_train = pd.DataFrame(train_fingerprints, index=train.index)
# labels from the Y column (blockers = 1, not a blocker = 0)
y_train = train['Y']

# same steps for the test split
test = cleaned_split['test']
test_fingerprints = test['Drug'].apply(generate_fingerprints).to_list()
X_test = pd.DataFrame(test_fingerprints, index=test.index)
y_test = test['Y']

In [None]:
# Random Forest baseline -- DO NOT RUN
# random_state=42 keeps repeated runs reproducible.
# n_estimators=500 is the number of decision trees in the forest.
# These parameters may still need fine-tuning.
rf_model = RandomForestClassifier(n_estimators=500, random_state=42)

# fit on the training features and labels
rf_model.fit(X_train, y_train)

# predict the labels of the test set
y_pred = rf_model.predict(X_test)

# per-class precision / recall / F1 (authors' note: the report looked good)
rf_report = classification_report(y_test, y_pred)
print(rf_report)

# overall accuracy on the test set
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

In [None]:
# Random Forest: randomized hyperparameter search
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

# Parameter distributions sampled by the search
param_dist = {
    'n_estimators': randint(100, 1000),
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': randint(2, 11),
    'min_samples_leaf': randint(1, 5),
    'bootstrap': [True, False]
}

# The search gets its own estimator instead of reusing `rf_model` from the
# baseline cell: that cell is marked "DO NOT RUN", so on a fresh kernel
# `rf_model` would be undefined here. Only random_state is fixed; the search
# samples n_estimators for every candidate.
random_search = RandomizedSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_distributions=param_dist,
    n_iter=100,
    scoring='accuracy',
    cv=5,
    verbose=2,
    random_state=42,
    n_jobs=-1,
)

# Run the search on the training data (100 candidates x 5 folds)
random_search.fit(X_train, y_train)

# Best parameter set and the model refit with it
best_params = random_search.best_params_
best_rf_model = random_search.best_estimator_

print(f"Best Parameters: {best_params}")

# Evaluate the tuned model on the test data
y_pred = best_rf_model.predict(X_test)
print(classification_report(y_test, y_pred))
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

In [None]:
# Support Vector Machine baseline -- DO NOT RUN
from sklearn.svm import SVC

# RBF-kernel SVM with a fixed seed.
# NOTE(review): probability=True is set but nothing in this cell calls
# predict_proba -- confirm whether a later cell needs it.
svm_model = SVC(kernel='rbf', probability=True, random_state=42)
svm_model.fit(X_train, y_train)

# predictions for the test set
y_pred_svm = svm_model.predict(X_test)

print("Support Vector Machine Performance:")
svm_report = classification_report(y_test, y_pred_svm)
print(svm_report)

accuracy_svm = accuracy_score(y_test, y_pred_svm)
print(f"Accuracy: {accuracy_svm}\n")

In [None]:
# Gradient Boosting baseline -- DO NOT RUN
from sklearn.ensemble import GradientBoostingClassifier

# 300 boosting stages of depth-3 trees with a learning rate of 0.01
gb_model = GradientBoostingClassifier(
    n_estimators=300,
    learning_rate=0.01,
    max_depth=3,
    random_state=42,
)
gb_model.fit(X_train, y_train)

# predictions for the test set
y_pred_gb = gb_model.predict(X_test)

print("Gradient Boosting Classifier Performance:")
gb_report = classification_report(y_test, y_pred_gb)
print(gb_report)

accuracy_gb = accuracy_score(y_test, y_pred_gb)
print(f"Accuracy: {accuracy_gb}\n")

In [None]:
# Gradient Boosting: randomized hyperparameter search
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import classification_report, accuracy_score
from scipy.stats import randint, uniform

# Base model; only the seed is fixed, everything else is sampled below
gb_model = GradientBoostingClassifier(random_state=42)

# Hyperparameter distributions.
# scipy's uniform(loc, scale) samples from [loc, loc + scale].
param_dist = {
    'n_estimators': randint(100, 500),  # Number of boosting stages
    'learning_rate': uniform(0.01, 0.2),  # Learning rate in [0.01, 0.21]
    'max_depth': randint(3, 10),  # Maximum depth of individual estimators
    'min_samples_split': randint(2, 10),  # Minimum number of samples required to split
    'min_samples_leaf': randint(1, 5),  # Minimum number of samples at leaf nodes
    'subsample': uniform(0.7, 0.3),  # Fraction of samples per stage, in [0.7, 1.0]
    # 'auto' was dropped from this list: scikit-learn deprecated it for
    # gradient boosting in 1.1 and removed it in 1.3, so candidates that drew
    # it fail to fit there; for the classifier it meant the same as 'sqrt'.
    # Note: with a shorter list the same random_state draws a different set
    # of candidates than before.
    'max_features': ['sqrt', 'log2', None],  # Number of features considered for the best split
}

# Set up RandomizedSearchCV (100 candidates x 5 folds, scored by accuracy)
random_search = RandomizedSearchCV(estimator=gb_model, param_distributions=param_dist,
                                   n_iter=100, scoring='accuracy', cv=5, verbose=2,
                                   random_state=42, n_jobs=-1)

# Train with RandomizedSearchCV
random_search.fit(X_train, y_train)

# Get the best parameters and the best model
best_params = random_search.best_params_
best_gb_model = random_search.best_estimator_

print(f"Best Parameters: {best_params}")

# Evaluate the best model on the test set
y_pred_gb = best_gb_model.predict(X_test)
print("Gradient Boosting Classifier Performance after Hyperparameter Tuning:")
print(classification_report(y_test, y_pred_gb))
accuracy_gb = accuracy_score(y_test, y_pred_gb)
print(f"Accuracy: {accuracy_gb}\n")
