In [1]:
import pandas as pd
import numpy as np
import os
import random
import deepchem as dc
import tensorflow as tf
import matplotlib.pyplot as plt
import seaborn as sns

from rdkit import Chem
from sklearn import metrics
from sklearn.datasets import make_regression
from sklearn.ensemble import GradientBoostingRegressor 
from sklearn.model_selection import cross_val_score, GridSearchCV, RandomizedSearchCV, train_test_split
from sklearn.metrics import root_mean_squared_error, r2_score

Skipped loading some Pytorch utilities, missing a dependency. No module named 'torch'


This module requires PyTorch to be installed.


No normalization for SPS. Feature removed!
No normalization for AvgIpc. Feature removed!
No normalization for NumAmideBonds. Feature removed!
No normalization for NumAtomStereoCenters. Feature removed!
No normalization for NumBridgeheadAtoms. Feature removed!
No normalization for NumHeterocycles. Feature removed!
No normalization for NumSpiroAtoms. Feature removed!
No normalization for NumUnspecifiedAtomStereoCenters. Feature removed!
No normalization for Phi. Feature removed!


Instructions for updating:
experimental_relax_shapes is deprecated, use reduce_retracing instead


Skipped loading some PyTorch models, missing a dependency. No module named 'torch'
No module named 'torch'
Skipped loading modules with pytorch-geometric dependency, missing a dependency. No module named 'torch'
Skipped loading modules with pytorch-lightning dependency, missing a dependency. No module named 'torch'
Skipped loading some Jax models, missing a dependency. No module named 'jax'


In [6]:
#Importing ESOL Delaney solubility dataset 

def load_data():
    """Load the Delaney (ESOL) dataset and return ids/target pairs per split.

    Calls ``dc.molnet.load_delaney`` with the "GraphConv" featurizer and the
    "random" splitter, converts each of the three returned datasets to a
    dataframe and keeps only its ``ids`` and ``y`` columns (the ids are treated
    as SMILES strings by the rest of this file).

    Returns:
        Three ``(ids, y)`` tuples, in the order train, valid, test.
    """
    # NOTE(review): load_delaney is called with its default transformers, which
    # presumably normalise y -- confirm before labelling values as logS [mol/L].
    _tasks, split_datasets, _transformers = dc.molnet.load_delaney(
        featurizer="GraphConv", splitter="random"
    )

    def _ids_and_targets(dataset):
        # Only the identifier and target columns are needed downstream.
        frame = dataset.to_dataframe()
        return frame["ids"], frame["y"]

    train_pair, valid_pair, test_pair = [
        _ids_and_targets(dataset) for dataset in split_datasets
    ]

    return train_pair, valid_pair, test_pair

In [9]:
#Featurise data

def featurise_data(smiles, solubility):
    """Turn SMILES strings into circular fingerprints.

    Args:
        smiles: Molecule identifiers handed to the featurizer's ``featurize``.
        solubility: Target values; returned unchanged.

    Returns:
        ``(X, y)`` where ``X`` is the output of a
        ``dc.feat.CircularFingerprint(size=2048, radius=4)`` featurizer and
        ``y`` is ``solubility`` itself.
    """
    fingerprint_maker = dc.feat.CircularFingerprint(size=2048, radius=4)
    return fingerprint_maker.featurize(smiles), solubility

In [10]:
#Train test split 

def splitting(X, y, smiles, seed=400, test_size=0.2):
    """Split features, targets and SMILES into train and test partitions.

    Args:
        X: Feature matrix.
        y: Target values aligned with ``X``.
        smiles: Molecule identifiers aligned with ``X``.
        seed: Seed used both for Python's ``random`` module and as the
            ``random_state`` of ``train_test_split``. Defaults to 400, the
            value that was previously hard-coded.
        test_size: Value forwarded as ``test_size`` to ``train_test_split``.
            Defaults to 0.2, the value that was previously hard-coded.

    Returns:
        ``(X_train, X_test, y_train, y_test, smiles_train, smiles_test, seed)``.
    """
    # Seeding the global ``random`` module is a side effect the original code
    # had; it is kept so later cells that draw from ``random`` are unchanged.
    random.seed(seed)

    X_train, X_test, y_train, y_test, smiles_train, smiles_test = train_test_split(
        X, y, smiles, test_size=test_size, random_state=seed
    )

    return X_train, X_test, y_train, y_test, smiles_train, smiles_test, seed

In [11]:
#Train the model

def training(X_train, y_train, seed):
    """Fit a gradient-boosting regressor and report its cross-validation score.

    Args:
        X_train: Training feature matrix.
        y_train: Training targets.
        seed: Passed as ``random_state`` to ``GradientBoostingRegressor``.

    Returns:
        ``(score, reg)``: the mean of the 3-fold ``cross_val_score`` results on
        the training data, and the regressor fitted on the full training data.
    """
    model = GradientBoostingRegressor(random_state=seed)
    model.fit(X_train, y_train)

    fold_scores = cross_val_score(model, X_train, y_train, cv=3, n_jobs=-1)
    score = fold_scores.mean()
    print(f"Cross-validation score is: {score}")

    return score, model

In [14]:
#Optional Optimisation

def optional_optimisation(reg, X_train, y_train, n_iter=100, random_state=None):
    """Run a randomised hyper-parameter search over an example grid.

    Args:
        reg: Estimator to tune.
        X_train: Training feature matrix.
        y_train: Training targets.
        n_iter: Number of parameter settings sampled by the randomised search.
            Defaults to 100, the value that was previously hard-coded.
        random_state: Forwarded to ``RandomizedSearchCV`` so the sampled
            settings can be reproduced. The default ``None`` keeps the previous
            unseeded behaviour.

    Returns:
        ``(search, reg2, best_params, best_optimisation_score)`` where
        ``search`` is the fitted ``RandomizedSearchCV``, ``reg2`` is a
        ``GridSearchCV`` over the same grid that is constructed but NOT fitted
        here, and the last two items are ``search.best_params_`` and
        ``search.best_score_``.
    """
    # Example optimisation parameters
    optimisation_param_grid = {
        "n_estimators": [10, 50, 100, 500],
        "learning_rate": [0.0001, 0.001, 0.01, 0.1, 1.0],
        "max_depth": [3, 5, 7, 9],
        "min_samples_leaf": [1, 2, 3, 4, 5],
        "min_samples_split": [2, 4, 6, 8, 10],
        "subsample": [0.6, 0.7, 0.8, 0.9, 1.0],
    }

    # Exhaustive search object over the same grid; returned unfitted so the
    # caller can decide whether to run it.
    reg2 = GridSearchCV(reg, optimisation_param_grid, cv=3, n_jobs=-1)

    search = RandomizedSearchCV(
        reg,
        param_distributions=optimisation_param_grid,
        n_iter=n_iter,
        cv=3,
        n_jobs=-1,
        verbose=2,
        random_state=random_state,
    )
    search.fit(X_train, y_train)

    best_params = search.best_params_
    best_optimisation_score = search.best_score_

    print(f"Best Training Parametres: {best_params}")
    print(f"Best Training Score: {best_optimisation_score}")

    return search, reg2, best_params, best_optimisation_score

In [None]:
### TODO: this was all done wrong and needs to be redone

In [15]:
#Final training  

def final_train(reg, X_train, y_train):
    """Fit ``reg`` with one fixed hyper-parameter setting via ``GridSearchCV``.

    The grid holds exactly one value per parameter, so the search evaluates a
    single configuration with 3-fold cross-validation.

    Args:
        reg: Estimator handed to ``GridSearchCV``.
        X_train: Training feature matrix.
        y_train: Training targets.

    Returns:
        ``(best_test_score, test_regression)``: the search's ``best_score_``
        after fitting on the arrays passed in (this is the value printed as
        "Best Test Score"), and the fitted ``GridSearchCV`` object.
    """
    chosen_settings = {
        "n_estimators": 500,
        "learning_rate": 0.1,
        "max_depth": 7,
        "min_samples_leaf": 4,
        "min_samples_split": 6,
        "subsample": 0.7,
    }
    # GridSearchCV expects a list of candidates per parameter.
    single_point_grid = {name: [value] for name, value in chosen_settings.items()}

    fitted_search = GridSearchCV(reg, single_point_grid, cv=3, n_jobs=-1)
    fitted_search.fit(X_train, y_train)

    cv_score = fitted_search.best_score_
    print(f"Best Test Score: {cv_score}")

    return cv_score, fitted_search

In [30]:
#Creating dataframes

def _prediction_frame(model, X, y, smiles):
    """Build one predicted/actual/SMILES/residuals dataframe for a data split."""
    predicted = model.predict(X)
    return pd.DataFrame(data={
        "Predicted Solubility logS [mol/L]": predicted,
        "Actual Solubility logS [mol/L]": y,
        "SMILES": smiles,
        "Residuals": predicted - y,
    })


def dataframe_creation(test_regression, y_test, smiles_test, X_train, y_train, smiles_train, X_test=None):
    """Collect predictions, targets, SMILES and residuals for both splits.

    Args:
        test_regression: Fitted model exposing ``predict``; used for both the
            test and the training predictions.
        y_test: Test targets.
        smiles_test: Test molecule identifiers.
        X_train: Training feature matrix.
        y_train: Training targets.
        smiles_train: Training molecule identifiers.
        X_test: Test feature matrix. Earlier versions read this from a
            notebook-level variable; when omitted, that variable is still
            looked up so existing calls keep working.

    Returns:
        ``(test_dataframe, train_dataframe)``, each with the columns
        "Predicted Solubility logS [mol/L]", "Actual Solubility logS [mol/L]",
        "SMILES" and "Residuals" (predicted minus actual).

    Raises:
        NameError: If ``X_test`` is not passed and no notebook-level ``X_test``
            exists.
    """
    if X_test is None:
        # Backward compatibility with calls that relied on hidden global state.
        try:
            X_test = globals()["X_test"]
        except KeyError:
            raise NameError(
                "X_test was not passed to dataframe_creation and no "
                "notebook-level X_test is defined"
            ) from None

    test_dataframe = _prediction_frame(test_regression, X_test, y_test, smiles_test)
    # The training predictions previously came from an undeclared global
    # ``reg``; use the model that was actually passed in so both frames
    # describe the same estimator.
    train_dataframe = _prediction_frame(test_regression, X_train, y_train, smiles_train)

    return test_dataframe, train_dataframe

In [31]:
#Calculating Scores

def score_calculation(test_dataframe, train_dataframe):
    """Compute RMSE and R2 of the test predictions and print them.

    Args:
        test_dataframe: Frame with the columns
            "Actual Solubility logS [mol/L]" and
            "Predicted Solubility logS [mol/L]".
        train_dataframe: Accepted for interface compatibility; not used.

    Returns:
        ``(predicted_RMSE, predicted_r2_score)``.
    """
    actual = test_dataframe["Actual Solubility logS [mol/L]"]
    predicted = test_dataframe["Predicted Solubility logS [mol/L]"]

    # scikit-learn metrics take (y_true, y_pred). RMSE is symmetric, but R2 is
    # not: with the arguments swapped it is measured against the variance of
    # the predictions instead of the variance of the measured values.
    predicted_RMSE = root_mean_squared_error(actual, predicted)
    predicted_r2_score = r2_score(actual, predicted)

    print(f"The predicted RMSE is: {predicted_RMSE}")
    print(f"The predicted R2 score is: {predicted_r2_score}")

    return predicted_RMSE, predicted_r2_score

In [32]:
#Scatter graph 

def scatter_plot(test_dataframe, predicted_RMSE, predicted_r2_score):
    """Scatter predicted against actual solubility and annotate the scores.

    Args:
        test_dataframe: Frame with the columns
            "Predicted Solubility logS [mol/L]" (plotted on x) and
            "Actual Solubility logS [mol/L]" (plotted on y).
        predicted_RMSE: RMSE value written into the annotation.
        predicted_r2_score: R-squared value written into the annotation.
    """
    estimated = test_dataframe["Predicted Solubility logS [mol/L]"]
    measured = test_dataframe["Actual Solubility logS [mol/L]"]

    plt.scatter(estimated, measured, s=3)
    plt.title("Predicted molecular solubility vs measured solubility using gradient boosted trees")
    plt.xlabel("Estimated Solubility logS [mol/L]")
    plt.ylabel("Measured Solubility logS [mol/L]")

    # Both scores are shown to three decimals in one text box.
    annotation = 'R-squared = %.3f\nRMSE = %.3f' % (predicted_r2_score, predicted_RMSE)
    plt.text(0.9, 0.2, annotation)

In [33]:
#Residual Plot 

def residual_plot(test_dataframe):
    """Draw a seaborn residual plot of residuals against predicted values.

    Args:
        test_dataframe: Frame with the columns
            "Predicted Solubility logS [mol/L]" and "Residuals".
    """
    predicted_values = test_dataframe["Predicted Solubility logS [mol/L]"]
    residual_values = test_dataframe["Residuals"]

    sns.residplot(x=predicted_values, y=residual_values)

    plt.title("Residual Plot")
    plt.xlabel("Predicted Solubility logS [mol/L]")
    plt.ylabel("Residuals (Predicted - Actual)")

In [34]:
#Feature importance

def feature_importance(test_regression):
    """Plot the 20 largest feature importances of the best estimator.

    Args:
        test_regression: Fitted search object exposing ``best_estimator_``,
            whose ``feature_importances_`` array is plotted.

    Features are labelled "Feature <i>" by their position in the importances
    array, so the function no longer depends on a notebook-level ``X_train``.
    """
    best_model = test_regression.best_estimator_  # refitted model with the best hyper-parameters
    importances = best_model.feature_importances_  # one importance value per input feature

    # One label per importance entry; the count comes from the array itself.
    feature_names = [f"Feature {i}" for i in range(len(importances))]

    feat_imp_df = pd.DataFrame(
        {'Feature': feature_names, 'Importance': importances}
    ).sort_values('Importance', ascending=False)
    top_features = feat_imp_df.head(20)

    plt.figure(figsize=(10, 6))
    plt.barh(top_features['Feature'], top_features['Importance'])
    plt.gca().invert_yaxis()  # largest importance at the top of the chart
    plt.xlabel('Importance')
    plt.title('Top 20 Feature Importances')
    plt.show()