In [None]:
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
from rdkit.Chem import AllChem, Descriptors, MolFromSmiles
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.preprocessing import StandardScaler
import tensorflow as tf

In [None]:
# Download the main data table (the URL points at the QM8 csv) and extract
# the SMILES column as a plain Python list
QM8_CSV_URL = "https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/qm8.csv"
data = pd.read_csv(QM8_CSV_URL)
smiles_list = data["smiles"].tolist()

In [None]:
# Number of SMILES strings read from the dataset
len(smiles_list)

In [None]:
# Parse every SMILES string into an RDKit molecule.
# MolFromSmiles returns None (instead of raising) when a string cannot be parsed,
# so check here and fail with a clear message rather than deep inside a later
# fingerprint/descriptor call.
rdkit_mols = [MolFromSmiles(smiles) for smiles in smiles_list]
unparsed_smiles = [smiles for smiles, mol in zip(smiles_list, rdkit_mols) if mol is None]
if unparsed_smiles:
    raise ValueError(
        f"RDKit could not parse {len(unparsed_smiles)} SMILES string(s), e.g. {unparsed_smiles[:5]}"
    )

In [None]:
# Morgan fingerprints as bit vectors -- mind the parameters (radius 3, 2048 bits)!
fingerprint_bitvects = []
for mol in rdkit_mols:
    bitvect = AllChem.GetMorganFingerprintAsBitVect(mol, radius=3, nBits=2048)
    fingerprint_bitvects.append(bitvect)

# Stack the per-molecule bit vectors into one numpy array
morgan_fingerprints = np.asarray(fingerprint_bitvects)

In [None]:
# Wrap the fingerprint bits in a pandas DataFrame and put the SMILES strings in the first column
morgan_fingerprints = pd.DataFrame(data=morgan_fingerprints)
# Guard the insert so that re-running this cell does not fail: DataFrame.insert
# raises ValueError when a column called "smiles" is already present.
if "smiles" not in morgan_fingerprints.columns:
    morgan_fingerprints.insert(0, "smiles", smiles_list)

In [None]:
# Display the fingerprint table ("smiles" column first, then the fingerprint bits)
morgan_fingerprints

In [None]:
# Save the fingerprint table as morgan_fingerprints.csv (relative path, so it lands in the
# current working directory); the DataFrame index is written as well, since `index` is left at its default
morgan_fingerprints.to_csv("morgan_fingerprints.csv")

In [None]:
# Next, rdkit's own descriptors
from rdkit.Chem import Descriptors

In [None]:
# The list of descriptors shipped with rdkit (indexed below as name/function pairs)
Descriptors.descList

In [None]:
# Map each descriptor's name to the function that computes it
all_descriptors = {name: function for name, function in Descriptors.descList}

In [None]:
# Start the descriptor table with a single "smiles" column
smiles_column = np.array(smiles_list)
rdkit_descriptors = pd.DataFrame({"smiles": smiles_column})
rdkit_descriptors

In [None]:
# Evaluate every descriptor function on every molecule.
# The columns are collected in a plain dict first and attached with a single
# concat: assigning many columns one at a time fragments the frame and makes
# pandas emit a PerformanceWarning.
descriptor_columns = {
    feature: [descriptor_fn(mol) for mol in rdkit_mols]
    for feature, descriptor_fn in all_descriptors.items()
}
# Start again from the "smiles" column only, so re-running the cell rebuilds the
# descriptor columns instead of duplicating them.
rdkit_descriptors = pd.concat(
    [
        rdkit_descriptors[["smiles"]],
        pd.DataFrame(descriptor_columns, index=rdkit_descriptors.index),
    ],
    axis=1,
)

rdkit_descriptors

In [None]:
# Save the rdkit descriptor table as rdkit_descriptors.csv (relative path; index column included by default)
rdkit_descriptors.to_csv("rdkit_descriptors.csv")

In [None]:
!pip install mordred

In [None]:
# Finally, mordred descriptors
from mordred import Calculator, descriptors, error

In [None]:
# Initialise a calculator -- mordred works through a Calculator object that is
# handed the `descriptors` module imported above
calc = Calculator(descriptors)

In [None]:
# Number of descriptors registered on the calculator (there are a lot of them)
len(calc.descriptors)

In [None]:
# Run the calculator over all molecules; the result is used as a pandas DataFrame below.
# NOTE(review): this is presumably the slow step the later "CAUTION" comment refers to -- confirm.
mordred_descriptors = calc.pandas(rdkit_mols)

In [None]:
# Some mordred descriptors cannot be computed for every molecule. Columns holding such
# non-numerical entries usually end up with dtype "object", so collect the positions
# of all object-typed columns.

# NOTE(review): the original cell carried a CAUTION about a long run time and a very large
# file; that presumably refers to the descriptor calculation / CSV export, not this check.
error_columns = [
    position
    for position, column_dtype in enumerate(mordred_descriptors.dtypes)
    if column_dtype == "object"
]
error_columns

In [None]:
# Translate the stored positions into column labels and drop those columns.
# The positions in error_columns refer to the frame as it was when they were collected,
# so this cell is meant to run once, right after that one.
columns_with_errors = mordred_descriptors.columns[error_columns]
mordred_descriptors = mordred_descriptors.drop(columns=columns_with_errors)

In [None]:
# Also discard every column that still contains a missing (NA) value.
# NOTE(review): the original author suspected this does nothing for this dataset -- unverified.
mordred_descriptors = mordred_descriptors.dropna(axis="columns", how="any")

In [None]:
# Again, put the SMILES strings in as the first column.
# The guard keeps the cell re-runnable: DataFrame.insert raises ValueError
# when a "smiles" column already exists.
if "smiles" not in mordred_descriptors.columns:
    mordred_descriptors.insert(0, "smiles", smiles_list)
mordred_descriptors

In [None]:
# Save the filtered mordred descriptor table as mordred_descriptors.csv (relative path; index column included by default)
mordred_descriptors.to_csv("mordred_descriptors.csv")

In [None]:
# Finally, render every molecule to a PNG file named after its position in rdkit_mols.
# CAREFUL: this writes one image per molecule, so the output gets very, very large!
import os
from rdkit.Chem import Draw

# The target folder is not created anywhere else in this notebook; make sure it exists
# so the first write does not fail on a fresh checkout.
os.makedirs("molecule_images", exist_ok=True)
for i, mol in enumerate(rdkit_mols):
    Draw.MolToFile(mol, filename="molecule_images/" + str(i) + ".png")