Drug fingerprints and descriptors were generated from [RDKit](https://www.rdkit.org/docs/index.html) using the 2D structures of 104 FDA-approved drugs obtained from the NCI-ALMANAC ComboCompoundSet.sdf file. However, molecules 40 and 57 were found to be invalid, resulting in a total of **102** valid molecules.

In [1]:
# Read the Compound 2D structure: ComboCompoundSet.sdf file
from rdkit import Chem
from rdkit.Chem import AllChem

# Path to your SDF file
sdf_file_path = "/nfs/turbo/med-kayvan-lab/Projects/DrugCombination/b-DrugCombination/DC_Data/NCI-ALMANAC/ComboCompoundSet.sdf"

# Read the SDF file
suppl = Chem.SDMolSupplier(sdf_file_path)

# Counter for valid molecules
valid_molecules_count = 0

# Create an empty list
ms = []

# Iterate over the molecules in the SDF file
for idx, mol in enumerate(suppl, start=1):
    try:
        # Try to sanitize the molecule
        Chem.SanitizeMol(mol)
        # If successful, process the molecule
        if mol is not None:
            # Print the molecule's SMILES representation
            ms.append(Chem.MolToSmiles(mol))
            valid_molecules_count += 1
    except Exception as e:
        # If an error occurs during sanitization, skip the molecule and print the error
        print(f"Error processing molecule {idx}: {e}")

print(len(ms))

Error processing molecule 40: Python argument types in
    rdkit.Chem.rdmolops.SanitizeMol(NoneType)
did not match C++ signature:
    SanitizeMol(RDKit::ROMol {lvalue} mol, unsigned long sanitizeOps=rdkit.Chem.rdmolops.SanitizeFlags.SANITIZE_ALL, bool catchErrors=False)
Error processing molecule 57: Python argument types in
    rdkit.Chem.rdmolops.SanitizeMol(NoneType)
did not match C++ signature:
    SanitizeMol(RDKit::ROMol {lvalue} mol, unsigned long sanitizeOps=rdkit.Chem.rdmolops.SanitizeFlags.SANITIZE_ALL, bool catchErrors=False)
102


[16:04:21] Explicit valence for atom # 0 Cl, 2, is greater than permitted
[16:04:21] ERROR: Could not sanitize molecule ending on line 6200
[16:04:21] ERROR: Explicit valence for atom # 0 Cl, 2, is greater than permitted
[16:04:21] Explicit valence for atom # 3 O, 3, is greater than permitted
[16:04:21] ERROR: Could not sanitize molecule ending on line 8652
[16:04:21] ERROR: Explicit valence for atom # 3 O, 3, is greater than permitted


### 1. Fingerprints

1.1 RDKit (Topological) Fingerprints

In [2]:
# Create a FingeprintGenerator object for the fingerprint type of interest
from rdkit import Chem
from rdkit.Chem import rdFingerprintGenerator

fpgen = rdFingerprintGenerator.GetRDKitFPGenerator()

# Convert SMILES strings to RDKit molecules
mols = [Chem.MolFromSmiles(smiles) for smiles in ms]

# Generate fingerprints for each molecule
fps = [fpgen.GetFingerprint(mol) for mol in mols]

print(len(fps))
print(len(fps[0]))
print(len(fps[1]))

102
2048
2048


In [3]:
# Calculate the similarity between two fingerprints
from rdkit import DataStructs

# The examples below used Tanimoto similarity, but one can use different similarity metrics:
similarity = DataStructs.TanimotoSimilarity(fps[0], fps[1])
print(similarity)

# # Available similarity metrics include Tanimoto, Dice, Cosine, Sokal, Russel, Kulczynski, McConnaughey, and Tversky.
# DataStructs.DiceSimilarity(fps[0],fps[1])

0.0446168768186227


1.2 Morgan Fingerprints (Circular Fingerprints)

In [4]:
from rdkit import Chem
from rdkit.Chem import rdFingerprintGenerator
from rdkit.Chem.Draw import IPythonConsole
from rdkit import DataStructs
import rdkit
print(rdkit.__version__)
# %pylab inline

# Convert SMILES strings to RDKit molecules
mols = [Chem.MolFromSmiles(smiles) for smiles in ms]

# Create a Morgan FingeprintGenerator object for the fingerprint type of interest
mfpgen = rdFingerprintGenerator.GetMorganGenerator(radius=2,fpSize=2048)

# bit vectors:
fps = [mfpgen.GetFingerprint(mol) for mol in mols]   # returns a bit vector of size fpSize
print(f'fp: {type(fps)} {len(fps)}')
print(f'fp: {type(fps[0])} {len(fps[0])}')

sfps = [mfpgen.GetSparseFingerprint(mol) for mol in mols]   # returns a sparse bit vector
print(f'sfp: {type(sfps)} {len(sfps)}')
print(f'sfp: {type(sfps[0])} {len(sfps[0])}')

# count vectors:
cfps = [mfpgen.GetCountFingerprint(mol) for mol in mols]   # returns a count vector of size fpSize
print(f'cfp: {type(cfps)} {len(cfps)}')
print(f'cfp: {type(cfps[0])} {cfps[0].GetLength()}')

scfps = [mfpgen.GetSparseCountFingerprint(mol) for mol in mols]   # returns a sparse count vector
print(f'scfp: {type(scfps)} {len(scfps)}')
print(f'scfp: {type(scfps[0])} {scfps[0].GetLength()}')

2022.03.5
fp: <class 'list'> 102
fp: <class 'rdkit.DataStructs.cDataStructs.ExplicitBitVect'> 2048
sfp: <class 'list'> 102
sfp: <class 'rdkit.DataStructs.cDataStructs.SparseBitVect'> 4294967295
cfp: <class 'list'> 102
cfp: <class 'rdkit.DataStructs.cDataStructs.UIntSparseIntVect'> 2048
scfp: <class 'list'> 102
scfp: <class 'rdkit.DataStructs.cDataStructs.ULongSparseIntVect'> 18446744073709551615


In [5]:
import numpy as np
import pandas as pd

# Convert the list of ExplicitBitVect objects into a 2D NumPy array
array_2d = np.array([np.array(v) for v in fps])

# Reshape the array to have 102 rows and 2048 columns
reshaped_array = array_2d.reshape(102, 2048)

# Read the NSC_DrugName_SMILES.csv file to get 'NSC' values
nsc_df = pd.read_csv('NSC_DrugName_SMILES.csv')

# Set 'NSC' column as index
nsc_df.set_index('NSC', inplace=True)

# Create a DataFrame from the reshaped array with 'NSC' values as index
df = pd.DataFrame(reshaped_array, index=nsc_df.index)

# Optionally, you can set column names if needed, starting from 1
df.columns = [f'mfp_bv_{i+1}' for i in range(2048)]

print(df.info())
print(df.head())
# print(df.tail())

# Save the DataFrame to a CSV file
df.to_csv('mfp_bv.csv')

<class 'pandas.core.frame.DataFrame'>
Index: 102 entries, 740 to 763371
Columns: 2048 entries, mfp_bv_1 to mfp_bv_2048
dtypes: int64(2048)
memory usage: 1.6 MB
None
     mfp_bv_1  mfp_bv_2  mfp_bv_3  mfp_bv_4  mfp_bv_5  mfp_bv_6  mfp_bv_7  \
NSC                                                                         
740         0         1         0         0         0         0         0   
750         0         0         0         0         0         0         0   
752         0         0         0         0         0         0         0   
755         0         0         0         0         0         0         0   
762         0         0         0         0         0         0         0   

     mfp_bv_8  mfp_bv_9  mfp_bv_10  ...  mfp_bv_2039  mfp_bv_2040  \
NSC                                 ...                             
740         0         0          0  ...            0            0   
750         0         0          0  ...            0            0   
752         0      

In [6]:
import numpy as np
import pandas as pd

# Iterate over the elements of the UIntSparseIntVect object
lss = []

for i in range(len(cfps)):
    ls = []  # Initialize ls as an empty list for each cfps
    for j in range(cfps[i].GetLength()):
        # Append each element to the list
        ls.append(cfps[i][j])
    lss.append(ls)

# Convert the list of ExplicitBitVect objects into a 2D NumPy array
array_2d = np.array([np.array(v) for v in lss])

# Reshape the array to have 102 rows and 2048 columns
reshaped_array = array_2d.reshape(102, 2048)

# Read the NSC_DrugName_SMILES.csv file to get 'NSC' values
nsc_df = pd.read_csv('NSC_DrugName_SMILES.csv')

# Set 'NSC' column as index
nsc_df.set_index('NSC', inplace=True)

# Create a DataFrame from the reshaped array with 'NSC' values as index
df = pd.DataFrame(reshaped_array, index=nsc_df.index)

# Optionally, you can set column names if needed, starting from 1
df.columns = [f'mfp_cv_{i+1}' for i in range(2048)]

print(df.info())
print(df.head())
# print(df.tail())

# Save the DataFrame to a CSV file
df.to_csv('mfp_cv.csv')

<class 'pandas.core.frame.DataFrame'>
Index: 102 entries, 740 to 763371
Columns: 2048 entries, mfp_cv_1 to mfp_cv_2048
dtypes: int64(2048)
memory usage: 1.6 MB
None
     mfp_cv_1  mfp_cv_2  mfp_cv_3  mfp_cv_4  mfp_cv_5  mfp_cv_6  mfp_cv_7  \
NSC                                                                         
740         0         1         0         0         0         0         0   
750         0         0         0         0         0         0         0   
752         0         0         0         0         0         0         0   
755         0         0         0         0         0         0         0   
762         0         0         0         0         0         0         0   

     mfp_cv_8  mfp_cv_9  mfp_cv_10  ...  mfp_cv_2039  mfp_cv_2040  \
NSC                                 ...                             
740         0         0          0  ...            0            0   
750         0         0          0  ...            0            0   
752         0      

1.3 MACCS Keys: there is a SMARTS-based implementation of the 166 public MACCS keys.

In [7]:
from rdkit.Chem import MACCSkeys

# Convert SMILES strings to RDKit molecules
mols = [Chem.MolFromSmiles(smiles) for smiles in ms]

fps = [MACCSkeys.GenMACCSKeys(mol) for mol in mols]
print(len(fps))
print(len(fps[0]))
print(len(fps[1]))

102
167
167


In [8]:
import numpy as np
import pandas as pd

# Convert the list of ExplicitBitVect objects into a 2D NumPy array
array_2d = np.array([np.array(v) for v in fps])

# Reshape the array to have 102 rows and 2048 columns
reshaped_array = array_2d.reshape(102, 167)

# Read the NSC_DrugName_SMILES.csv file to get 'NSC' values
nsc_df = pd.read_csv('NSC_DrugName_SMILES.csv')

# Set 'NSC' column as index
nsc_df.set_index('NSC', inplace=True)

# Create a DataFrame from the reshaped array with 'NSC' values as index
df = pd.DataFrame(reshaped_array, index=nsc_df.index)

# Optionally, you can set column names if needed, starting from 1
df.columns = [f'maccsfp_{i+1}' for i in range(167)]

print(df.info())
print(df.head())
# print(df.tail())

# Save the DataFrame to a CSV file
df.to_csv('maccsfp.csv')

<class 'pandas.core.frame.DataFrame'>
Index: 102 entries, 740 to 763371
Columns: 167 entries, maccsfp_1 to maccsfp_167
dtypes: int64(167)
memory usage: 133.9 KB
None
     maccsfp_1  maccsfp_2  maccsfp_3  maccsfp_4  maccsfp_5  maccsfp_6  \
NSC                                                                     
740          0          0          0          0          0          0   
750          0          0          0          0          0          0   
752          0          0          0          0          0          0   
755          0          0          0          0          0          0   
762          0          0          0          0          0          0   

     maccsfp_7  maccsfp_8  maccsfp_9  maccsfp_10  ...  maccsfp_158  \
NSC                                               ...                
740          0          0          0           0  ...            1   
750          0          0          0           0  ...            1   
752          0          0          0      

### 2. Descriptors

In [9]:
# This is the code from https://greglandrum.github.io/rdkit-blog/posts/2022-12-23-descriptor-tutorial.html
from rdkit import Chem
import rdkit
from rdkit.Chem import Descriptors
# rdkit.__version__

def getMolDescriptors(mol, missingVal=None):
    ''' calculate the full list of descriptors for a molecule
    
        missingVal is used if the descriptor cannot be calculated
    '''
    res = {}
    for nm,fn in Descriptors._descList:
        # some of the descriptor fucntions can throw errors if they fail, catch those here:
        try:
            val = fn(mol)
        except:
            # print the error message:
            import traceback
            traceback.print_exc()
            # and set the descriptor value to whatever missingVal is
            val = missingVal
        res[nm] = val
    return res

# Convert SMILES strings to RDKit molecules
mols = [Chem.MolFromSmiles(smiles) for smiles in ms]

descs = [getMolDescriptors(m) for m in mols]
print(len(descs))
print(len(descs[0]))
print(len(descs[1]))

102
208
208


In [10]:
import pandas as pd

# Read the NSC_DrugName_SMILES.csv file to get 'NSC' values
nsc_df = pd.read_csv('NSC_DrugName_SMILES.csv')

# Set 'NSC' column as index
nsc_df.set_index('NSC', inplace=True)

# Create a DataFrame from the reshaped array with 'NSC' values as index
df = pd.DataFrame(descs, index=nsc_df.index)

print(df.info())
print(df.head())
# print(df.tail())

# Save the DataFrame to a CSV file
df.to_csv('descriptors.csv')

<class 'pandas.core.frame.DataFrame'>
Index: 102 entries, 740 to 763371
Columns: 208 entries, MaxEStateIndex to fr_urea
dtypes: float64(104), int64(104)
memory usage: 166.5 KB
None
     MaxEStateIndex  MinEStateIndex  MaxAbsEStateIndex  MinAbsEStateIndex  \
NSC                                                                         
740       12.370387       -1.299841          12.370387           0.013755   
750       10.473841       -3.398778          10.473841           0.047575   
752        5.396667        0.286667           5.396667           0.286667   
755        4.905046        0.547454           4.905046           0.547454   
762        5.447377        0.000000           5.447377           0.000000   

          qed    MolWt  HeavyAtomMolWt  ExactMolWt  NumValenceElectrons  \
NSC                                                                       
740  0.294720  454.447         432.271  454.171316                  172   
750  0.453313  246.306         232.194  246.023180    

In [7]:
# Concatenate four dataframes, make it as "102DrugDescriptor.csv"

import pandas as pd

# Read the four CSV files
df_mfp_bv = pd.read_csv("mfp_bv.csv", index_col=0)
df_mfp_cv = pd.read_csv("mfp_cv.csv", index_col=0)
df_maccsfp = pd.read_csv("maccsfp.csv", index_col=0)
df_descriptors = pd.read_csv("descriptors.csv", index_col=0)

# Concatenate the dataframes along the columns axis
concatenated_df = pd.concat([df_mfp_bv, df_mfp_cv, df_maccsfp, df_descriptors], axis=1)

# Display information about the concatenated dataframe
print(concatenated_df.shape)
print(concatenated_df.head())

# Save the dataframe to a new CSV file
concatenated_df.to_csv("102DrugDescriptor.csv")

(102, 4471)
     mfp_bv_1  mfp_bv_2  mfp_bv_3  mfp_bv_4  mfp_bv_5  mfp_bv_6  mfp_bv_7  \
NSC                                                                         
740         0         1         0         0         0         0         0   
750         0         0         0         0         0         0         0   
752         0         0         0         0         0         0         0   
755         0         0         0         0         0         0         0   
762         0         0         0         0         0         0         0   

     mfp_bv_8  mfp_bv_9  mfp_bv_10  ...  fr_sulfide  fr_sulfonamd  fr_sulfone  \
NSC                                 ...                                         
740         0         0          0  ...           0             0           0   
750         0         0          0  ...           0             0           0   
752         0         0          0  ...           0             0           0   
755         0         0          0  ...    