<a href="https://colab.research.google.com/github/Madhuanabala/breast-cancer/blob/mol-descriptors-and-fp/RDkit_discriptors.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install rdkit-pypi

Collecting rdkit-pypi
  Downloading rdkit_pypi-2022.9.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.9 kB)
Downloading rdkit_pypi-2022.9.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (29.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m29.4/29.4 MB[0m [31m15.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rdkit-pypi
Successfully installed rdkit-pypi-2022.9.5


In [2]:
!pip install mordred

Collecting mordred
  Downloading mordred-1.2.0.tar.gz (128 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m128.8/128.8 kB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting networkx==2.* (from mordred)
  Downloading networkx-2.8.8-py3-none-any.whl.metadata (5.1 kB)
Downloading networkx-2.8.8-py3-none-any.whl (2.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m64.1 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: mordred
  Building wheel for mordred (setup.py) ... [?25l[?25hdone
  Created wheel for mordred: filename=mordred-1.2.0-py3-none-any.whl size=176718 sha256=00064e2bca231b0b2f247b6a91509b042c40f737d2bca7e6ffae9834bcfd66a6
  Stored in directory: /root/.cache/pip/wheels/8b/30/0b/84e3f6775306e74cf5957ee4d16b10bf3927dcec44cc23d5f2
Successfully built mordred
Installing collected packages: networkx, mordred
  Attempting uninstall: networ

In [3]:
from rdkit.Chem import AllChem
from rdkit import Chem
from rdkit.Chem import Descriptors
from rdkit.ML.Descriptors import MoleculeDescriptors

import pandas as pd
import numpy as np
from tqdm import tqdm
import os

In [4]:
# Read the bioactivity CSV into a DataFrame.
dataset = pd.read_csv('/content/bcfiltered_bioactivity_data.csv')

In [5]:
# Keep only rows that have a SMILES string; descriptors cannot be computed without one.
df = dataset.dropna(subset=['canonical_smiles'])

In [6]:
# Show the filtered frame as the cell's output.
df

Unnamed: 0,molecule_chembl_id,canonical_smiles,bioactivity_class,MW,LogP,NumHDonors,NumHAcceptors,pIC50
0,CHEMBL266703,CC/C(=C(/c1ccc(O)cc1)c1ccc(OCCN(C)CCOCCO/N=C/c...,active,1179.286,6.17790,9.0,20.0,7.397940
1,CHEMBL2145445,CC/C(=C(/c1ccc(O)cc1)c1ccc(OCN(C)C)cc1)c1ccccc...,active,917.021,5.66050,7.0,15.0,7.000000
2,CHEMBL266185,CC/C(=C(/c1ccc(O)cc1)c1ccc(OCCN(C)CCO/N=C/c2cc...,active,1135.233,6.16130,9.0,19.0,7.221849
3,CHEMBL19195,CC/C(=C(/c1ccc(O)cc1)c1ccc(OCN(C)C)cc1)c1ccccc1,inactive,373.496,5.65920,1.0,3.0,4.698970
4,CHEMBL83,CC/C(=C(\c1ccccc1)c1ccc(OCCN(C)C)cc1)c1ccccc1,active,371.524,5.99610,0.0,2.0,7.301030
...,...,...,...,...,...,...,...,...
1812,CHEMBL5440580,CC[C@@]12CCCN3C(=O)C=C4c5ccccc5N(C(=O)CC1)[C@]432,inactive,308.381,2.93910,0.0,2.0,5.000000
1813,CHEMBL5417545,CC[C@@]12CCCN3C(=O)C(Cl)=C4c5ccccc5N(C(=O)CC1)...,inactive,342.826,3.50560,0.0,2.0,5.000000
1814,CHEMBL225542,O=C(NNc1nc2cc(F)ccc2n2cccc12)c1cnccn1,active,322.303,2.17350,2.0,6.0,6.397940
1815,CHEMBL1093100,Cc1ccc(Sc2cccc3[nH]c4nc(N)nc(N)c4c23)cc1,inactive,321.409,3.73512,3.0,5.0,4.924453


In [7]:
def RDkit_descriptors(smiles):
    """Compute every descriptor listed in ``Descriptors._descList`` for each SMILES.

    Parameters
    ----------
    smiles : iterable of str
        SMILES strings to featurise.

    Returns
    -------
    Mol_descriptors : list
        One sequence of descriptor values per input, in input order. A SMILES
        that RDKit cannot parse yields a row of NaN instead of raising, so the
        result stays row-aligned with ``smiles``.
    desc_names : sequence of str
        Descriptor names, in the same order as the values in each row.
    """
    calc = MoleculeDescriptors.MolecularDescriptorCalculator([x[0] for x in Descriptors._descList])
    desc_names = calc.GetDescriptorNames()
    # Placeholder row for unparsable SMILES; same width as a real descriptor row.
    missing_row = (float('nan'),) * len(desc_names)

    Mol_descriptors = []
    for smi in smiles:
        mol = Chem.MolFromSmiles(smi)
        if mol is None:
            # MolFromSmiles signals a parse failure by returning None; passing
            # that to AddHs would raise and abort the whole batch.
            Mol_descriptors.append(missing_row)
            continue
        # Add explicit hydrogens before computing the descriptors.
        mol = Chem.AddHs(mol)
        Mol_descriptors.append(calc.CalcDescriptors(mol))
    return Mol_descriptors, desc_names

# Split the input into chunks of 100,000 rows so results are checkpointed to
# disk after each chunk and an interrupted run can resume.
chunk_size = 100000
chunks = [df[i:i+chunk_size] for i in range(0, len(df), chunk_size)]

total_chunks=len(chunks)
total_time=0

# Reload results from a previous run, if an output file exists
if os.path.isfile('RDkit_descriptors.csv'):
    existing_data = pd.read_csv('RDkit_descriptors.csv', index_col=0)
else:
    existing_data = pd.DataFrame()

# Chunk indices already present in the saved output. Resuming is decided per
# chunk: comparing the total number of saved rows with the current chunk's
# length would skip every chunk after the first one had been written.
if 'chunk_index' in existing_data.columns:
    processed_chunks = set(existing_data['chunk_index'].tolist())
else:
    processed_chunks = set()

# Calculate descriptors for each chunk and concatenate the results
for i, chunk in enumerate(tqdm(chunks, desc='Processing', total=total_chunks)):
    # Skip chunks whose results were already saved by an earlier run
    if i in processed_chunks:
        continue
    # Calculate descriptors for this chunk
    descriptors, desc_names = RDkit_descriptors(chunk['canonical_smiles'])
    # Convert the descriptors to a dataframe
    df_with_200_descriptors = pd.DataFrame(descriptors, columns=desc_names)
    # Record which chunk each row came from (also used by the resume check)
    df_with_200_descriptors['chunk_index'] = i
    # Append the data to the existing data
    existing_data = pd.concat([existing_data, df_with_200_descriptors], axis=0)
    # Save the data after each chunk so progress survives an interruption
    existing_data.to_csv('RDkit_descriptors.csv')

# Save the final data
existing_data.to_csv('RDkit_descriptors.csv')



Processing: 100%|██████████| 1/1 [02:03<00:00, 123.20s/it]
