<a href="https://colab.research.google.com/github/Madhuanabala/breast-cancer/blob/mol-descriptors-and-fp/cdkextended.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
! wget https://github.com/dataprofessor/padel/raw/main/fingerprints_xml.zip
! unzip -o fingerprints_xml.zip

--2025-01-20 10:59:54--  https://github.com/dataprofessor/padel/raw/main/fingerprints_xml.zip
Resolving github.com (github.com)... 140.82.114.4
Connecting to github.com (github.com)|140.82.114.4|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/dataprofessor/padel/main/fingerprints_xml.zip [following]
--2025-01-20 10:59:54--  https://raw.githubusercontent.com/dataprofessor/padel/main/fingerprints_xml.zip
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 10871 (11K) [application/zip]
Saving to: ‘fingerprints_xml.zip’


2025-01-20 10:59:55 (61.3 MB/s) - ‘fingerprints_xml.zip’ saved [10871/10871]

Archive:  fingerprints_xml.zip
  inflating: AtomPairs2DFingerprintCount.xml  
  inflating: AtomPairs2DFin

In [3]:
import glob

# Collect every XML file in the current working directory, in sorted order,
# and show the resulting list.
xml_files = sorted(glob.glob("*.xml"))
xml_files

['AtomPairs2DFingerprintCount.xml',
 'AtomPairs2DFingerprinter.xml',
 'EStateFingerprinter.xml',
 'ExtendedFingerprinter.xml',
 'Fingerprinter.xml',
 'GraphOnlyFingerprinter.xml',
 'KlekotaRothFingerprintCount.xml',
 'KlekotaRothFingerprinter.xml',
 'MACCSFingerprinter.xml',
 'PubchemFingerprinter.xml',
 'SubstructureFingerprintCount.xml',
 'SubstructureFingerprinter.xml']

In [4]:
# Short labels for the fingerprint types. The order matters: the labels are
# later paired position-by-position with the sorted XML file names.
FP_list = [
    "AtomPairs2DCount",
    "AtomPairs2D",
    "EState",
    "CDKextended",
    "CDK",
    "CDKgraphonly",
    "KlekotaRothCount",
    "KlekotaRoth",
    "MACCS",
    "PubChem",
    "SubstructureCount",
    "Substructure",
]

In [5]:
# Map each short fingerprint label to its descriptor XML file.
# The pairing is purely positional (FP_list[i] <-> xml_files[i]) and zip()
# stops at the shorter input, so an extra or missing *.xml file in the working
# directory would silently drop or mislabel entries. Fail loudly instead.
if len(FP_list) != len(xml_files):
    raise ValueError(
        f"Expected {len(FP_list)} fingerprint XML files but found "
        f"{len(xml_files)}: {xml_files}"
    )
fp = dict(zip(FP_list, xml_files))
fp

{'AtomPairs2DCount': 'AtomPairs2DFingerprintCount.xml',
 'AtomPairs2D': 'AtomPairs2DFingerprinter.xml',
 'EState': 'EStateFingerprinter.xml',
 'CDKextended': 'ExtendedFingerprinter.xml',
 'CDK': 'Fingerprinter.xml',
 'CDKgraphonly': 'GraphOnlyFingerprinter.xml',
 'KlekotaRothCount': 'KlekotaRothFingerprintCount.xml',
 'KlekotaRoth': 'KlekotaRothFingerprinter.xml',
 'MACCS': 'MACCSFingerprinter.xml',
 'PubChem': 'PubchemFingerprinter.xml',
 'SubstructureCount': 'SubstructureFingerprintCount.xml',
 'Substructure': 'SubstructureFingerprinter.xml'}

In [6]:
# Load the filtered bioactivity table and print its column summary.
bioactivity_csv_path = '/content/bcfiltered_bioactivity_data.csv'
df = pd.read_csv(bioactivity_csv_path)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1817 entries, 0 to 1816
Data columns (total 8 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   molecule_chembl_id  1817 non-null   object 
 1   canonical_smiles    1817 non-null   object 
 2   bioactivity_class   1817 non-null   object 
 3   MW                  1817 non-null   float64
 4   LogP                1817 non-null   float64
 5   NumHDonors          1817 non-null   float64
 6   NumHAcceptors       1817 non-null   float64
 7   pIC50               1817 non-null   float64
dtypes: float64(5), object(3)
memory usage: 113.7+ KB


In [7]:
# Keep only the SMILES string and the ChEMBL ID (in that order), then write
# them as a headerless, tab-separated file named molecule.smi.
smiles_and_id_columns = ['canonical_smiles', 'molecule_chembl_id']
df = df[smiles_and_id_columns]
df.to_csv('molecule.smi', sep='\t', header=False, index=False)
df

Unnamed: 0,canonical_smiles,molecule_chembl_id
0,CC/C(=C(/c1ccc(O)cc1)c1ccc(OCCN(C)CCOCCO/N=C/c...,CHEMBL266703
1,CC/C(=C(/c1ccc(O)cc1)c1ccc(OCN(C)C)cc1)c1ccccc...,CHEMBL2145445
2,CC/C(=C(/c1ccc(O)cc1)c1ccc(OCCN(C)CCO/N=C/c2cc...,CHEMBL266185
3,CC/C(=C(/c1ccc(O)cc1)c1ccc(OCN(C)C)cc1)c1ccccc1,CHEMBL19195
4,CC/C(=C(\c1ccccc1)c1ccc(OCCN(C)C)cc1)c1ccccc1,CHEMBL83
...,...,...
1812,CC[C@@]12CCCN3C(=O)C=C4c5ccccc5N(C(=O)CC1)[C@]432,CHEMBL5440580
1813,CC[C@@]12CCCN3C(=O)C(Cl)=C4c5ccccc5N(C(=O)CC1)...,CHEMBL5417545
1814,O=C(NNc1nc2cc(F)ccc2n2cccc12)c1cnccn1,CHEMBL225542
1815,Cc1ccc(Sc2cccc3[nH]c4nc(N)nc(N)c4c23)cc1,CHEMBL1093100


In [8]:
# Display the fingerprint-label -> XML-file mapping again; nothing is modified.
fp

{'AtomPairs2DCount': 'AtomPairs2DFingerprintCount.xml',
 'AtomPairs2D': 'AtomPairs2DFingerprinter.xml',
 'EState': 'EStateFingerprinter.xml',
 'CDKextended': 'ExtendedFingerprinter.xml',
 'CDK': 'Fingerprinter.xml',
 'CDKgraphonly': 'GraphOnlyFingerprinter.xml',
 'KlekotaRothCount': 'KlekotaRothFingerprintCount.xml',
 'KlekotaRoth': 'KlekotaRothFingerprinter.xml',
 'MACCS': 'MACCSFingerprinter.xml',
 'PubChem': 'PubchemFingerprinter.xml',
 'SubstructureCount': 'SubstructureFingerprintCount.xml',
 'Substructure': 'SubstructureFingerprinter.xml'}

In [9]:
!pip install padelpy

Collecting padelpy
  Downloading padelpy-0.1.16-py3-none-any.whl.metadata (7.7 kB)
Downloading padelpy-0.1.16-py3-none-any.whl (20.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.9/20.9 MB[0m [31m67.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: padelpy
Successfully installed padelpy-0.1.16


In [11]:
from padelpy import padeldescriptor

# Fingerprint type to compute; must be one of the keys of `fp`.
fingerprint = 'CDKextended'

# Results go to "<fingerprint>.csv" (here: CDKextended.csv); the descriptor
# definition is the XML file mapped to the chosen fingerprint label.
fingerprint_output_file = f'{fingerprint}.csv'
fingerprint_descriptortypes = fp[fingerprint]

# Run PaDEL on the SMILES file written earlier (molecule.smi).
# retainorder=True asks PaDEL to write rows in the same order as the input
# file, so the result can be aligned with the source table by position even
# though two worker threads are used.
padeldescriptor(mol_dir='molecule.smi',
                d_file=fingerprint_output_file,
                descriptortypes=fingerprint_descriptortypes,
                detectaromaticity=True,
                standardizenitro=True,
                standardizetautomers=True,
                threads=2,
                removesalt=True,
                retainorder=True,
                log=True,
                fingerprints=True)

In [12]:
# Read back the fingerprint CSV produced by the padeldescriptor call and
# display it.
descriptors = pd.read_csv(filepath_or_buffer=fingerprint_output_file)
descriptors

Unnamed: 0,Name,ExtFP1,ExtFP2,ExtFP3,ExtFP4,ExtFP5,ExtFP6,ExtFP7,ExtFP8,ExtFP9,...,ExtFP1015,ExtFP1016,ExtFP1017,ExtFP1018,ExtFP1019,ExtFP1020,ExtFP1021,ExtFP1022,ExtFP1023,ExtFP1024
0,CHEMBL266703,1,0,0,0,1,0,0,0,1,...,1,1,1,0,0,0,0,0,0,0
1,CHEMBL2145445,1,0,0,0,1,0,0,0,1,...,1,1,1,0,0,0,0,0,0,0
2,CHEMBL266185,1,0,0,0,1,0,0,0,1,...,1,1,1,0,0,0,0,0,0,0
3,CHEMBL19195,0,0,0,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,CHEMBL83,0,0,0,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1812,CHEMBL5440580,1,0,1,0,0,1,0,0,1,...,1,1,1,1,0,0,0,0,0,0
1813,CHEMBL5417545,1,0,1,0,0,1,0,0,1,...,1,1,1,1,0,0,0,0,0,0
1814,CHEMBL225542,1,0,1,0,0,0,0,0,0,...,1,1,0,0,0,0,0,0,0,0
1815,CHEMBL1093100,0,0,1,0,0,0,1,0,0,...,1,1,0,0,0,0,0,0,0,0
