# **Calculating molecule fingerprint descriptors**

In [1]:
## Tasks to be performed
## Step 1: Install the Padelpy library
## Step 2: Prepare the fingerprint.xml file
## Step 3: Import the treated dataset
## Step 4: Prepare the data subset for input into padelpy
## Step 5: Calculate fingerprint descriptors
## Step 6: View the calculated descriptors
## Step 7: Save the dataset

### **Step 1: Install the Padelpy library**

In [2]:
! pip install padelpy

Collecting padelpy
  Downloading padelpy-0.1.14-py2.py3-none-any.whl.metadata (7.7 kB)
Downloading padelpy-0.1.14-py2.py3-none-any.whl (20.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.9/20.9 MB[0m [31m55.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: padelpy
Successfully installed padelpy-0.1.14


### **Step 2: Prepare the fingerprint.xml file**

In [3]:
! wget https://github.com/dataprofessor/padel/raw/main/fingerprints_xml.zip
! unzip fingerprints_xml.zip

--2024-12-21 07:39:56--  https://github.com/dataprofessor/padel/raw/main/fingerprints_xml.zip
Resolving github.com (github.com)... 140.82.116.3
Connecting to github.com (github.com)|140.82.116.3|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/dataprofessor/padel/main/fingerprints_xml.zip [following]
--2024-12-21 07:39:56--  https://raw.githubusercontent.com/dataprofessor/padel/main/fingerprints_xml.zip
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.108.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 10871 (11K) [application/zip]
Saving to: ‘fingerprints_xml.zip’


2024-12-21 07:39:56 (53.3 MB/s) - ‘fingerprints_xml.zip’ saved [10871/10871]

Archive:  fingerprints_xml.zip
  inflating: AtomPairs2DFingerprintCount.xml  
  inflating: AtomPairs2DFin

### **2.1. Create a list and organize xml files**

In [4]:
import glob

# Collect every XML file in the working directory, in alphabetical order.
arquivos_xml = sorted(glob.glob("*.xml"))
arquivos_xml

['AtomPairs2DFingerprintCount.xml',
 'AtomPairs2DFingerprinter.xml',
 'EStateFingerprinter.xml',
 'ExtendedFingerprinter.xml',
 'Fingerprinter.xml',
 'GraphOnlyFingerprinter.xml',
 'KlekotaRothFingerprintCount.xml',
 'KlekotaRothFingerprinter.xml',
 'MACCSFingerprinter.xml',
 'PubchemFingerprinter.xml',
 'SubstructureFingerprintCount.xml',
 'SubstructureFingerprinter.xml']

In [5]:
# Short fingerprint names, listed in the same order as the sorted XML files
# so the two lists can be paired position by position.
lista_FP = [
    'AtomPairs2DCount', 'AtomPairs2D',
    'EState',
    'CDKextended', 'CDK', 'CDKgraphonly',
    'KlekotaRothCount', 'KlekotaRoth',
    'MACCS',
    'PubChem',
    'SubstructureCount', 'Substructure',
]

### **Creating a dictionary**

In [6]:

# Pair each short fingerprint name with its XML file, position by position.
# zip() silently truncates when the lists differ in length (for example when
# an unrelated *.xml file sits in the working directory), which would produce
# a wrong or incomplete mapping, so check the lengths first.
assert len(lista_FP) == len(arquivos_xml), (
    f"expected {len(lista_FP)} fingerprint XML files, found {len(arquivos_xml)}"
)
fp = dict(zip(lista_FP, arquivos_xml))
fp

{'AtomPairs2DCount': 'AtomPairs2DFingerprintCount.xml',
 'AtomPairs2D': 'AtomPairs2DFingerprinter.xml',
 'EState': 'EStateFingerprinter.xml',
 'CDKextended': 'ExtendedFingerprinter.xml',
 'CDK': 'Fingerprinter.xml',
 'CDKgraphonly': 'GraphOnlyFingerprinter.xml',
 'KlekotaRothCount': 'KlekotaRothFingerprintCount.xml',
 'KlekotaRoth': 'KlekotaRothFingerprinter.xml',
 'MACCS': 'MACCSFingerprinter.xml',
 'PubChem': 'PubchemFingerprinter.xml',
 'SubstructureCount': 'SubstructureFingerprintCount.xml',
 'Substructure': 'SubstructureFingerprinter.xml'}

In [7]:
# Spot-check the mapping: look up the XML file registered under 'PubChem'.
fp['PubChem']

'PubchemFingerprinter.xml'

### **Step 3: Import the treated dataset**

In [8]:
# Upload the treated dataset through Colab's file helper; the google.colab
# import means this cell only runs inside Google Colab.
from google.colab import files
# NOTE(review): `ploaded` looks like a typo for `uploaded`; the value is not
# used in the visible part of the notebook — confirm before renaming.
ploaded = files.upload()

Saving PART0 3 LEPRA_3classes.csv to PART0 3 LEPRA_3classes.csv


In [9]:
import pandas as pd

# Read the uploaded CSV into a DataFrame and show it.
df = pd.read_csv(filepath_or_buffer="PART0 3 LEPRA_3classes.csv")
df

Unnamed: 0.1,Unnamed: 0,molecule_chembl_id,canonical_smiles,bioactivity_class,MW,LogP,NumHDonors,NumHAcceptors,pIC50
0,0,CHEMBL320553,Cc1oc(-c2ccccc2)nc1CCOc1ccc(C[C@](C)(Oc2ccccc2...,Active,457.526,5.73632,1.0,5.0,7.853872
1,1,CHEMBL149676,Cc1oc(-c2ccccc2)nc1CCOc1ccc(CC(C)(Oc2ccccc2)C(...,Intermediate,457.526,5.73632,1.0,5.0,7.744727
2,2,CHEMBL344282,Cc1oc(-c2ccccc2)nc1CCOc1ccc(CC(Oc2ccccc2)C(=O)...,Intermediate,443.499,5.34622,1.0,5.0,6.892790
3,3,CHEMBL278590,Cc1oc(C2CCCCC2)nc1CCOc1ccc(C[C@](C)(Oc2ccccc2)...,Active,463.574,6.11702,1.0,5.0,7.823909
4,4,CHEMBL424133,Cc1oc(-c2cccs2)nc1CCOc1ccc(C[C@](C)(Oc2ccccc2)...,Intermediate,463.555,5.79782,1.0,6.0,8.000000
...,...,...,...,...,...,...,...,...,...
1892,1892,CHEMBL278501,COc1ccccc1CCC1(O)C(C)=C[C@@H](OC(C)=O)[C@@]2(C...,Inactive,710.861,5.30690,1.0,11.0,6.847712
1893,1893,CHEMBL265334,CCOc1ccccc1CCC1(O)C(C)=C[C@@H](OC(C)=O)[C@@]2(...,Inactive,724.888,5.69700,1.0,11.0,7.468521
1894,1894,CHEMBL16428,CCOc1ccccc1CCC1(O)C(C)=C[C@@H](OC(C)=O)[C@@]2(...,Inactive,724.888,5.69700,1.0,11.0,7.096910
1895,1895,CHEMBL360583,COc1cccc(CCC2(O)C(C)=C[C@@H](OC(C)=O)[C@@]3(C)...,Intermediate,710.861,5.30690,1.0,11.0,7.619789


## **Step 4: Prepare the data subset for input into padelpy**

In [10]:
# Keep only the SMILES string and the molecule ID, in that order, as an
# independent two-column frame.
df2 = df[['canonical_smiles', 'molecule_chembl_id']].copy()

# Write the subset as a tab-separated file with no header row and no index
# column; this is the molecule.smi file passed to padeldescriptor later on.
df2.to_csv('molecule.smi', header=False, index=False, sep='\t')
df2

Unnamed: 0,canonical_smiles,molecule_chembl_id
0,Cc1oc(-c2ccccc2)nc1CCOc1ccc(C[C@](C)(Oc2ccccc2...,CHEMBL320553
1,Cc1oc(-c2ccccc2)nc1CCOc1ccc(CC(C)(Oc2ccccc2)C(...,CHEMBL149676
2,Cc1oc(-c2ccccc2)nc1CCOc1ccc(CC(Oc2ccccc2)C(=O)...,CHEMBL344282
3,Cc1oc(C2CCCCC2)nc1CCOc1ccc(C[C@](C)(Oc2ccccc2)...,CHEMBL278590
4,Cc1oc(-c2cccs2)nc1CCOc1ccc(C[C@](C)(Oc2ccccc2)...,CHEMBL424133
...,...,...
1892,COc1ccccc1CCC1(O)C(C)=C[C@@H](OC(C)=O)[C@@]2(C...,CHEMBL278501
1893,CCOc1ccccc1CCC1(O)C(C)=C[C@@H](OC(C)=O)[C@@]2(...,CHEMBL265334
1894,CCOc1ccccc1CCC1(O)C(C)=C[C@@H](OC(C)=O)[C@@]2(...,CHEMBL16428
1895,COc1cccc(CCC2(O)C(C)=C[C@@H](OC(C)=O)[C@@]3(C)...,CHEMBL360583


## **Step 5: Calculate fingerprint descriptors**

In [11]:
# Re-display the fingerprint-name -> XML-file mapping before picking a key below.
fp

{'AtomPairs2DCount': 'AtomPairs2DFingerprintCount.xml',
 'AtomPairs2D': 'AtomPairs2DFingerprinter.xml',
 'EState': 'EStateFingerprinter.xml',
 'CDKextended': 'ExtendedFingerprinter.xml',
 'CDK': 'Fingerprinter.xml',
 'CDKgraphonly': 'GraphOnlyFingerprinter.xml',
 'KlekotaRothCount': 'KlekotaRothFingerprintCount.xml',
 'KlekotaRoth': 'KlekotaRothFingerprinter.xml',
 'MACCS': 'MACCSFingerprinter.xml',
 'PubChem': 'PubchemFingerprinter.xml',
 'SubstructureCount': 'SubstructureFingerprintCount.xml',
 'Substructure': 'SubstructureFingerprinter.xml'}

In [12]:
# Compute the selected fingerprint for the molecules in molecule.smi.
# PubChem is chosen here; any other key of `fp` can be used instead.

from padelpy import padeldescriptor

fingerprint = 'PubChem'

fingerprint_output_file = f'{fingerprint}.csv'  # -> 'PubChem.csv'
fingerprint_descriptortypes = fp[fingerprint]   # XML file registered for this fingerprint

padeldescriptor(
    # Input and output files
    mol_dir='molecule.smi',
    d_file=fingerprint_output_file,
    # What to calculate
    descriptortypes=fingerprint_descriptortypes,
    fingerprints=True,
    # Structure pre-processing switches
    detectaromaticity=True,
    standardizenitro=True,
    standardizetautomers=True,
    removesalt=True,
    # Execution settings
    threads=2,
    log=True,
)

## **Step 6: View the calculated descriptors**

In [13]:
# Load the CSV named by fingerprint_output_file and show the result.
descritores = pd.read_csv(filepath_or_buffer=fingerprint_output_file)
descritores

Unnamed: 0,Name,PubchemFP0,PubchemFP1,PubchemFP2,PubchemFP3,PubchemFP4,PubchemFP5,PubchemFP6,PubchemFP7,PubchemFP8,...,PubchemFP871,PubchemFP872,PubchemFP873,PubchemFP874,PubchemFP875,PubchemFP876,PubchemFP877,PubchemFP878,PubchemFP879,PubchemFP880
0,CHEMBL320553,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,CHEMBL149676,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,CHEMBL344282,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,CHEMBL278590,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,CHEMBL424133,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1892,CHEMBL278501,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1893,CHEMBL265334,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1894,CHEMBL16428,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1895,CHEMBL360583,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## **Step 7: Save the dataset**

In [14]:
# Save the fingerprint table. index=False stops pandas from writing the row
# index as an extra unnamed column, which would come back as 'Unnamed: 0'
# when the file is read again with pd.read_csv.
descritores.to_csv("eNOS-pubchem_LEPRA.csv", index=False)