<a href="https://colab.research.google.com/github/KarimeZeraik/QSAR-and-ML/blob/main/QSAR_Trypanosoma_3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Calculating molecule fingerprint descriptors

## Install the Padelpy library

In [None]:
! pip install padelpy

Collecting padelpy
  Downloading padelpy-0.1.14-py2.py3-none-any.whl (20.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.9/20.9 MB[0m [31m40.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: padelpy
Successfully installed padelpy-0.1.14


## Prepare the fingerprint XML files

In [None]:
! wget https://github.com/dataprofessor/padel/raw/main/fingerprints_xml.zip
! unzip fingerprints_xml.zip

--2024-06-16 18:55:39--  https://github.com/dataprofessor/padel/raw/main/fingerprints_xml.zip
Resolving github.com (github.com)... 140.82.113.3
Connecting to github.com (github.com)|140.82.113.3|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/dataprofessor/padel/main/fingerprints_xml.zip [following]
--2024-06-16 18:55:39--  https://raw.githubusercontent.com/dataprofessor/padel/main/fingerprints_xml.zip
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 10871 (11K) [application/zip]
Saving to: ‘fingerprints_xml.zip’


2024-06-16 18:55:39 (56.9 MB/s) - ‘fingerprints_xml.zip’ saved [10871/10871]

Archive:  fingerprints_xml.zip
  inflating: AtomPairs2DFingerprintCount.xml  
  inflating: AtomPairs2DFin

## Create a sorted list of the XML files

In [None]:
import glob

# Gather every XML file in the working directory in alphabetical order;
# the order matters because the list is later paired by position with lista_FP.
arquivos_xml = sorted(glob.glob("*.xml"))
arquivos_xml

['AtomPairs2DFingerprintCount.xml',
 'AtomPairs2DFingerprinter.xml',
 'EStateFingerprinter.xml',
 'ExtendedFingerprinter.xml',
 'Fingerprinter.xml',
 'GraphOnlyFingerprinter.xml',
 'KlekotaRothFingerprintCount.xml',
 'KlekotaRothFingerprinter.xml',
 'MACCSFingerprinter.xml',
 'PubchemFingerprinter.xml',
 'SubstructureFingerprintCount.xml',
 'SubstructureFingerprinter.xml']

In [None]:
# Short fingerprint names, written in the same order as the alphabetically
# sorted XML file names so that the two lists can be paired by position.
lista_FP = [
    'AtomPairs2DCount',
    'AtomPairs2D',
    'EState',
    'CDKextended',
    'CDK',
    'CDKgraphonly',
    'KlekotaRothCount',
    'KlekotaRoth',
    'MACCS',
    'PubChem',
    'SubstructureCount',
    'Substructure',
]

### Create a dictionary mapping fingerprint names to XML files

In [None]:

# Pair each short fingerprint name with its descriptor XML file by position.
# zip() stops silently at the shorter input, so a missing or stray *.xml file
# in the working directory would give a truncated or shifted mapping.
# Fail loudly instead of building a wrong dictionary.
if len(lista_FP) != len(arquivos_xml):
    raise ValueError(
        "Expected {} fingerprint XML files but found {}: {}".format(
            len(lista_FP), len(arquivos_xml), arquivos_xml
        )
    )

fp = dict(zip(lista_FP, arquivos_xml))
fp

{'AtomPairs2DCount': 'AtomPairs2DFingerprintCount.xml',
 'AtomPairs2D': 'AtomPairs2DFingerprinter.xml',
 'EState': 'EStateFingerprinter.xml',
 'CDKextended': 'ExtendedFingerprinter.xml',
 'CDK': 'Fingerprinter.xml',
 'CDKgraphonly': 'GraphOnlyFingerprinter.xml',
 'KlekotaRothCount': 'KlekotaRothFingerprintCount.xml',
 'KlekotaRoth': 'KlekotaRothFingerprinter.xml',
 'MACCS': 'MACCSFingerprinter.xml',
 'PubChem': 'PubchemFingerprinter.xml',
 'SubstructureCount': 'SubstructureFingerprintCount.xml',
 'Substructure': 'SubstructureFingerprinter.xml'}

In [None]:
# Spot-check the mapping: show the XML file registered under the 'PubChem' key.
fp['PubChem']

'PubchemFingerprinter.xml'

### Import the preprocessed dataset

In [None]:
# Upload the dataset into the Colab session; the recorded output of this cell
# shows dataset_3classes.csv being saved to the working directory.
from google.colab import files
# NOTE(review): `ploaded` (presumably meant to be `uploaded`) is not referenced
# again in the cells shown here; the file is read back from disk by name.
ploaded = files.upload()

Saving dataset_3classes.csv to dataset_3classes.csv


In [None]:
import pandas as pd
# Read the uploaded dataset from the working directory.
df = pd.read_csv("dataset_3classes.csv")
# NOTE(review): the recorded preview shows 'Unnamed: 0', 'Unnamed: 0.1' and
# 'Unnamed: 0.2' columns, presumably row indexes written by earlier to_csv
# calls; confirm whether they should be dropped.
display(df.head())

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,molecule_chembl_id,canonical_smiles,classe_bioatividade,MW,LogP,NumHDonors,NumHAcceptors,pIC50
0,0,0,CHEMBL55,N=C(N)c1ccc(OCCCCCOc2ccc(C(=N)N)cc2)cc1,Active,340.427,2.88284,4.0,4.0,7.455932
1,1,1,CHEMBL9126,N=C(N)c1ccc(N2CCN(c3ccc(C(=N)N)cc3)CC2)cc1,Active,322.416,1.58134,4.0,4.0,7.236572
2,2,2,CHEMBL9126,N=C(N)c1ccc(N2CCN(c3ccc(C(=N)N)cc3)CC2)cc1,Active,322.416,1.58134,4.0,4.0,7.79588
3,3,3,CHEMBL9126,N=C(N)c1ccc(N2CCN(c3ccc(C(=N)N)cc3)CC2)cc1,Active,322.416,1.58134,4.0,4.0,7.782516
4,4,4,CHEMBL267154,C(=C1/CCCN(c2ccc(C3=NCCN3)cc2)C1)\c1ccc(C2=NCC...,Active,385.515,3.07,2.0,5.0,7.148742


### Prepare the data subset for input into padelpy

In [None]:
# Keep only the two columns needed for the descriptor calculation, SMILES
# first and ChEMBL ID second, and write them as a tab-separated file with no
# header and no index. 'molecule.smi' is the input handed to padeldescriptor
# later in the notebook.
colunas_smi = ['canonical_smiles', 'molecule_chembl_id']
df2 = df[colunas_smi].copy()
df2.to_csv('molecule.smi', sep='\t', header=False, index=False)
df2

Unnamed: 0,canonical_smiles,molecule_chembl_id
0,N=C(N)c1ccc(OCCCCCOc2ccc(C(=N)N)cc2)cc1,CHEMBL55
1,N=C(N)c1ccc(N2CCN(c3ccc(C(=N)N)cc3)CC2)cc1,CHEMBL9126
2,N=C(N)c1ccc(N2CCN(c3ccc(C(=N)N)cc3)CC2)cc1,CHEMBL9126
3,N=C(N)c1ccc(N2CCN(c3ccc(C(=N)N)cc3)CC2)cc1,CHEMBL9126
4,C(=C1/CCCN(c2ccc(C3=NCCN3)cc2)C1)\c1ccc(C2=NCC...,CHEMBL267154
...,...,...
21603,O=C(Cc1ccc(O)c(O)c1)NCCCCNCCCNC(=O)Cc1ccc(O)c(...,CHEMBL2314239
21604,C(CCCNCCCNCC1CCCCCC1)CCCNCCCNCC1CCCCCC1,CHEMBL41629
21605,CN(C)CCCNc1cc(Cl)ccc1Sc1ccc(C(F)(F)F)cc1,CHEMBL5274443
21606,Br.C[N+](C)(CCCNc1cc(Cl)ccc1Sc1ccccc1)Cc1ccc([...,CHEMBL5285392


### **Calculate fingerprint descriptors**

In [None]:
# Show the fingerprint-name -> XML-file mapping again before choosing a key below.
fp

{'AtomPairs2DCount': 'AtomPairs2DFingerprintCount.xml',
 'AtomPairs2D': 'AtomPairs2DFingerprinter.xml',
 'EState': 'EStateFingerprinter.xml',
 'CDKextended': 'ExtendedFingerprinter.xml',
 'CDK': 'Fingerprinter.xml',
 'CDKgraphonly': 'GraphOnlyFingerprinter.xml',
 'KlekotaRothCount': 'KlekotaRothFingerprintCount.xml',
 'KlekotaRoth': 'KlekotaRothFingerprinter.xml',
 'MACCS': 'MACCSFingerprinter.xml',
 'PubChem': 'PubchemFingerprinter.xml',
 'SubstructureCount': 'SubstructureFingerprintCount.xml',
 'Substructure': 'SubstructureFingerprinter.xml'}

In [None]:
# Compute the descriptors for one fingerprint type; PubChem is selected here.
from padelpy import padeldescriptor

fingerprint = 'PubChem'

# The output CSV is named after the selected fingerprint (here: PubChem.csv).
fingerprint_output_file = fingerprint + '.csv'
# Descriptor XML file registered for the selected fingerprint in `fp`.
fingerprint_descriptortypes = fp[fingerprint]

# Flags passed unchanged to padeldescriptor, grouped so they are easy to scan.
padel_options = dict(
    detectaromaticity=True,
    standardizenitro=True,
    standardizetautomers=True,
    threads=2,
    removesalt=True,
    log=True,
    fingerprints=True,
)

padeldescriptor(
    mol_dir='molecule.smi',
    d_file=fingerprint_output_file,
    descriptortypes=fingerprint_descriptortypes,
    **padel_options,
)

### View the calculated descriptors

In [None]:
# Load the CSV named by fingerprint_output_file (the d_file passed to the
# padeldescriptor call above) and display the resulting table.
descritores = pd.read_csv(fingerprint_output_file)
descritores

Unnamed: 0,Name,PubchemFP0,PubchemFP1,PubchemFP2,PubchemFP3,PubchemFP4,PubchemFP5,PubchemFP6,PubchemFP7,PubchemFP8,...,PubchemFP871,PubchemFP872,PubchemFP873,PubchemFP874,PubchemFP875,PubchemFP876,PubchemFP877,PubchemFP878,PubchemFP879,PubchemFP880
0,CHEMBL55,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,CHEMBL9126,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,CHEMBL9126,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,CHEMBL9126,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,CHEMBL267154,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21603,CHEMBL2314239,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
21604,CHEMBL41629,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
21605,CHEMBL5274443,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
21606,CHEMBL5285392,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Save the dataset

In [None]:
# Name the saved file after the selected fingerprint so that choosing a
# different `fingerprint` above does not store its descriptors under a
# "pubchem" file name. With fingerprint = 'PubChem' the name is unchanged:
# "pubchem_descritores_3classes.csv".
# NOTE(review): the default index=True is kept, so the row index is written as
# an unnamed first column; confirm downstream readers expect it before changing.
descritores_output_file = fingerprint.lower() + "_descritores_3classes.csv"
descritores.to_csv(descritores_output_file)