# **Bioinformatics Project - Computational Drug Discovery [Part 3] Descriptor Calculation and Dataset Preparation**

**Part 3**: We calculate molecular descriptors, which are essentially quantitative descriptions of the compounds in the dataset. Finally, we prepare them into a dataset for subsequent model building in Part 4.

---

In [None]:
! pip install padelpy


Collecting padelpy
  Downloading padelpy-0.1.16-py3-none-any.whl.metadata (7.7 kB)
Downloading padelpy-0.1.16-py3-none-any.whl (20.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.9/20.9 MB[0m [31m66.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: padelpy
Successfully installed padelpy-0.1.16


In [None]:
from padelpy import from_smiles

# Smoke test: compute descriptors for two small molecules given as SMILES
# strings and print the raw result.
example_smiles = ['CCO', 'CCC']
descriptors = from_smiles(example_smiles)
print(descriptors)


[{'nAcid': '0', 'ALogP': '-0.1075999999999997', 'ALogp2': '0.011577759999999935', 'AMR': '12.5551', 'apol': '8.322758', 'naAromAtom': '0', 'nAromBond': '0', 'nAtom': '9', 'nHeavyAtom': '3', 'nH': '6', 'nB': '0', 'nC': '2', 'nN': '0', 'nO': '1', 'nS': '0', 'nP': '0', 'nF': '0.0', 'nCl': '0.0', 'nBr': '0.0', 'nI': '0.0', 'nX': '0.0', 'ATS0m': '550.5926270000002', 'ATS1m': '413.0905419999999', 'ATS2m': '301.12475700000005', 'ATS3m': '68.61657600000001', 'ATS4m': '3.0481920000000002', 'ATS5m': '0.0', 'ATS6m': '0.0', 'ATS7m': '0.0', 'ATS8m': '0.0', 'ATS0v': '1249.9270470485976', 'ATS1v': '1381.9431163032111', 'ATS2v': '1279.5114392735875', 'ATS3v': '609.447463694361', 'ATS4v': '93.25123329279079', 'ATS5v': '0.0', 'ATS6v': '0.0', 'ATS7v': '0.0', 'ATS8v': '0.0', 'ATS0e': '68.74353199999999', 'ATS1e': '62.633728', 'ATS2e': '98.55586799999999', 'ATS3e': '89.27884799999998', 'ATS4e': '20.155392', 'ATS5e': '0.0', 'ATS6e': '0.0', 'ATS7e': '0.0', 'ATS8e': '0.0', 'ATS0p': '8.888681429094001', 'ATS1p

## **Load bioactivity data**

In [None]:
import pandas as pd

# Read the bioactivity table (a CSV looked up relative to the working directory) into df3.
bioactivity_csv = 'acetylcholinesterase_04_bioactivity_data_3class_pIC50.csv'
df3 = pd.read_csv(bioactivity_csv)

In [None]:
# Show the loaded bioactivity table as the cell output.
df3

Unnamed: 0.1,Unnamed: 0,molecule_chembl_id,canonical_smiles,class,MW,LogP,NumHDonors,NumHAcceptors,pIC50
0,0,CHEMBL133897,CCOc1nn(-c2cccc(OCc3ccccc3)c2)c(=O)o1,active,312.325,2.8032,0.0,6.0,6.124939
1,1,CHEMBL336398,O=C(N1CCCCC1)n1nc(-c2ccc(Cl)cc2)nc1SCC1CC1,active,376.913,4.5546,0.0,5.0,7.000000
2,2,CHEMBL131588,CN(C(=O)n1nc(-c2ccc(Cl)cc2)nc1SCC(F)(F)F)c1ccccc1,inactive,426.851,5.3574,0.0,5.0,4.301030
3,3,CHEMBL130628,O=C(N1CCCCC1)n1nc(-c2ccc(Cl)cc2)nc1SCC(F)(F)F,active,404.845,4.7069,0.0,5.0,6.522879
4,4,CHEMBL130478,CSc1nc(-c2ccc(OC(F)(F)F)cc2)nn1C(=O)N(C)C,active,346.334,3.0953,0.0,6.0,6.096910
...,...,...,...,...,...,...,...,...,...
6893,6893,CHEMBL6054495,COc1cc(N)c(I)cc1C(=O)NCC1CCN(CC2CCCCC2)CC1,intermediate,485.410,3.9041,2.0,4.0,5.293282
6894,6894,CHEMBL5755069,Nc1cc(O)c(C(=O)CCC2CCN(CC3CCCCC3)CC2)cc1Cl,active,378.944,4.8830,2.0,4.0,6.386158
6895,6895,CHEMBL5791030,CCOc1cc(N)c(Cl)cc1C(=O)CCC1CCN(CC2CCCCC2)CC1,active,406.998,5.5761,1.0,4.0,6.403403
6896,6896,CHEMBL5799857,Nc1cc(OCCF)c(C(=O)CCC2CCN(CC3CCCCC3)CC2)cc1Cl,active,424.988,5.5257,1.0,4.0,6.204120


In [None]:
from padelpy import from_smiles
import pandas as pd


In [None]:
# Collect the SMILES column of df3 as a plain Python list.
smiles_column = df3['canonical_smiles']
smiles_list = smiles_column.tolist()


In [None]:
# Export one molecule per line as "<SMILES><TAB><ChEMBL id>", with no header
# row and no index column; padeldescriptor reads this file as its input.
selection = ['canonical_smiles', 'molecule_chembl_id']
df3_selection = df3.loc[:, selection]
df3_selection.to_csv('molecule.smi', sep='\t', header=False, index=False)

In [None]:
from padelpy import padeldescriptor

# All options for the PaDEL run, gathered in one mapping and unpacked into the call.
padel_options = {
    'mol_dir': 'molecule.smi',            # input: the SMILES file
    'd_file': 'descriptors_output.csv',   # output: CSV the results are written to
    'fingerprints': True,                 # calculate fingerprints
    'retainorder': True,                  # keep molecules in the same order as the input
    'removesalt': True,                   # clean molecules
    'standardizenitro': True,             # standardize nitro groups
    # NOTE(review): presumably a per-molecule time limit in milliseconds — confirm
    # against the PaDEL docs; molecules exceeding it may come back with empty rows.
    'maxruntime': 2000,
}
padeldescriptor(**padel_options)


## **Calculate fingerprint descriptors**


### **Calculate PaDEL descriptors**

## **Preparing the X and Y Data Matrices**

### **X data matrix**

In [None]:
# Load the table that padeldescriptor wrote to descriptors_output.csv.
df3_X = pd.read_csv('descriptors_output.csv')

In [None]:
# Show the descriptor table as the cell output.
df3_X

Unnamed: 0,Name,PubchemFP0,PubchemFP1,PubchemFP2,PubchemFP3,PubchemFP4,PubchemFP5,PubchemFP6,PubchemFP7,PubchemFP8,...,PubchemFP871,PubchemFP872,PubchemFP873,PubchemFP874,PubchemFP875,PubchemFP876,PubchemFP877,PubchemFP878,PubchemFP879,PubchemFP880
0,CHEMBL133897,,,,,,,,,,...,,,,,,,,,,
1,CHEMBL336398,,,,,,,,,,...,,,,,,,,,,
2,CHEMBL131588,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,CHEMBL130628,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,CHEMBL130478,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6893,CHEMBL6054495,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6894,CHEMBL5755069,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6895,CHEMBL5791030,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6896,CHEMBL5799857,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Remove the molecule-identifier column so that only feature columns remain in X.
# errors='ignore' keeps this cell safe to re-run: df3_X is reassigned to its own
# result, so on a second execution 'Name' is already gone and a plain drop would
# raise KeyError.
df3_X = df3_X.drop(columns=['Name'], errors='ignore')
df3_X

Unnamed: 0,PubchemFP0,PubchemFP1,PubchemFP2,PubchemFP3,PubchemFP4,PubchemFP5,PubchemFP6,PubchemFP7,PubchemFP8,PubchemFP9,...,PubchemFP871,PubchemFP872,PubchemFP873,PubchemFP874,PubchemFP875,PubchemFP876,PubchemFP877,PubchemFP878,PubchemFP879,PubchemFP880
0,,,,,,,,,,,...,,,,,,,,,,
1,,,,,,,,,,,...,,,,,,,,,,
2,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6893,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6894,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6895,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6896,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## **Y variable**

### **Select pIC50 (already converted from IC50) as Y**

In [None]:
# The target vector is the pIC50 column of the bioactivity table.
df3_Y = df3.loc[:, 'pIC50']
df3_Y

Unnamed: 0,pIC50
0,6.124939
1,7.000000
2,4.301030
3,6.522879
4,6.096910
...,...
6893,5.293282
6894,6.386158
6895,6.403403
6896,6.204120


## **Combining X and Y variable**

In [None]:
# pd.concat(axis=1) pairs rows by index label, so X and Y must share an identical
# index. A mismatch (e.g. a different number of rows) would not raise; it would
# silently produce NaN-padded rows, so check the invariant explicitly first.
assert df3_X.index.equals(df3_Y.index), (
    f"X/Y index mismatch: {len(df3_X)} descriptor rows vs {len(df3_Y)} pIC50 rows"
)
dataset3 = pd.concat([df3_X, df3_Y], axis=1)
dataset3

Unnamed: 0,PubchemFP0,PubchemFP1,PubchemFP2,PubchemFP3,PubchemFP4,PubchemFP5,PubchemFP6,PubchemFP7,PubchemFP8,PubchemFP9,...,PubchemFP872,PubchemFP873,PubchemFP874,PubchemFP875,PubchemFP876,PubchemFP877,PubchemFP878,PubchemFP879,PubchemFP880,pIC50
0,,,,,,,,,,,...,,,,,,,,,,6.124939
1,,,,,,,,,,,...,,,,,,,,,,7.000000
2,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.301030
3,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.522879
4,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.096910
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6893,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.293282
6894,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.386158
6895,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.403403
6896,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.204120


In [None]:
# Save the combined X + Y table to CSV without writing the row index.
dataset3.to_csv('acetylcholinesterase_06_bioactivity_data_3class_pIC50_pubchem_fp.csv', index=False)

In [None]:
# Reload the saved dataset, then report its structure and the number of
# missing values in each column.
saved_dataset_csv = 'acetylcholinesterase_06_bioactivity_data_3class_pIC50_pubchem_fp.csv'
df_jian = pd.read_csv(saved_dataset_csv)

df_jian.info()

df_jian.isna().sum()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6724 entries, 0 to 6723
Columns: 883 entries, Unnamed: 0 to pIC50
dtypes: float64(882), int64(1)
memory usage: 45.3 MB


Unnamed: 0,0
Unnamed: 0,0
PubchemFP0,0
PubchemFP1,0
PubchemFP2,0
PubchemFP3,0
...,...
PubchemFP877,0
PubchemFP878,0
PubchemFP879,0
PubchemFP880,0


In [None]:
# Discard every row that contains at least one missing value.
df_jian = df_jian.dropna()

In [None]:
# Write df_jian back to the same CSV path that dataset3 was saved to earlier,
# replacing that file's contents (row index not written).
df_jian.to_csv('acetylcholinesterase_06_bioactivity_data_3class_pIC50_pubchem_fp.csv',index=False)

In [None]:
# Per-column count of missing values in df_jian.
df_jian.isna().sum()

Unnamed: 0,0
PubchemFP0,0
PubchemFP1,0
PubchemFP2,0
PubchemFP3,0
PubchemFP4,0
...,...
PubchemFP877,0
PubchemFP878,0
PubchemFP879,0
PubchemFP880,0


In [None]:
# Print the index range, column count, dtypes and memory usage of df_jian.
df_jian.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6724 entries, 2 to 6897
Columns: 882 entries, PubchemFP0 to pIC50
dtypes: float64(882)
memory usage: 45.3 MB


# **Let's download the CSV file to your local computer for Part 3B (Model Building).**