In [None]:
### **Descriptor Calculation and Dataset Preparation**

Ngceboyakwethu Primrose Zinyama


In **this notebook**, I will be calculating molecular descriptors that are essentially quantitative description of the compounds in the dataset. Descriptors calculated will be cleaned by removing redundant descriptors. Finally, I will be preparing this into a dataset for subsequent model building. 

### **Loading the preprocessed compounds csv file**

In [1]:
import pandas as pd

file_path = "C:\jupiter\Machine learning\preprocessed_nct.csv"  
df = pd.read_csv(file_path)
df.head() #Displaying the first 5 rows

  file_path = "C:\jupiter\Machine learning\preprocessed_nct.csv"


Unnamed: 0,CID,CanonicalSMILES,bioactivity_class,pIC50,Canonical_SMILES,Valid_Molecule,Neutralized_SMILES,Valid_SMILES
0,56681654,CCC(C)C(C(=O)OC)NC(=O)C(C(C)C)NC(=O)C(CC(C)C)N...,inactive,6.568636,CCC(C)C(NC(=O)C(NC(=O)C(CC(C)C)NC(=O)N(Cc1cccc...,<rdkit.Chem.rdchem.Mol object at 0x000001C1EF8...,[H]OC([H])(C([H])([H])N(C(=O)N([H])C([H])(C(=O...,CCC(C)C(C(=O)OC)NC(=O)C(C(C)C)NC(=O)C(CC(C)C)N...
1,44386767,CC(C)CC(C(=O)NC(C(C)C)C(=O)NC(CC1=CC=CC=C1)C(=...,inactive,6.60206,COC(=O)C(Cc1ccccc1)NC(=O)C(NC(=O)C(CC(C)C)NC(=...,<rdkit.Chem.rdchem.Mol object at 0x000001C1EF8...,[H]OC([H])(C([H])([H])N(C(=O)N([H])C([H])(C(=O...,CC(C)CC(C(=O)NC(C(C)C)C(=O)NC(CC1=CC=CC=C1)C(=...
2,15344717,CC(C)CC(C(=O)NC(CC1=CC=CC=C1)C(=O)OC)NC(=O)N(C...,inactive,5.508638,COC(=O)C(Cc1ccccc1)NC(=O)C(CC(C)C)NC(=O)N(C)CC...,<rdkit.Chem.rdchem.Mol object at 0x000001C1EF8...,[H]OC([H])(C([H])([H])N(C(=O)N([H])C([H])(C(=O...,CC(C)CC(C(=O)NC(CC1=CC=CC=C1)C(=O)OC)NC(=O)N(C...
3,12147040,CC(C)CC(C(=O)NC(CC(C)C)C(=O)OC)NC(=O)N(CC1=CC=...,inactive,6.187087,COC(=O)C(CC(C)C)NC(=O)C(CC(C)C)NC(=O)N(Cc1cccc...,<rdkit.Chem.rdchem.Mol object at 0x000001C1EF8...,[H]OC([H])(C([H])([H])N(C(=O)N([H])C([H])(C(=O...,CC(C)CC(C(=O)NC(CC(C)C)C(=O)OC)NC(=O)N(CC1=CC=...
4,44386506,CC(C)CC(C(=O)NC(C(C)C)C(=O)OC)NC(=O)N(CC1=CC=C...,inactive,6.638272,COC(=O)C(NC(=O)C(CC(C)C)NC(=O)N(Cc1ccccc1)CC(O...,<rdkit.Chem.rdchem.Mol object at 0x000001C1EF8...,[H]OC([H])(C([H])([H])N(C(=O)N([H])C([H])(C(=O...,CC(C)CC(C(=O)NC(C(C)C)C(=O)OC)NC(=O)N(CC1=CC=C...


A .smi file is required for descriptor calculation and the smiles are extracted from the uploaded .csv file.

In [None]:
smiles_column = "CanonicalSMILES" 
name_column = "CID" 

with open("molecules.smi", "w") as f: # Write to a .smi file
    f.write(f"{smiles_column} {name_column}\n")   # Write the header first
    
    for index, row in df.iterrows(): # Write the molecule data
        smiles = row[smiles_column]
        name = row.get(name_column, f"Mol{index}")  # Default name if missing
        f.write(f"{smiles} {name}\n")

##Install & Run PaDEL using padelpy for ease of calculating descriptors**

In [None]:
!pip install padelpy

### **Calculating 2d and 3d descriptors**

In [None]:
from padelpy import padeldescriptor

In [None]:
padeldescriptor(mol_dir='molecules.smi', d_file='descriptors.csv', d_2d=True, d_3d=False, fingerprints=False, retainorder=True #convert3d=True, #detectaromaticity=True, removesalt=True, retain3d=True
                # standardizenitro=True, standardizetautomers=True
                )

### **Removal of redundant descriptors**

In [6]:
import pandas as pd
import numpy as np

df = pd.read_csv("descriptors.csv")

# assume first column is molecule names or IDs
names = df.iloc[:, 0]
desc = df.iloc[:, 1:]

# drop columns with no variance or >95% zero values
no_var = desc.std() == 0
mostly_zero = (desc == 0).mean() > 0.95
desc = desc.loc[:, ~(no_var | mostly_zero)]

# drop columns with super low relative std dev
rsd = desc.std() / desc.mean().replace(0, np.nan)
desc = desc.loc[:, ~(rsd < 0.001)]

# stick names back on and save
cleaned = pd.concat([names, desc], axis=1)
cleaned.to_csv("descriptors_cleaned.csv", index=False)

print(f"{df.shape[1]-1} descriptors in, {desc.shape[1]} out.")


1444 descriptors in, 1000 out.


## **Preparing the X and Y Data Matrices in preparation for model building**

Descriptors are required in a pandas.DataFrame (X) and the target labels in a pandas.Series (y).

### **X data matrix**

In [2]:
import pandas as pd
df_X = pd.read_csv('descriptors_cleaned.csv')

In [3]:
df_X

Unnamed: 0,Name,ALogP,ALogp2,AMR,apol,naAromAtom,nAromBond,nAtom,nHeavyAtom,nH,...,AMW,WTPT-1,WTPT-2,WTPT-3,WTPT-4,WTPT-5,WPATH,WPOL,XLogP,Zagreb
0,46225361,3.5362,12.504710,79.6864,73.690583,6,6,66,35,31,...,8.078782,68.408310,1.954523,30.832501,4.811796,3.569673,3494.0,54.0,8.663,184.0
1,24964343,3.5362,12.504710,79.6864,73.690583,6,6,66,35,31,...,8.078782,68.394788,1.954137,30.860491,4.800371,3.564928,3540.0,55.0,8.663,184.0
2,10434562,0.0442,0.001954,53.2682,60.177860,12,12,49,29,20,...,9.449980,57.915800,1.997097,27.112560,9.771265,2.966673,2060.0,52.0,4.479,158.0
3,45487151,-1.2688,1.609853,57.5675,53.323481,11,11,43,26,17,...,9.675302,53.124389,2.043246,32.364979,9.832488,13.182591,1486.0,48.0,1.238,150.0
4,25166594,-0.3239,0.104911,69.1925,70.015239,12,12,60,37,23,...,9.501342,74.488298,2.013197,38.103828,19.085290,0.000000,4044.0,70.0,4.643,206.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
948,46215514,0.2383,0.056787,63.6693,71.016618,17,18,61,35,26,...,7.872087,72.271825,2.064909,26.684300,8.475776,13.147208,3798.0,62.0,5.437,190.0
949,46214507,-0.4545,0.206570,60.0789,70.434204,17,18,60,32,28,...,7.137020,66.727842,2.085245,19.060118,5.931073,13.129045,3145.0,54.0,7.441,172.0
950,71461363,0.7826,0.612463,38.1019,52.952688,17,18,42,26,16,...,8.692693,53.521251,2.058510,17.463850,5.401800,9.516605,1718.0,41.0,4.073,142.0
951,49865787,-0.1795,0.032220,64.0521,70.757618,22,23,60,34,26,...,7.653451,70.136479,2.062838,24.573258,5.436347,16.617795,3537.0,58.0,5.663,186.0


In [13]:
df_X = df_X.drop(columns=['Name'])
df_X

Unnamed: 0,ALogP,ALogp2,AMR,apol,naAromAtom,nAromBond,nAtom,nHeavyAtom,nH,nC,...,AMW,WTPT-1,WTPT-2,WTPT-3,WTPT-4,WTPT-5,WPATH,WPOL,XLogP,Zagreb
0,3.5362,12.504710,79.6864,73.690583,6,6,66,35,31,23,...,8.078782,68.408310,1.954523,30.832501,4.811796,3.569673,3494.0,54.0,8.663,184.0
1,3.5362,12.504710,79.6864,73.690583,6,6,66,35,31,23,...,8.078782,68.394788,1.954137,30.860491,4.800371,3.564928,3540.0,55.0,8.663,184.0
2,0.0442,0.001954,53.2682,60.177860,12,12,49,29,20,19,...,9.449980,57.915800,1.997097,27.112560,9.771265,2.966673,2060.0,52.0,4.479,158.0
3,-1.2688,1.609853,57.5675,53.323481,11,11,43,26,17,15,...,9.675302,53.124389,2.043246,32.364979,9.832488,13.182591,1486.0,48.0,1.238,150.0
4,-0.3239,0.104911,69.1925,70.015239,12,12,60,37,23,23,...,9.501342,74.488298,2.013197,38.103828,19.085290,0.000000,4044.0,70.0,4.643,206.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
948,0.2383,0.056787,63.6693,71.016618,17,18,61,35,26,26,...,7.872087,72.271825,2.064909,26.684300,8.475776,13.147208,3798.0,62.0,5.437,190.0
949,-0.4545,0.206570,60.0789,70.434204,17,18,60,32,28,26,...,7.137020,66.727842,2.085245,19.060118,5.931073,13.129045,3145.0,54.0,7.441,172.0
950,0.7826,0.612463,38.1019,52.952688,17,18,42,26,16,20,...,8.692693,53.521251,2.058510,17.463850,5.401800,9.516605,1718.0,41.0,4.073,142.0
951,-0.1795,0.032220,64.0521,70.757618,22,23,60,34,26,26,...,7.653451,70.136479,2.062838,24.573258,5.436347,16.617795,3537.0,58.0,5.663,186.0


### **Y data matrix**

To extract activity values from preprocessed_nicastrin.csv file using the CIDs from the molecules.smi file. Not all compounds in the preprocessed file are in molecules.smi, large molecules presented challenges during the calculation and were dropped.

In [4]:
import pandas as pd

smi_df = pd.read_csv('molecules.smi', sep='\t', header=None, names=['SMILES', 'CID'])
cids = smi_df['CID'].astype(str)


In [None]:
# Load the activity data
activity_df = pd.read_csv('preprocessed_nct.csv')  
activity_df['CID'] = activity_df['CID'].astype(str)


In [None]:
smi_df['CID'] = smi_df['CID'].astype(str)
activity_df['CID'] = activity_df['CID'].astype(str)

# Merging
merged_df = smi_df.merge(activity_df, on='CID', how='left')

# Extracting bioactivity column
y = merged_df['bioactivity_class']


In [9]:
merged_df

Unnamed: 0,SMILES,CID,CanonicalSMILES,bioactivity_class,pIC50,Canonical_SMILES,Valid_Molecule,Neutralized_SMILES,Valid_SMILES
0,[Si](CCC(N1C(c2ccc(cc2)C(F)(F)F)C(C(CC1)(F)F)C...,46225361,C[Si](C)(C)CCC(CCC(F)(F)F)N1CCC(C(C1C2=CC=C(C=...,inactive,6.193820,C[Si](C)(C)CCC(CCC(F)(F)F)N1CCC(F)(F)C(CC(=O)O...,<rdkit.Chem.rdchem.Mol object at 0x000001C1EF7...,[H]OC(=O)C([H])([H])C1([H])C([H])(c2c([H])c([H...,C[Si](C)(C)CCC(CCC(F)(F)F)N1CCC(C(C1C2=CC=C(C=...
1,[Si](CCC(N1C(c2ccc(cc2)C(F)(F)F)C(F)(F)C(CC1)C...,24964343,C[Si](C)(C)CCC(CCC(F)(F)F)N1CCC(C(C1C2=CC=C(C=...,inactive,6.638272,C[Si](C)(C)CCC(CCC(F)(F)F)N1CCC(CC(=O)O)C(F)(F...,<rdkit.Chem.rdchem.Mol object at 0x000001C1EF7...,[H]OC(=O)C([H])([H])C1([H])C([H])([H])C([H])([...,C[Si](C)(C)CCC(CCC(F)(F)F)N1CCC(C(C1C2=CC=C(C=...
2,Clc1ccc(cc1)S(=O)(=O)C1(c2c(F)ccc(c2)F)CCC(CC1...,10434562,CS(=O)(=O)NC1CCC(CC1)(C2=C(C=CC(=C2)F)F)S(=O)(...,active,8.376751,CS(=O)(=O)NC1CCC(c2cc(F)ccc2F)(S(=O)(=O)c2ccc(...,<rdkit.Chem.rdchem.Mol object at 0x000001C1EF5...,[H]c1c([H])c(F)c(C2(S(=O)(=O)c3c([H])c([H])c(C...,CS(=O)(=O)NC1CCC(CC1)(C2=C(C=CC(=C2)F)F)S(=O)(...
3,Clc1ccc(cc1)S(=O)(=O)N1C2c3c(CC1CN(C2)S(=O)(=O...,45487151,CS(=O)(=O)N1CC2CC3=C(C=NN3)C(C1)N2S(=O)(=O)C4=...,inactive,5.247644,CS(=O)(=O)N1CC2Cc3[nH]ncc3C(C1)N2S(=O)(=O)c1cc...,<rdkit.Chem.rdchem.Mol object at 0x000001C1EF5...,[H]c1nn([H])c2c1C1([H])N(S(=O)(=O)c3c([H])c([H...,CS(=O)(=O)N1CC2CC3=C(C=NN3)C(C1)N2S(=O)(=O)C4=...
4,S(=O)(=O)(c1ccc(cc1)OCC(F)(F)F)C12c3c(OCC1C(OC...,25166594,CS(=O)(=O)CCC1C2COC3=C(C=CC(=C3C2(CCO1)S(=O)(=...,inactive,5.292430,CS(=O)(=O)CCC1OCCC2(S(=O)(=O)c3ccc(OCC(F)(F)F)...,<rdkit.Chem.rdchem.Mol object at 0x000001C1EF2...,[H]c1c([H])c(F)c2c(c1F)OC([H])([H])C1([H])C([H...,CS(=O)(=O)CCC1C2COC3=C(C=CC(=C3C2(CCO1)S(=O)(=...
...,...,...,...,...,...,...,...,...,...
948,Fc1cc(F)cc(c1)C1N2C(=NOCC1O)/C(=C/c1cc(OC)c(cc...,46215514,CC1=CN(C=N1)C2=C(C=C(C=C2)C=C3CCCN4C3=NOCC(C4C...,inactive,6.619789,COc1cc(C=C2CCCN3C2=NOCC(O)C3c2cc(F)cc(F)c2)ccc...,<rdkit.Chem.rdchem.Mol object at 0x000001C1EF2...,[H]OC1([H])C([H])([H])ON=C2C(=C([H])c3c([H])c(...,CC1=CN(C=N1)C2=C(C=C(C=C2)C=C3CCCN4C3=NOCC(C4C...
949,O1N=C2N(C[C@@H](C1)c1ccccc1)CCC/C/2=C/c1cc(OC)...,46214507,CC1=CN(C=N1)C2=C(C=C(C=C2)C=C3CCCN4C3=NOCC(C4)...,inactive,6.853872,COc1cc(C=C2CCCN3CC(c4ccccc4)CON=C23)ccc1-n1cnc...,<rdkit.Chem.rdchem.Mol object at 0x000001C1EF2...,[H]C(=C1C2=NOC([H])([H])C([H])(c3c([H])c([H])c...,CC1=CN(C=N1)C2=C(C=C(C=C2)C=C3CCCN4C3=NOCC(C4)...
950,Clc1cc2c(cc1)NC(=O)/C/2=C/c1cc(OC)c(cc1)n1cnc(...,71461363,CC1=CN(C=N1)C2=C(C=C(C=C2)C=C3C4=C(C=CC(=C4)Cl...,inactive,5.173925,COc1cc(C=C2C(=O)Nc3ccc(Cl)cc32)ccc1-n1cnc(C)c1,<rdkit.Chem.rdchem.Mol object at 0x000001C1EF2...,[H]C(=C1C(=O)N([H])c2c([H])c([H])c(Cl)c([H])c2...,CC1=CN(C=N1)C2=C(C=C(C=C2)C=C3C4=C(C=CC(=C4)Cl...
951,Fc1ccc(cc1)[C@@H](n1c2=NCCCn2/c(=C/c2cc(OC)c(c...,49865787,CC1=CN(C=N1)C2=C(C=C(C=C2)C=C3C(=O)N(C4=NCCCN3...,inactive,5.148742,COc1cc(C=c2c(=O)n(C(C)c3ccc(F)cc3)c3n2CCCN=3)c...,<rdkit.Chem.rdchem.Mol object at 0x000001C1EF5...,[H]C(c1c([H])c([H])c(-n2c([H])nc(C([H])([H])[H...,CC1=CN(C=N1)C2=C(C=C(C=C2)C=C3C(=O)N(C4=NCCCN3...


To remain with the columns CID,CanonicalSMILES, bioactivity_class and pIC50, all the other columns are dropped.

In [None]:
final_df = merged_df[['CID', 'CanonicalSMILES', 'bioactivity_class', 'pIC50']].copy()

final_df = final_df.dropna()

final_df.to_csv('molecule_data.csv', index=False)

✅ Saved only the desired columns to 'molecule_data.csv'


In [14]:
df_Y = pd.read_csv('molecule_data.csv')
df_Y

Unnamed: 0,CID,CanonicalSMILES,bioactivity_class,pIC50
0,46225361,C[Si](C)(C)CCC(CCC(F)(F)F)N1CCC(C(C1C2=CC=C(C=...,inactive,6.193820
1,24964343,C[Si](C)(C)CCC(CCC(F)(F)F)N1CCC(C(C1C2=CC=C(C=...,inactive,6.638272
2,10434562,CS(=O)(=O)NC1CCC(CC1)(C2=C(C=CC(=C2)F)F)S(=O)(...,active,8.376751
3,45487151,CS(=O)(=O)N1CC2CC3=C(C=NN3)C(C1)N2S(=O)(=O)C4=...,inactive,5.247644
4,25166594,CS(=O)(=O)CCC1C2COC3=C(C=CC(=C3C2(CCO1)S(=O)(=...,inactive,5.292430
...,...,...,...,...
948,46215514,CC1=CN(C=N1)C2=C(C=C(C=C2)C=C3CCCN4C3=NOCC(C4C...,inactive,6.619789
949,46214507,CC1=CN(C=N1)C2=C(C=C(C=C2)C=C3CCCN4C3=NOCC(C4)...,inactive,6.853872
950,71461363,CC1=CN(C=N1)C2=C(C=C(C=C2)C=C3C4=C(C=CC(=C4)Cl...,inactive,5.173925
951,49865787,CC1=CN(C=N1)C2=C(C=C(C=C2)C=C3C(=O)N(C4=NCCCN3...,inactive,5.148742


In [16]:
df_Y = df_Y['bioactivity_class']
df_Y

0      inactive
1      inactive
2        active
3      inactive
4      inactive
         ...   
948    inactive
949    inactive
950    inactive
951    inactive
952    inactive
Name: bioactivity_class, Length: 953, dtype: object

### **COMBINING X AND Y VARIABLE**

In [18]:
dataset = pd.concat([df_X,df_Y], axis=1)
dataset.to_csv('dataset.csv', index=False)
dataset

Unnamed: 0,ALogP,ALogp2,AMR,apol,naAromAtom,nAromBond,nAtom,nHeavyAtom,nH,nC,...,WTPT-1,WTPT-2,WTPT-3,WTPT-4,WTPT-5,WPATH,WPOL,XLogP,Zagreb,bioactivity_class
0,3.5362,12.504710,79.6864,73.690583,6,6,66,35,31,23,...,68.408310,1.954523,30.832501,4.811796,3.569673,3494.0,54.0,8.663,184.0,inactive
1,3.5362,12.504710,79.6864,73.690583,6,6,66,35,31,23,...,68.394788,1.954137,30.860491,4.800371,3.564928,3540.0,55.0,8.663,184.0,inactive
2,0.0442,0.001954,53.2682,60.177860,12,12,49,29,20,19,...,57.915800,1.997097,27.112560,9.771265,2.966673,2060.0,52.0,4.479,158.0,active
3,-1.2688,1.609853,57.5675,53.323481,11,11,43,26,17,15,...,53.124389,2.043246,32.364979,9.832488,13.182591,1486.0,48.0,1.238,150.0,inactive
4,-0.3239,0.104911,69.1925,70.015239,12,12,60,37,23,23,...,74.488298,2.013197,38.103828,19.085290,0.000000,4044.0,70.0,4.643,206.0,inactive
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
948,0.2383,0.056787,63.6693,71.016618,17,18,61,35,26,26,...,72.271825,2.064909,26.684300,8.475776,13.147208,3798.0,62.0,5.437,190.0,inactive
949,-0.4545,0.206570,60.0789,70.434204,17,18,60,32,28,26,...,66.727842,2.085245,19.060118,5.931073,13.129045,3145.0,54.0,7.441,172.0,inactive
950,0.7826,0.612463,38.1019,52.952688,17,18,42,26,16,20,...,53.521251,2.058510,17.463850,5.401800,9.516605,1718.0,41.0,4.073,142.0,inactive
951,-0.1795,0.032220,64.0521,70.757618,22,23,60,34,26,26,...,70.136479,2.062838,24.573258,5.436347,16.617795,3537.0,58.0,5.663,186.0,inactive


The dataset is ready for machine learning.