In [1]:
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import AllChem
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the inhibitor records from disk into a DataFrame
df = pd.read_csv("chembl_malaria_inhibitors.csv")

# Preview the top five records
print(df.head(n=5))

# Tally the missing entries in each column
print(df.isna().sum())

  Molecule ChEMBL ID                                             Smiles  \
0       CHEMBL444605              CC(=O)OCCCc1nc(N)nc(N)c1-c1cccc(Cl)c1   
1        CHEMBL21395               CCOc1cc(Cc2cnc(N)nc2N)ccc1OCc1ccccc1   
2       CHEMBL291931                COc1ccc(Cc2cnc(N)nc2N)cc1OCc1ccccc1   
3       CHEMBL274031                Nc1ncc(Cc2cccc(OCc3ccccc3)c2)c(N)n1   
4        CHEMBL22139  CCCOc1cc(Cc2cnc(N)nc2N)ccc1OCc1cc(OC)c(OC)c(OC)c1   

   pChEMBL Value  
0           5.28  
1           6.92  
2           5.24  
3           4.89  
4           7.52  
Molecule ChEMBL ID    0
Smiles                0
pChEMBL Value         0
dtype: int64


In [3]:
# Per-column count of missing entries
print(df.isna().sum())

Molecule ChEMBL ID    0
Smiles                0
pChEMBL Value         0
dtype: int64


In [4]:
 # Dataset dimensions: df.shape is a (rows, columns) tuple
print("Shape of dataset:", df.shape, "\n")

Shape of dataset: (340, 3) 



In [5]:
# Step 1 — Importing Libraries
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem
from rdkit.Avalon import pyAvalonTools
import numpy as np
import pandas as pd
# Step 2 — Converting SMILES to Molecules
def smiles_to_mols(smiles_list):
    """
    Parse SMILES strings into RDKit molecules, skipping unusable entries.

    Parameters:
        smiles_list (iterable): SMILES strings, e.g. a list or a pandas Series.
            Entries that are not strings (such as NaN/None produced by missing
            values) are treated as invalid and skipped.

    Returns:
        tuple: (mols, valid_idx), where mols is the list of molecules that were
        parsed successfully and valid_idx is the list of 0-based positions (in
        iteration order) of the entries they came from.
    """
    mols = []
    valid_idx = []
    for i, smi in enumerate(smiles_list):
        # Missing values arrive as float NaN / None; handing a non-string to
        # MolFromSmiles raises instead of returning None, so skip them here.
        if not isinstance(smi, str):
            continue
        mol = Chem.MolFromSmiles(smi)
        # MolFromSmiles signals an unparsable SMILES by returning None.
        if mol is not None:
            mols.append(mol)
            valid_idx.append(i)
    return mols, valid_idx
# Step 3 — Calculating Fingerprints
def calculate_fingerprints(df, smiles_col='Smiles', method='ecfp', radius=2, nBits=1024):
    """
    Calculates ECFP or Avalon fingerprints and returns a merged DataFrame.

    Rows whose SMILES cannot be converted to a molecule are dropped, and the
    returned DataFrame is re-indexed from 0.

    Parameters:
        df (pd.DataFrame): Input DataFrame with a column of SMILES strings.
        smiles_col (str): Name of the SMILES column.
        method (str): 'ecfp' or 'avalon'.
        radius (int): Radius for ECFP (ignored for Avalon).
        nBits (int): Number of bits for the fingerprint.

    Returns:
        pd.DataFrame: The valid rows of the original DataFrame concatenated
        with one integer column per fingerprint bit, named
        '<METHOD>_<bit index>' (e.g. 'ECFP_0', 'AVALON_17').

    Raises:
        ValueError: If method is neither 'ecfp' nor 'avalon'.
    """
    # Validate before doing any work so an unsupported method fails fast,
    # including when the input contains no parsable SMILES at all.
    if method not in ('ecfp', 'avalon'):
        raise ValueError("Method must be 'ecfp' or 'avalon'")

    # valid_idx holds the positional indices of the rows that parsed.
    mols, valid_idx = smiles_to_mols(df[smiles_col])

    # Preallocate so the bit columns are integer-typed even with zero molecules.
    fp_matrix = np.zeros((len(mols), nBits), dtype=int)
    for row, mol in enumerate(mols):
        if method == 'ecfp':
            fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits)
        else:
            fp = pyAvalonTools.GetAvalonFP(mol, nBits=nBits)
        # Convert the bit vector into a numeric array, then store it as a row.
        fp_np = np.zeros((nBits,), dtype=int)
        DataStructs.ConvertToNumpyArray(fp, fp_np)
        fp_matrix[row] = fp_np

    # Create DataFrame for fingerprints (default 0..n-1 index)
    prefix = method.upper()
    fp_df = pd.DataFrame(fp_matrix, columns=[f'{prefix}_{i}' for i in range(nBits)])

    # Keep only valid molecules; resetting the index aligns rows with fp_df
    df_valid = df.iloc[valid_idx].reset_index(drop=True)

    # Concatenate and return
    return pd.concat([df_valid, fp_df], axis=1)

In [6]:
# NOTE(review): the result is stored as `df_with_ecfp`, but method='avalon' is
# requested here, so the bit columns are named AVALON_*; `radius` is only used
# by the 'ecfp' branch of calculate_fingerprints. Confirm which fingerprint
# type is intended.
df_with_ecfp = calculate_fingerprints(df, smiles_col='Smiles', method='avalon', radius=2, nBits=1024)

In [7]:
# Show the fingerprint table as this cell's output
df_with_ecfp

Unnamed: 0,Molecule ChEMBL ID,Smiles,pChEMBL Value,AVALON_0,AVALON_1,AVALON_2,AVALON_3,AVALON_4,AVALON_5,AVALON_6,...,AVALON_1014,AVALON_1015,AVALON_1016,AVALON_1017,AVALON_1018,AVALON_1019,AVALON_1020,AVALON_1021,AVALON_1022,AVALON_1023
0,CHEMBL444605,CC(=O)OCCCc1nc(N)nc(N)c1-c1cccc(Cl)c1,5.28,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,CHEMBL21395,CCOc1cc(Cc2cnc(N)nc2N)ccc1OCc1ccccc1,6.92,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,CHEMBL291931,COc1ccc(Cc2cnc(N)nc2N)cc1OCc1ccccc1,5.24,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,CHEMBL274031,Nc1ncc(Cc2cccc(OCc3ccccc3)c2)c(N)n1,4.89,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,CHEMBL22139,CCCOc1cc(Cc2cnc(N)nc2N)ccc1OCc1cc(OC)c(OC)c(OC)c1,7.52,0,0,1,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
335,CHEMBL747,CC1(C)N=C(N)N=C(N)N1c1ccc(Cl)cc1,7.40,0,0,1,0,0,1,1,...,0,0,0,0,0,0,0,0,0,1
336,CHEMBL5822647,CCCc1nc(N)nc(N)c1N1CCN(c2ccccc2)CC1,7.85,1,0,1,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
337,CHEMBL34259,CN(Cc1cnc2nc(N)nc(N)c2n1)c1ccc(C(=O)N[C@@H](CC...,9.05,0,0,1,0,1,0,0,...,0,0,0,0,1,0,0,0,1,0
338,CHEMBL36,CCc1nc(N)nc(N)c1-c1ccc(Cl)cc1,8.28,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [10]:
# Write the fingerprint table to CSV; index=False leaves the row index out of the file
df_with_ecfp.to_csv("chembl_fingerprint.csv", index=False)