# Fingerprint generator

This notebook is used to generate the fingerprints from RDKit that are used as training data for the ML models.

In [1]:
import os
import pandas as pd
import numpy as np
from tqdm import tqdm

from rdkit import Chem
from rdkit.Chem import rdFingerprintGenerator, MACCSkeys, DataStructs

from mhfp.encoder import MHFPEncoder

# Load dataset

In [2]:
amr_df = pd.read_csv("../data/processed/standardized_data.tsv", sep="\t")
amr_df.head()

Unnamed: 0,cmp_id,inchikey,smiles,gram-negative,gram-positive,acid-fast,best
0,pubchem.compound:2850780,AAAFNFZYVYYALD-UHFFFAOYSA-N,COCC(C)NCc1ccc(C)cc1C,0.0,3.71,0.0,gram-positive
1,spark:SPK-0108052,AABJDBIUWZVBBU-UHFFFAOYSA-N,FC1=CC=CC=C1N1C=C(CN2C=NC3=CC=CC=C23)N=N1,3.4,0.0,0.0,gram-negative
2,chembl:CHEMBL3601596,AABKQBNCAREVSD-UHFFFAOYSA-N,Cc1ccc(C2=C(O)COC2=O)cc1,0.0,5.07,0.0,gram-positive
3,chembl:CHEMBL1649587,AABLZXVSOORENQ-UHFFFAOYSA-N,Nc1ncc(-c2ccc(-c3ccccc3)cc2)n1C1CCCC1,5.12,0.0,0.0,gram-negative
4,chembl:CHEMBL2323136,AABXXZMFZZDFMA-OJAPUXPCSA-N,CC(=O)OC[C@H]1O[C@@H](CC(=O)/C=C/c2cccc(O)c2)[...,0.0,0.0,6.06,acid-fast


# Generate the different fingerprints

In [3]:
mfpgen = rdFingerprintGenerator.GetMorganGenerator(radius=4, fpSize=1024) # ECFP4 fingerprint
rdkgen = rdFingerprintGenerator.GetRDKitFPGenerator(fpSize=1024) # RDKit fingerprint

In [4]:
# Disable warnings
from rdkit import RDLogger
RDLogger.DisableLog("rdApp.*")

In [5]:
ecfp_fingerprints = []
rdkit_fingerprints = []
maccs_fingerprints = []

for smiles in tqdm(amr_df["smiles"]):
    # Canonicalize the smiles
    try:
        can_smiles = Chem.CanonSmiles(smiles)
    except:
        can_smiles = smiles

    # Generate the mol object
    mol = Chem.MolFromSmiles(can_smiles)

    if not mol:
        ecfp_fingerprints.append(None)
        rdkit_fingerprints.append(None)
        maccs_fingerprints.append(None)
        continue

    ecfp_fingerprints.append(mfpgen.GetFingerprint(mol))
    rdkit_fingerprints.append(rdkgen.GetFingerprint(mol))
    maccs_fingerprints.append(MACCSkeys.GenMACCSKeys(mol))

100%|██████████| 53909/53909 [01:57<00:00, 460.20it/s]


In [6]:
amr_df['ecfp4'] = ecfp_fingerprints
amr_df['rdkit'] = rdkit_fingerprints
amr_df['maccs'] = maccs_fingerprints

In [7]:
# minihash fingerprint
mhfp_encoder = MHFPEncoder()

mhfp_fingerprints = []

for smiles in tqdm(amr_df["smiles"]):
    # Canonicalize the smiles
    try:
        can_smiles = Chem.CanonSmiles(smiles)
    except:
        can_smiles = smiles

    # Generate fingerprint
    try:
        fp = mhfp_encoder.encode(can_smiles)
    except:
        fp = None
    
    mhfp_fingerprints.append(fp)

amr_df['mhfp'] = mhfp_fingerprints

100%|██████████| 53909/53909 [05:59<00:00, 149.93it/s]


In [8]:
amr_df.head(3)

Unnamed: 0,cmp_id,inchikey,smiles,gram-negative,gram-positive,acid-fast,best,ecfp4,rdkit,maccs,mhfp
0,pubchem.compound:2850780,AAAFNFZYVYYALD-UHFFFAOYSA-N,COCC(C)NCc1ccc(C)cc1C,0.0,3.71,0.0,gram-positive,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ...","[0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[113524128, 44409534, 152899168, 10679139, 584..."
1,spark:SPK-0108052,AABJDBIUWZVBBU-UHFFFAOYSA-N,FC1=CC=CC=C1N1C=C(CN2C=NC3=CC=CC=C23)N=N1,3.4,0.0,0.0,gram-negative,"[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[143867129, 25141730, 29077173, 111863329, 504..."
2,chembl:CHEMBL3601596,AABKQBNCAREVSD-UHFFFAOYSA-N,Cc1ccc(C2=C(O)COC2=O)cc1,0.0,5.07,0.0,gram-positive,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[19469312, 44409534, 45380089, 49283527, 58448..."


In [9]:
amr_df.to_pickle("../data/processed/standardized_with_fingerprints.pkl")

# Formatting data for each fingerprint

Each fingerprint vector is now converted to bit columns i.e. one bit per column.

In [10]:
os.makedirs("../data/fingerprints", exist_ok=True)

### ECFP4

In [11]:
ecfp4_df = amr_df[['cmp_id', 'ecfp4', 'best']]
ecfp4_df = ecfp4_df.dropna() # Drop rows with no fingerprint
ecfp4_df.shape

(53900, 3)

In [12]:
data = []

for idx, ecfp_fp, best in tqdm(ecfp4_df.values):
    # Convert fingerprint to numpy array
    fp = np.zeros((0,), dtype=int)
    DataStructs.ConvertToNumpyArray(ecfp_fp, fp)

    # Convert to dataframe
    t = pd.DataFrame(fp).T
    t.rename(columns=lambda x: "bit" + str(x), inplace=True)

    # Add metadata
    t['cmp_id'] = idx
    t['label'] = best

    data.append(t)

ecfp4_dataframe = pd.concat(data, ignore_index=True)
ecfp4_dataframe.set_index('cmp_id', inplace=True)

100%|██████████| 53900/53900 [00:23<00:00, 2342.22it/s]


In [13]:
ecfp4_dataframe.to_csv("../data/fingerprints/ecfp4.tsv", sep="\t")

### RDKit

In [14]:
rdkit_df = amr_df[['cmp_id', 'rdkit', 'best']]
rdkit_df = rdkit_df.dropna() # Drop rows with no fingerprint
rdkit_df.shape

(53900, 3)

In [15]:
data = []

for idx, rdkit_fp, best in tqdm(rdkit_df.values):
    # Convert fingerprint to numpy array
    fp = np.zeros((0,), dtype=int)
    DataStructs.ConvertToNumpyArray(rdkit_fp, fp)

    # Convert to dataframe
    t = pd.DataFrame(fp).T
    t.rename(columns=lambda x: "bit" + str(x), inplace=True)

    # Add metadata
    t['cmp_id'] = idx
    t['label'] = best

    data.append(t)

rdkit_dataframe = pd.concat(data, ignore_index=True)
rdkit_dataframe.set_index('cmp_id', inplace=True)

100%|██████████| 53900/53900 [00:22<00:00, 2386.92it/s]


In [16]:
rdkit_dataframe.to_csv("../data/fingerprints/rdkit.tsv", sep="\t")

### MACCS

In [17]:
maccs_df = amr_df[['cmp_id', 'maccs', 'best']]
maccs_df = maccs_df.dropna() # Drop rows with no fingerprint
maccs_df.shape

(53900, 3)

In [18]:
data = []

for idx, maccs_fp, best in tqdm(maccs_df.values):
    # Convert fingerprint to numpy array
    fp = np.zeros((0,), dtype=int)
    DataStructs.ConvertToNumpyArray(maccs_fp, fp)

    # Convert to dataframe
    t = pd.DataFrame(fp).T
    t.rename(columns=lambda x: "bit" + str(x), inplace=True)

    # Add metadata
    t['cmp_id'] = idx
    t['label'] = best

    data.append(t)

maccs_dataframe = pd.concat(data, ignore_index=True)
maccs_dataframe.set_index('cmp_id', inplace=True)

100%|██████████| 53900/53900 [00:13<00:00, 4111.36it/s]


In [19]:
maccs_dataframe.to_csv("../data/fingerprints/maccs.tsv", sep="\t")

### MHFP6

In [20]:
mhfp6_df = amr_df[['cmp_id', 'mhfp', 'best']]
mhfp6_df = mhfp6_df.dropna() # Drop rows with no fingerprint
mhfp6_df.shape

(53900, 3)

In [21]:
data = []

for idx, mhfp6_fp, best in tqdm(mhfp6_df.values):
    # Convert to dataframe
    t = pd.DataFrame(mhfp6_fp).T
    t.rename(columns=lambda x: "bit" + str(x), inplace=True)

    # Add metadata
    t['cmp_id'] = idx
    t['label'] = best

    data.append(t)

mhfp6_dataframe = pd.concat(data, ignore_index=True)
mhfp6_dataframe.set_index('cmp_id', inplace=True)

100%|██████████| 53900/53900 [00:31<00:00, 1720.75it/s]


In [22]:
mhfp6_dataframe.to_csv("../data/fingerprints/mhfp6.tsv", sep="\t")