# Fingerprint generator

This notebook is used to generate the fingerprints from RDKit that are used as training data for the ML models.

In [1]:
import os
import pandas as pd
import numpy as np
from tqdm import tqdm

from rdkit import Chem
from rdkit.Chem import (
    rdFingerprintGenerator,
    MACCSkeys,
    DataStructs,
    rdMolDescriptors,
    Descriptors,
    GraphDescriptors,
    rdReducedGraphs,
)

from mhfp.encoder import MHFPEncoder

In [2]:
# Disable warnings
from rdkit import RDLogger
import warnings

RDLogger.DisableLog("rdApp.*")
warnings.filterwarnings("ignore")

# Load dataset

In [3]:
amr_df = pd.read_csv("../data/processed/combined_bioassay_data.tsv", sep="\t")
amr_df.head()

Unnamed: 0,compound_inchikey,compound_smiles,compound_source,gram-positive,gram-negative,fungi,acid-fast,chemical_class,compound_superclass,compound_pathway,best_class
0,OOYGSFOGFJDDHP-KMCOLRRFSA-N,NC[C@H]1O[C@H](O[C@H]2[C@H](O)[C@@H](O[C@H]3O[...,chembl_34,5.0,5.0,,6.0,"['Amino cyclitols', 'Aminoglycosides']","['Aminosugars and aminoglycosides', 'Polyols']",['Carbohydrates'],acid-fast
1,XIPHLJFTBFXVBS-UHFFFAOYSA-N,C=C(C(=O)c1ccc(F)cc1)c1ccc(Cl)cc1Cl,chembl_34,,,6.0,,['Chalcones'],['Flavonoids'],['Shikimates and Phenylpropanoids'],fungi
2,OEFUWWDPRNNMRS-WDPNLORNSA-N,CC[C@H]1OC(=O)[C@H](C)[C@H]2OC3(CCN(C(=O)c4ccc...,chembl_34,6.0,6.5,,,['Erythromycins'],['Macrolides'],['Polyketides'],gram-negative
3,LBRXTHSVIORIGU-OLROFJLRSA-N,CC[C@H]1OC(=O)[C@H](C)[C@H]2OC3(CCN(C(=O)c4cnc...,chembl_34,6.0,6.0,,,['Erythromycins'],['Macrolides'],['Polyketides'],gram-positive
4,PHYLUFIYANLQSE-UHFFFAOYSA-N,CN1Cc2csc3c(C(=O)O)c(=O)c4cc(F)c(N5CCOC(CF)C5)...,chembl_34,7.5,7.0,,,[],['Tryptophan alkaloids'],['Alkaloids'],gram-positive


# Generate the different fingerprints

In [4]:
if os.path.exists("../data/processed/full_data_with_fingerprints.pkl"):
    full_data = pd.read_pickle("../data/processed/full_data_with_fingerprints.pkl")
else:
    full_data = pd.DataFrame()

In [5]:
if full_data.empty:
    mfpgen = rdFingerprintGenerator.GetMorganGenerator(
        radius=4, fpSize=1024
    )  # ECFP4 fingerprint
    rdkgen = rdFingerprintGenerator.GetRDKitFPGenerator(
        fpSize=1024
    )  # RDKit fingerprint
    mhfp_encoder = MHFPEncoder(n_permutations=2048, seed=42)  # MHFP6 fingerprint

    ecfp_fingerprints = []
    rdkit_fingerprints = []
    maccs_fingerprints = []
    mhfp6_fingerprints = []
    erg_fingerprints = []
    chem_phys_props = []

    for inchikey, smiles in tqdm(
        amr_df[["compound_inchikey", "compound_smiles"]].values
    ):
        # Canonicalize the smiles
        try:
            can_smiles = Chem.CanonSmiles(smiles)
        except:
            can_smiles = smiles

        # Generate the mol object
        mol = Chem.MolFromSmiles(can_smiles)

        if not mol:
            ecfp_fingerprints.append(None)
            rdkit_fingerprints.append(None)
            maccs_fingerprints.append(None)
            chem_phys_props.append(None)
            mhfp_encoder.append(None)
            erg_fingerprints.append(None)
            continue

        ecfp_fingerprints.append(mfpgen.GetFingerprint(mol))
        rdkit_fingerprints.append(rdkgen.GetFingerprint(mol))
        maccs_fingerprints.append(MACCSkeys.GenMACCSKeys(mol))
        mhfp6_fingerprints.append(mhfp_encoder.encode(can_smiles, radius=3))
        erg_fingerprints.append(rdReducedGraphs.GetErGFingerprint(mol))

        vals = Descriptors.CalcMolDescriptors(mol)

        chem_phys_props.append(
            {
                "slogp": round(vals["MolLogP"], 2),
                "smr": round(vals["MolMR"], 2),
                "labute_asa": round(vals["LabuteASA"], 2),
                "tpsa": round(vals["TPSA"], 2),
                "exact_mw": round(vals["ExactMolWt"], 2),
                "num_lipinski_hba": rdMolDescriptors.CalcNumLipinskiHBA(mol),
                "num_lipinski_hbd": rdMolDescriptors.CalcNumLipinskiHBD(mol),
                "num_rotatable_bonds": vals["NumRotatableBonds"],
                "num_hba": vals["NumHAcceptors"],
                "num_hbd": vals["NumHDonors"],
                "num_amide_bonds": rdMolDescriptors.CalcNumAmideBonds(mol),
                "num_heteroatoms": vals["NumHeteroatoms"],
                "num_heavy_atoms": vals["HeavyAtomCount"],
                "num_atoms": rdMolDescriptors.CalcNumAtoms(mol),
                "num_stereocenters": rdMolDescriptors.CalcNumAtomStereoCenters(mol),
                "num_unspecified_stereocenters": rdMolDescriptors.CalcNumUnspecifiedAtomStereoCenters(
                    mol
                ),
                "num_rings": vals["RingCount"],
                "num_aromatic_rings": vals["NumAromaticRings"],
                "num_aliphatic_rings": vals["NumAliphaticRings"],
                "num_saturated_rings": vals["NumSaturatedRings"],
                "num_aromatic_heterocycles": vals["NumAromaticHeterocycles"],
                "num_aliphatic_heterocycles": vals["NumAliphaticHeterocycles"],
                "num_saturated_heterocycles": vals["NumSaturatedHeterocycles"],
                "num_aromatic_carbocycles": vals["NumAromaticCarbocycles"],
                "num_aliphatic_carbocycles": vals["NumAliphaticCarbocycles"],
                "num_saturated_carbocycles": vals["NumSaturatedCarbocycles"],
                "fraction_csp3": round(vals["FractionCSP3"], 2),
                "num_brdigehead_atoms": rdMolDescriptors.CalcNumBridgeheadAtoms(mol),
                "bertz_complexity": GraphDescriptors.BertzCT(mol),
            }
        )

    amr_df["ecfp4"] = ecfp_fingerprints
    amr_df["rdkit"] = rdkit_fingerprints
    amr_df["maccs"] = maccs_fingerprints
    amr_df["chem_phys"] = chem_phys_props
    amr_df["mhfp6"] = mhfp6_fingerprints
    amr_df["erg"] = erg_fingerprints
    amr_df.to_pickle("../data/processed/full_data_with_fingerprints.pkl")
    full_data = amr_df

100%|██████████| 74202/74202 [38:17<00:00, 32.29it/s]   


# Formatting data for each fingerprint

Each fingerprint vector is now converted to bit columns i.e. one bit per column.

In [6]:
os.makedirs("../data/fingerprints", exist_ok=True)

In [7]:
full_data.head(2)

Unnamed: 0,compound_inchikey,compound_smiles,compound_source,gram-positive,gram-negative,fungi,acid-fast,chemical_class,compound_superclass,compound_pathway,best_class,ecfp4,rdkit,maccs,chem_phys,mhfp6,erg
0,OOYGSFOGFJDDHP-KMCOLRRFSA-N,NC[C@H]1O[C@H](O[C@H]2[C@H](O)[C@@H](O[C@H]3O[...,chembl_34,5.0,5.0,,6.0,"['Amino cyclitols', 'Aminoglycosides']","['Aminosugars and aminoglycosides', 'Polyols']",['Carbohydrates'],acid-fast,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ...","[1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","{'slogp': -7.94, 'smr': 121.97, 'labute_asa': ...","[53109374, 13294028, 17313015, 13405020, 15956...","[0.3, 2.2, 6.3999999999999995, 9.4, 7.29999999..."
1,XIPHLJFTBFXVBS-UHFFFAOYSA-N,C=C(C(=O)c1ccc(F)cc1)c1ccc(Cl)cc1Cl,chembl_34,,,6.0,,['Chalcones'],['Flavonoids'],['Shikimates and Phenylpropanoids'],fungi,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ...","[0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","{'slogp': 5.03, 'smr': 76.23, 'labute_asa': 11...","[2376200, 75861701, 8411880, 265132626, 171838...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [8]:
for fingerprint_name in ["ecfp4", "rdkit", "maccs", "chem_phys", "mhfp6", "erg"]:
    if os.path.exists(f"../data/fingerprints/combined_{fingerprint_name}.tsv"):
        print(f"Skipping {fingerprint_name}! File already exists.")
        continue

    fingerprint_df = full_data[
        [fingerprint_name, "best_class", "compound_inchikey"]
    ].dropna()
    print(f"{fingerprint_name}: {fingerprint_df.shape}")

    data = []

    if fingerprint_name == "chem_phys":
        for chemphys, best, inchikey in tqdm(fingerprint_df.values):
            # Convert to dataframe
            t = pd.DataFrame([chemphys])

            # Add metadata
            t["cmp_id"] = inchikey
            t["label"] = best

            data.append(t)

    elif fingerprint_name in ["mhfp6", "erg"]:
        for mhfp, best, inchikey in tqdm(fingerprint_df.values):
            # Convert to dataframe
            t = pd.DataFrame(mhfp).T
            t.rename(columns=lambda x: "bit" + str(x), inplace=True)

            # Add metadata
            t["cmp_id"] = inchikey
            t["label"] = best

            data.append(t)

    else:
        data = []

        for fp_list, best, inchikey in tqdm(fingerprint_df.values):
            # Convert fingerprint to numpy array
            fp = np.zeros((0,), dtype=int)
            DataStructs.ConvertToNumpyArray(fp_list, fp)

            # Convert to dataframe
            t = pd.DataFrame(fp).T
            t.rename(columns=lambda x: "bit" + str(x), inplace=True)

            # Add metadata
            t["cmp_id"] = inchikey
            t["label"] = best

            data.append(t)

    tmp_dataframe = pd.concat(data, ignore_index=True)

    tmp_dataframe.to_csv(
        f"../data/fingerprints/combined_{fingerprint_name}.tsv", sep="\t", index=False
    )

ecfp4: (74202, 3)


100%|██████████| 74202/74202 [00:33<00:00, 2212.63it/s]


rdkit: (74202, 3)


100%|██████████| 74202/74202 [00:35<00:00, 2097.75it/s]


maccs: (74202, 3)


100%|██████████| 74202/74202 [00:21<00:00, 3472.98it/s]


chem_phys: (74202, 3)


100%|██████████| 74202/74202 [00:28<00:00, 2637.54it/s]


mhfp6: (74202, 3)


100%|██████████| 74202/74202 [00:47<00:00, 1563.12it/s]


erg: (74202, 3)


100%|██████████| 74202/74202 [00:23<00:00, 3206.02it/s]
