# Fingerprint generator

This notebook is used to generate the fingerprints from RDKit that are used as training data for the ML models.

In [1]:
import os
import pandas as pd
import numpy as np
from tqdm import tqdm

from rdkit import Chem
from rdkit.Chem import rdFingerprintGenerator, MACCSkeys, DataStructs
from rdkit.Chem import rdMolDescriptors, Descriptors, GraphDescriptors

from mhfp.encoder import MHFPEncoder

# Load dataset

In [2]:
amr_df = pd.read_csv("../data/processed/combined_bioassay_data.tsv", sep="\t")
amr_df.head()

Unnamed: 0,compound_inchikey,compound_smiles,compound_source,gram-positive,gram-negative,fungi,acid-fast,chemical_class,compound_superclass,compound_pathway,best_class
0,OOYGSFOGFJDDHP-KMCOLRRFSA-N,NC[C@H]1O[C@H](O[C@H]2[C@H](O)[C@@H](O[C@H]3O[...,chembl_33,5.0,5.0,,6.0,"['Amino cyclitols', 'Aminoglycosides']","['Aminosugars and aminoglycosides', 'Polyols']",['Carbohydrates'],acid-fast
1,XIPHLJFTBFXVBS-UHFFFAOYSA-N,C=C(C(=O)c1ccc(F)cc1)c1ccc(Cl)cc1Cl,chembl_33,,,6.0,,['Chalcones'],['Flavonoids'],['Shikimates and Phenylpropanoids'],fungi
2,OEFUWWDPRNNMRS-WDPNLORNSA-N,CC[C@H]1OC(=O)[C@H](C)[C@H]2OC3(CCN(C(=O)c4ccc...,chembl_33,6.0,6.5,,,['Erythromycins'],['Macrolides'],['Polyketides'],gram-negative
3,LBRXTHSVIORIGU-OLROFJLRSA-N,CC[C@H]1OC(=O)[C@H](C)[C@H]2OC3(CCN(C(=O)c4cnc...,chembl_33,6.0,6.0,,,['Erythromycins'],['Macrolides'],['Polyketides'],gram-positive
4,PHYLUFIYANLQSE-UHFFFAOYSA-N,CN1Cc2csc3c(C(=O)O)c(=O)c4cc(F)c(N5CCOC(CF)C5)...,chembl_33,7.5,7.0,,,[],['Tryptophan alkaloids'],['Alkaloids'],gram-positive


# Generate the different fingerprints

In [3]:
mfpgen = rdFingerprintGenerator.GetMorganGenerator(radius=4, fpSize=1024) # ECFP4 fingerprint
rdkgen = rdFingerprintGenerator.GetRDKitFPGenerator(fpSize=1024) # RDKit fingerprint

In [4]:
# Disable warnings
from rdkit import RDLogger
RDLogger.DisableLog("rdApp.*")

In [5]:
ecfp_fingerprints = []
rdkit_fingerprints = []
maccs_fingerprints = []
chem_phys_props = []

for inchikey, smiles in tqdm(amr_df[["compound_inchikey", "compound_smiles"]].values):
    # Canonicalize the smiles
    try:
        can_smiles = Chem.CanonSmiles(smiles)
    except:
        can_smiles = smiles

    # Generate the mol object
    mol = Chem.MolFromSmiles(can_smiles)

    if not mol:
        ecfp_fingerprints.append(None)
        rdkit_fingerprints.append(None)
        maccs_fingerprints.append(None)
        chem_phys_props.append(None)
        inchikey_list.append(inchikey)
        continue

    ecfp_fingerprints.append(mfpgen.GetFingerprint(mol))
    rdkit_fingerprints.append(rdkgen.GetFingerprint(mol))
    maccs_fingerprints.append(MACCSkeys.GenMACCSKeys(mol))

    vals = Descriptors.CalcMolDescriptors(mol)

    chem_phys_props.append({
        'slogp': round(vals['MolLogP'], 2),
        'smr': round(vals['MolMR'], 2),
        'labute_asa': round(vals['LabuteASA'], 2),
        'tpsa': round(vals['TPSA'], 2),
        'exact_mw': round(vals['ExactMolWt'], 2),
        'num_lipinski_hba': rdMolDescriptors.CalcNumLipinskiHBA(mol),
        'num_lipinski_hbd': rdMolDescriptors.CalcNumLipinskiHBD(mol),
        'num_rotatable_bonds': vals['NumRotatableBonds'],
        'num_hba': vals['NumHAcceptors'],
        'num_hbd': vals['NumHDonors'],
        'num_amide_bonds': rdMolDescriptors.CalcNumAmideBonds(mol),
        'num_heteroatoms': vals['NumHeteroatoms'],
        'num_heavy_atoms': vals['HeavyAtomCount'],
        'num_atoms': rdMolDescriptors.CalcNumAtoms(mol),
        'num_stereocenters': rdMolDescriptors.CalcNumAtomStereoCenters(mol),
        'num_unspecified_stereocenters': rdMolDescriptors.CalcNumUnspecifiedAtomStereoCenters(mol),
        'num_rings': vals['RingCount'],
        'num_aromatic_rings': vals['NumAromaticRings'],
        'num_aliphatic_rings': vals['NumAliphaticRings'],
        'num_saturated_rings': vals['NumSaturatedRings'],
        'num_aromatic_heterocycles': vals['NumAromaticHeterocycles'],
        'num_aliphatic_heterocycles': vals['NumAliphaticHeterocycles'],
        'num_saturated_heterocycles': vals['NumSaturatedHeterocycles'],
        'num_aromatic_carbocycles': vals['NumAromaticCarbocycles'],
        'num_aliphatic_carbocycles': vals['NumAliphaticCarbocycles'],
        'num_saturated_carbocycles': vals['NumSaturatedCarbocycles'],
        'fraction_csp3': round(vals['FractionCSP3'], 2),
        'num_brdigehead_atoms': rdMolDescriptors.CalcNumBridgeheadAtoms(mol),
        'bertz_complexity': GraphDescriptors.BertzCT(mol),
    })

100%|██████████| 77442/77442 [1:17:52<00:00, 16.57it/s]  


In [6]:
amr_df['ecfp4'] = ecfp_fingerprints
amr_df['rdkit'] = rdkit_fingerprints
amr_df['maccs'] = maccs_fingerprints
amr_df['chem_phys'] = chem_phys_props

In [7]:
# minihash fingerprint
mhfp_encoder = MHFPEncoder()

mhfp_fingerprints = []

for smiles in tqdm(amr_df["compound_smiles"]):
    # Canonicalize the smiles
    try:
        can_smiles = Chem.CanonSmiles(smiles)
    except:
        can_smiles = smiles

    # Generate fingerprint
    try:
        fp = mhfp_encoder.encode(can_smiles)
    except:
        fp = None
    
    mhfp_fingerprints.append(fp)

amr_df['mhfp'] = mhfp_fingerprints

100%|██████████| 77442/77442 [22:56<00:00, 56.24it/s]  


In [8]:
amr_df.head(1)

Unnamed: 0,compound_inchikey,compound_smiles,compound_source,gram-positive,gram-negative,fungi,acid-fast,chemical_class,compound_superclass,compound_pathway,best_class,ecfp4,rdkit,maccs,chem_phys,mhfp
0,OOYGSFOGFJDDHP-KMCOLRRFSA-N,NC[C@H]1O[C@H](O[C@H]2[C@H](O)[C@@H](O[C@H]3O[...,chembl_33,5.0,5.0,,6.0,"['Amino cyclitols', 'Aminoglycosides']","['Aminosugars and aminoglycosides', 'Polyols']",['Carbohydrates'],acid-fast,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ...","[1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","{'slogp': -7.94, 'smr': 121.97, 'labute_asa': ...","[53109374, 13294028, 17313015, 13405020, 15956..."


In [9]:
amr_df.to_pickle("../data/processed/full_data_with_fingerprints.pkl")

# Formatting data for each fingerprint

Each fingerprint vector is now converted to bit columns i.e. one bit per column.

In [11]:
os.makedirs("../data/fingerprints", exist_ok=True)

### ECFP4

In [12]:
ecfp4_df = amr_df[['ecfp4', 'best_class', 'compound_inchikey']]
ecfp4_df = ecfp4_df.dropna() # Drop rows with no fingerprint
ecfp4_df.shape

(77442, 3)

In [14]:
data = []

for ecfp_fp, best, inchikey in tqdm(ecfp4_df.values, total=ecfp4_df.shape[0]):
    # Convert fingerprint to numpy array
    fp = np.zeros((0,), dtype=int)
    DataStructs.ConvertToNumpyArray(ecfp_fp, fp)

    # Convert to dataframe
    t = pd.DataFrame(fp).T
    t.rename(columns=lambda x: "bit" + str(x), inplace=True)

    # Add metadata
    t['cmp_id'] = inchikey
    t['label'] = best

    data.append(t)

ecfp4_dataframe = pd.concat(data, ignore_index=True)
ecfp4_dataframe.set_index('cmp_id', inplace=True)

100%|██████████| 77442/77442 [02:29<00:00, 517.06it/s]


In [15]:
ecfp4_dataframe.to_csv("../data/fingerprints/combined_ecfp4.tsv", sep="\t")

### RDKit

In [16]:
rdkit_df = amr_df[['rdkit', 'best_class', 'compound_inchikey']]
rdkit_df = rdkit_df.dropna() # Drop rows with no fingerprint
rdkit_df.shape

(77442, 3)

In [17]:
data = []

for rdkit_fp, best, inchikey in tqdm(rdkit_df.values, total=rdkit_df.shape[0]):
    # Convert fingerprint to numpy array
    fp = np.zeros((0,), dtype=int)
    DataStructs.ConvertToNumpyArray(rdkit_fp, fp)

    # Convert to dataframe
    t = pd.DataFrame(fp).T
    t.rename(columns=lambda x: "bit" + str(x), inplace=True)

    # Add metadata
    t['cmp_id'] = inchikey
    t['label'] = best

    data.append(t)

rdkit_dataframe = pd.concat(data, ignore_index=True)
rdkit_dataframe.set_index('cmp_id', inplace=True)

100%|██████████| 77442/77442 [02:22<00:00, 544.76it/s]


In [18]:
rdkit_dataframe.to_csv("../data/fingerprints/combined_rdkit.tsv", sep="\t")

### MACCS

In [19]:
maccs_df = amr_df[['maccs', 'best_class', 'compound_inchikey']]
maccs_df = maccs_df.dropna() # Drop rows with no fingerprint
maccs_df.shape

(77442, 3)

In [20]:
data = []

for maccs_fp, best, inchikey in tqdm(maccs_df.values, total=maccs_df.shape[0]):
    # Convert fingerprint to numpy array
    fp = np.zeros((0,), dtype=int)
    DataStructs.ConvertToNumpyArray(maccs_fp, fp)

    # Convert to dataframe
    t = pd.DataFrame(fp).T
    t.rename(columns=lambda x: "bit" + str(x), inplace=True)

    # Add metadata
    t['cmp_id'] = inchikey
    t['label'] = best

    data.append(t)

maccs_dataframe = pd.concat(data, ignore_index=True)
maccs_dataframe.set_index('cmp_id', inplace=True)

100%|██████████| 77442/77442 [01:32<00:00, 833.87it/s]


In [21]:
maccs_dataframe.to_csv("../data/fingerprints/combined_maccs.tsv", sep="\t")

### MHFP6

In [22]:
mhfp6_df = amr_df[['mhfp', 'best_class', 'compound_inchikey']]
mhfp6_df = mhfp6_df.dropna() # Drop rows with no fingerprint
mhfp6_df.shape

(77442, 3)

In [23]:
data = []

for mhfp6_fp, best, inchikey in tqdm(mhfp6_df.values, total=mhfp6_df.shape[0]):
    # Convert to dataframe
    t = pd.DataFrame(mhfp6_fp).T
    t.rename(columns=lambda x: "bit" + str(x), inplace=True)

    # Add metadata
    t['cmp_id'] = inchikey
    t['label'] = best

    data.append(t)

mhfp6_dataframe = pd.concat(data, ignore_index=True)
mhfp6_dataframe.set_index('cmp_id', inplace=True)

100%|██████████| 77442/77442 [03:30<00:00, 367.24it/s]


In [24]:
mhfp6_dataframe.to_csv("../data/fingerprints/combined_mhfp6.tsv", sep="\t")

### ChemPhys

In [25]:
chemphys_df = amr_df[['chem_phys', 'best_class', 'compound_inchikey']]
chemphys_df = chemphys_df.dropna() # Drop rows with no fingerprint
chemphys_df.shape

(77442, 3)

In [26]:
data = []

for chemphys, best, inchikey in tqdm(chemphys_df.values, total=chemphys_df.shape[0]):
    # Convert to dataframe
    t = pd.DataFrame([chemphys])

    # Add metadata
    t['cmp_id'] = inchikey
    t['label'] = best

    data.append(t)

chemphys_dataframe = pd.concat(data, ignore_index=True)
chemphys_dataframe.set_index('cmp_id', inplace=True)

100%|██████████| 77442/77442 [03:04<00:00, 419.86it/s]


In [27]:
chemphys_dataframe.to_csv("../data/fingerprints/combined_chemphys.tsv", sep="\t")