In [32]:
import os
import pandas as pd

path = os.path.join("data", "datasets")
file = os.path.join(path, "compounds_ms2structures.csv")
compounds = pd.read_csv(file)
compounds

Unnamed: 0,inchikey,smiles,mass,cf_class,cf_subclass,cf_superclass,formula,npc_class_results,npc_pathway_results,npc_superclass_results
0,AAAQFGUYHFJNHI,CCNC(=O)C[C@H]1C2=NN=C(N2C3=C(C=C(C=C3)OC)C(=N...,423.146204,Benzodiazepines,"1,4-benzodiazepines",Organoheterocyclic compounds,C22H22ClN5O2,,Alkaloids,
1,AABFWJDLCCDJJN,COC1=CC2=C(C=C1)NC3=C2C=CN=C3C4=CC=CC5=CC=CC=C54,324.126264,Harmala alkaloids,,Alkaloids and derivatives,C22H16N2O,Carboline alkaloids,Alkaloids,Tryptophan alkaloids
2,AABILZKQMVKFHP,C/C=C(/C)\C(=O)O[C@H]1CC[N+]2([C@@H]1C(=CC2)CO...,427.220624,,,Alkaloids and derivatives,C21H33NO8,Pyrrolizidine alkaloids,Alkaloids,Ornithine alkaloids
3,AABUHSBGEIUSRJ,CC(=O)NC1=CC=C(C=C1)NC(=O)C=CC2=CC=CC=C2,280.120724,Cinnamic acids and derivatives,Cinnamic acid amides,Phenylpropanoids and polyketides,C17H16N2O2,Cinnamic acid amides,Shikimates and Phenylpropanoids,Phenylpropanoids (C6-C3)
4,AABUKWVVUWBZCS,C1=CC=C(C=C1)C2=C(C(=O)OC3=C2C=CC(=C3)O)C4=CC=...,314.094724,Neoflavonoids,Neoflavones,Phenylpropanoids and polyketides,C21H14O3,Neoflavonoids,Shikimates and Phenylpropanoids,Flavonoids
...,...,...,...,...,...,...,...,...,...,...
37806,ZZYXNRREDYWPLN,C1=CC(=C(N=C1)N)N,109.063998,Pyridines and derivatives,Aminopyridines and derivatives,Organoheterocyclic compounds,C5H7N3,Aminoacids,Amino acids and Peptides,Small peptides
37807,ZZZJZEXRSVMPPV,OCC[NH+](C(C1=CC=CC=C1)=O)CCN(CCN(CCNC(C)=O)C(...,421.518000,Benzene and substituted derivatives,Benzoic acids and derivatives,Benzenoids,C21H32N4O5,,,
37808,ZZZQXCUPAJFVBN,CC(C)C1=NC(=NC(=C1)C2=CC=C(C3=CC=CC=C32)F)N,281.132824,Naphthalenes,,Benzenoids,C17H16FN3,,Alkaloids,
37809,ZZZRUAITSXLWBH,CN(C)[C@@H]1C2CC3CC4=C(C=CC(=C4C(=O)C3C(=O)[C@...,457.193724,Tetracyclines,,Phenylpropanoids and polyketides,C23H27N3O7,Tetracyclines,Polyketides,Polycyclic aromatic polyketides


In [None]:
from rdkit import Chem

supplier = Chem.SmilesMolSupplier(file, smilesColumn=1, delimiter=",")
supplier[0]

In [31]:
# taken from: https://github.com/florian-huber/molecular_fingerprint_comparisons/blob/main/src/fingerprint_computation.py

import numpy as np
from rdkit import Chem
from tqdm import tqdm

def get_mol_from_smiles(smiles):
    try:
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            raise ValueError("MolFromSmiles returned None with default sanitization.")
    except Exception as e:
        print(f"Error processing SMILES {smiles} with default sanitization: {e}")
        print("Retrying with sanitize=False...")
        try:
            mol = Chem.MolFromSmiles(smiles, sanitize=False)
            # Regenerate computed properties like implicit valence and ring information
            mol.UpdatePropertyCache(strict=False)

            # Apply several sanitization rules (taken from http://rdkit.org/docs/Cookbook.html)
            Chem.SanitizeMol(mol,Chem.SanitizeFlags.SANITIZE_FINDRADICALS|Chem.SanitizeFlags.SANITIZE_KEKULIZE\
                                |Chem.SanitizeFlags.SANITIZE_SETAROMATICITY|Chem.SanitizeFlags.SANITIZE_SETCONJUGATION\
                                |Chem.SanitizeFlags.SANITIZE_SETHYBRIDIZATION|Chem.SanitizeFlags.SANITIZE_SYMMRINGS,
                                catchErrors=True)
            if mol is None:
                raise ValueError("MolFromSmiles returned None even with sanitize=False.")
        except Exception as e2:
            print(f"Error processing SMILES {smiles} with sanitize=False: {e2}")
            return None
    return mol

class FingerprintGenerator:
    def __init__(self, fpgen):
        self.fpgen = fpgen

    def fingerprint_from_smiles(self, smiles, count=False, bit_weighing=None):
        """Compute fingerprint from SMILES using the generator attribute.
        
        Parameters:
        smiles (str): The SMILES string of the molecule.
        count (bool): If True, returns the count fingerprint, else the regular fingerprint.

        Returns:
        np.array: The fingerprint as a NumPy array, or None if there's an error.
        """
        if (bit_weighing is not None) and not count:
            raise NotImplementedError("Weighing is currently only implemented for count vectors.")

        mol = get_mol_from_smiles(smiles)
        try:
            if count:
                return self.fpgen.GetCountFingerprintAsNumPy(mol)
            fp = self.fpgen.GetFingerprintAsNumPy(mol)
            if bit_weighing is None:
                return fp
            # elif bit_scaling.lower() == "log":
            #     return np.log(1 + fp)
            else:
                raise ValueError("Expected bit_scaling to be 'log' or 'None'.")
        except Exception as e:
            print(f"Error processing SMILES {smiles}: {e}")
            return None

def compute_fingerprints_from_smiles(
        smiles_lst,
        fpgen,
        count=True,
        # sparse=True,
        # bit_scaling=None,
        # bit_weighing=None,
        progress_bar=False,
        ):
    # if sparse:
    #     fp_generator = SparseFingerprintGenerator(fpgen)
    # else:
    fp_generator = FingerprintGenerator(fpgen)
    
    fingerprints = []
    for i, smiles in tqdm(enumerate(smiles_lst), total=len(smiles_lst), disable=(not progress_bar)):
        # if sparse:
        #     fp = fp_generator.fingerprint_from_smiles(smiles, count, bit_scaling, bit_weighing)
        # else:
        fp = fp_generator.fingerprint_from_smiles(smiles, count)#, bit_weighing)
        if fp is None:
            print(f"Missing fingerprint for element {i}: {smiles}")
        else:
            fingerprints.append(fp)
    # if sparse:
    #     return fingerprints
    return np.stack(fingerprints)

In [33]:
from rdkit.Chem import rdFingerprintGenerator

mfpgen = rdFingerprintGenerator.GetMorganGenerator(radius=6)
fingerprints = compute_fingerprints_from_smiles(compounds.smiles, fpgen=mfpgen, count=True, progress_bar=True)

100%|██████████| 37811/37811 [00:17<00:00, 2151.90it/s]


In [36]:
fingerprints[0].nonzero()

(array([  15,   37,   41,   42,   46,   49,   52,   55,   80,  106,  157,
         166,  168,  197,  216,  224,  233,  259,  287,  294,  322,  326,
         337,  352,  368,  378,  379,  383,  394,  420,  424,  444,  461,
         482,  507,  520,  561,  582,  583,  631,  637,  650,  681,  695,
         703,  718,  724,  781,  807,  816,  841,  864,  875,  885,  896,
         917,  935,  938,  941,  947,  956, 1004, 1019, 1032, 1056, 1057,
        1070, 1100, 1146, 1152, 1160, 1164, 1185, 1192, 1216, 1295, 1333,
        1334, 1369, 1380, 1419, 1426, 1505, 1521, 1536, 1571, 1598, 1600,
        1624, 1663, 1683, 1696, 1705, 1710, 1718, 1722, 1726, 1731, 1750,
        1829, 1846, 1873, 1910, 1915, 1917, 1934, 1972, 1977, 1979, 1993,
        2004], dtype=int64),)

In [37]:
from matchms.similarity.FingerprintSimilarity import jaccard_similarity_matrix

similarity_matrix = jaccard_similarity_matrix(fingerprints, fingerprints)

  jaccard = np.nan_to_num(intersection / union)  # R,Q


In [None]:
import numpy as np
from rdkit.Chem import DataStructs
from rdkit.Chem.rdFingerprintGenerator import FingeprintGenerator64, FingeprintGenerator32
from typing import Callable

def numpy_fingerprint(mol: Chem.Mol, fp_function: Callable) -> np.ndarray: 
    if mol is None:
        return None
    fp = fp_function(mol)
    array = np.zeros((0,), dtype=np.int8)
    DataStructs.ConvertToNumpyArray(fp, array)
    return array

In [None]:
import numpy
from rdkit.Chem import rdFingerprintGenerator
from rdkit.Chem import DataStructs

mfpgen = rdFingerprintGenerator.GetMorganGenerator(radius=6)
np_fp = numpy_fingerprint(mol=supplier[0], fp_function=mfpgen.GetCountFingerprint)
for bit in np_fp.nonzero()[0]:
    print(np_fp[bit], bit)

In [None]:
fingerprints = np.zeros((len(supplier), mfpgen.GetNumBits()), dtype=np.int8)
for i, mol in enumerate(supplier):
    if mol is None:
        continue
    fingerprints[i] = mfpgen.GetCountFingerprintAsNumPy(mol)

numpy.ndarray