In [183]:
import pandas as pd
import numpy as np
from typing import Optional

from rdkit import Chem
from rdkit.Chem import Descriptors, DataStructs

from rdkit.Chem import AllChem, Descriptors
from rdkit.Chem.Fingerprints import FingerprintMols
import pickle

In [184]:
class MoleculeLoader:
    """
    Lazily loads a CSV file of scored molecules and exposes it as a processed DataFrame.

    Nothing is read from disk until the `df` property is first accessed. Processing
    adds a `molecules` column (built by applying `Chem.MolFromSmiles` to the SMILES
    column), renames the SMILES and score columns to `SMILES` and `scores`, and
    sorts the rows by `scores` in ascending order.
    """
    def __init__(self, datafile: str, smi_col: str, scores_col: str) -> None:
        """
        Initializes the MoleculeLoader with a data file.

        Args:
            - datafile (str): The name of the CSV file to load.
            - smi_col (str): The name of the column containing SMILES strings in the `DataFrame`.
            - scores_col (str): The name of the column containing the docking scores in the `DataFrame`.
        """
        self.datafile = datafile
        self.smi_col = smi_col
        self.scores_col = scores_col
        # Populated on first access of the `df` property.
        self._df = None

    def _load_data(self) -> pd.DataFrame:
        """
        Reads the data file into `self._df`.

        This method is called automatically when accessing the `df` property
        if the data has not been loaded yet.

        Returns:
            - pd.DataFrame: The loaded (not yet processed) data.
        """
        self._df = pd.read_csv(self.datafile)
        return self._df

    def _process_data(self) -> None:
        """
        Processes the loaded DataFrame and stores the result in `self._df`.

        Steps:
            1. Add a `molecules` column parsed from the SMILES column, unless a
               column with that name already exists.
            2. Rename the score column to `scores` and the SMILES column to `SMILES`.
            3. Sort the rows by `scores` in ascending order (original index labels are kept).
        """
        if 'molecules' not in self._df.columns:
            self._df['molecules'] = self._df[self.smi_col].apply(Chem.MolFromSmiles)

        renamed = (
            self._df
            .rename(columns={self.scores_col: "scores"})
            .rename(columns={self.smi_col: "SMILES"})
        )

        # sort_values returns a new frame; the result must be kept, otherwise the
        # sort is a no-op. mergesort is stable, so tied scores keep their file order.
        self._df = renamed.sort_values(by="scores", kind="mergesort")

    @property
    def df(self) -> pd.DataFrame:
        """
        Provides access to the loaded and processed DataFrame.

        If the data has not been loaded yet, it is loaded and processed
        automatically the first time this property is accessed.

        Returns:
            - pd.DataFrame: The processed DataFrame (reflecting any filters applied so far).
        """
        if self._df is None:
            self._load_data()
            self._process_data()
        return self._df

    def _keep_molecules(self, predicate) -> "MoleculeLoader":
        """
        Keeps only the rows whose molecule satisfies `predicate`.

        Rows whose `molecules` entry is None (RDKit could not parse the SMILES)
        are dropped instead of being passed to `predicate`.

        Args:
            - predicate: Callable taking a molecule and returning a truthy/falsy value.

        Returns:
            - MoleculeLoader: This loader, to allow chaining.

        Raises:
            - ValueError: If the data has not been loaded yet.
        """
        if self._df is None:
            raise ValueError("DataFrame is not loaded. Please load the data using .df on your object.")

        # astype(bool) guarantees a boolean mask even when the frame is empty
        # (apply on an empty object column would otherwise yield an object Series).
        mask = self._df['molecules'].apply(
            lambda mol: mol is not None and bool(predicate(mol))
        ).astype(bool)
        self._df = self._df[mask]
        return self

    def filter_by_mol_wt(self, MolWt: float) -> "MoleculeLoader":
        """
        Keeps only molecules whose `Descriptors.MolWt` value is strictly below `MolWt`.

        Args:
            - MolWt (float): Exclusive upper bound on the molecular weight.

        Returns:
            - MoleculeLoader: This loader (filtered in place), to allow chaining.

        Raises:
            - ValueError: If the data has not been loaded yet.
        """
        return self._keep_molecules(lambda mol: Descriptors.MolWt(mol) < MolWt)

    def filter_by_num_atoms(self, max_atoms: int) -> "MoleculeLoader":
        """
        Keeps only molecules whose `GetNumAtoms()` is at most `max_atoms`.

        Args:
            - max_atoms (int): Inclusive upper bound on the atom count.

        Returns:
            - MoleculeLoader: This loader (filtered in place), to allow chaining.

        Raises:
            - ValueError: If the data has not been loaded yet.
        """
        return self._keep_molecules(lambda mol: mol.GetNumAtoms() <= max_atoms)

In [185]:
# Constructing the loader only stores the file name and column names;
# the CSV is not read until `.df` is first accessed.
loader = MoleculeLoader("10K.csv", 'SMILES', 'r_i_docking_score')

In [186]:
# First access of `.df` triggers the CSV read and processing (adds `molecules`,
# renames the score column to `scores`).
loader.df

Unnamed: 0.1,Unnamed: 0,i_i_glide_lignum,scores,SMILES,molecules
0,0,13477,-13.126800,C[C@@H](NC(=O)N1C[C@H](c2ccccc2)[C@H]2COCC[C@H...,<rdkit.Chem.rdchem.Mol object at 0x00000266F67...
1,1,516,-12.662900,O=C(Nc1cccc(C(=O)N2CCC(c3c[nH]c4ncccc34)CC2)c1...,<rdkit.Chem.rdchem.Mol object at 0x00000266F67...
2,2,9425,-12.487000,Cc1nn(-c2ccccc2)c2nc(C(=O)N3CCC([C@H]4C(=O)Nc5...,<rdkit.Chem.rdchem.Mol object at 0x00000266F67...
3,3,14512,-12.483500,Cc1cc(C)cc(C(=O)N2CCC[C@H](C(=O)NCc3cccc([C@@]...,<rdkit.Chem.rdchem.Mol object at 0x00000266F67...
4,4,9615,-12.478500,CS(=O)(=O)c1ccc(F)c(C(=O)Nc2ccc(-c3nc(-c4ccccc...,<rdkit.Chem.rdchem.Mol object at 0x00000266F67...
...,...,...,...,...,...
9893,9893,2451,-0.560302,O=C([O-])C(F)(F)F,<rdkit.Chem.rdchem.Mol object at 0x00000266EC4...
9894,9894,7813,-0.384887,Cc1cnn(C)c1S(=O)(=O)NC[C@@H](Cc1ccccn1)C(=O)[O-],<rdkit.Chem.rdchem.Mol object at 0x00000266EC4...
9895,9895,11442,-0.269109,Cn1cccc(C(=O)N2CCN(C3CC3)c3ccc(Cl)cc32)c1=O,<rdkit.Chem.rdchem.Mol object at 0x00000266EC4...
9896,9896,13971,-0.150473,O=C([O-])[C@H]1CC[C@@H](C(=O)N2CCCc3ccccc32)CC1,<rdkit.Chem.rdchem.Mol object at 0x00000266EC4...


In [187]:
# Second, independent loader over the same file; used below to try the filters
# without touching `loader`.
loader2 = MoleculeLoader("10K.csv", 'SMILES', 'r_i_docking_score')

In [188]:
# The filter methods raise ValueError if the data is not loaded, so `.df` must be
# accessed once before filtering.
loader2.df

Unnamed: 0.1,Unnamed: 0,i_i_glide_lignum,scores,SMILES,molecules
0,0,13477,-13.126800,C[C@@H](NC(=O)N1C[C@H](c2ccccc2)[C@H]2COCC[C@H...,<rdkit.Chem.rdchem.Mol object at 0x00000266E90...
1,1,516,-12.662900,O=C(Nc1cccc(C(=O)N2CCC(c3c[nH]c4ncccc34)CC2)c1...,<rdkit.Chem.rdchem.Mol object at 0x00000266E90...
2,2,9425,-12.487000,Cc1nn(-c2ccccc2)c2nc(C(=O)N3CCC([C@H]4C(=O)Nc5...,<rdkit.Chem.rdchem.Mol object at 0x00000266E90...
3,3,14512,-12.483500,Cc1cc(C)cc(C(=O)N2CCC[C@H](C(=O)NCc3cccc([C@@]...,<rdkit.Chem.rdchem.Mol object at 0x00000266E90...
4,4,9615,-12.478500,CS(=O)(=O)c1ccc(F)c(C(=O)Nc2ccc(-c3nc(-c4ccccc...,<rdkit.Chem.rdchem.Mol object at 0x00000266E90...
...,...,...,...,...,...
9893,9893,2451,-0.560302,O=C([O-])C(F)(F)F,<rdkit.Chem.rdchem.Mol object at 0x00000266F68...
9894,9894,7813,-0.384887,Cc1cnn(C)c1S(=O)(=O)NC[C@@H](Cc1ccccn1)C(=O)[O-],<rdkit.Chem.rdchem.Mol object at 0x00000266F68...
9895,9895,11442,-0.269109,Cn1cccc(C(=O)N2CCN(C3CC3)c3ccc(Cl)cc32)c1=O,<rdkit.Chem.rdchem.Mol object at 0x00000266F68...
9896,9896,13971,-0.150473,O=C([O-])[C@H]1CC[C@@H](C(=O)N2CCCc3ccccc32)CC1,<rdkit.Chem.rdchem.Mol object at 0x00000266F68...


In [189]:
# Keep molecules with GetNumAtoms() <= 17 and Descriptors.MolWt < 300.
# Each filter replaces loader2's internal frame and returns loader2 itself,
# which is why the calls chain and the cell output is the loader's repr.
loader2.filter_by_num_atoms(17).filter_by_mol_wt(300)

<__main__.MoleculeLoader at 0x266e914fe80>

In [190]:
# `.df` now returns the filtered frame; the original index labels are retained.
loader2.df

Unnamed: 0.1,Unnamed: 0,i_i_glide_lignum,scores,SMILES,molecules
3888,3888,2813,-8.358960,O=C(NS(=O)(=O)[C@@H]1CCOC1)[C@H]1C[C@@H]1C1CC1,<rdkit.Chem.rdchem.Mol object at 0x00000266E70...
3974,3974,9256,-8.329660,O=S(=O)(C[C@@H]1CCCO1)Nc1ccccc1O,<rdkit.Chem.rdchem.Mol object at 0x00000266E70...
4543,4543,9255,-8.122710,O=C(c1ccc(Cl)cc1)N1CCCC[C@H]1CO,<rdkit.Chem.rdchem.Mol object at 0x00000266F68...
4678,4678,4358,-8.072930,CC(C)(C)c1nsc(NCC2(O)CCCC2)n1,<rdkit.Chem.rdchem.Mol object at 0x00000266F68...
5182,5182,1584,-7.886170,[NH3+]Cc1ccc(-c2cccc(C(=O)[O-])c2)cc1,<rdkit.Chem.rdchem.Mol object at 0x00000266F68...
...,...,...,...,...,...
9865,9865,11634,-2.339150,CN(C(C)(C)C(=O)[O-])S(=O)(=O)c1ccnn1C,<rdkit.Chem.rdchem.Mol object at 0x00000266F68...
9877,9877,8714,-2.049620,Cn1cc(S(=O)(=O)N2CC([NH3+])C2)cn1,<rdkit.Chem.rdchem.Mol object at 0x00000266F68...
9889,9889,5148,-1.058180,C[NH+]1CCN(c2cnc(C(=O)[O-])cn2)CC1,<rdkit.Chem.rdchem.Mol object at 0x00000266F68...
9893,9893,2451,-0.560302,O=C([O-])C(F)(F)F,<rdkit.Chem.rdchem.Mol object at 0x00000266F68...


In [210]:
class Featurizer:
    """
    A class to featurize molecules in a DataFrame.

    The DataFrame passed in is stored by reference (not copied), so `featurize`
    adds its `features` column to the caller's DataFrame as well.
    """
    def __init__(self, df: pd.DataFrame = None, mol_col: str = None) -> None:
        """
        Initializes the Featurizer with a DataFrame and the name of the column containing molecules.

        Args:
            df (pd.DataFrame): The DataFrame to featurize.
            mol_col (str): The name of the column containing molecules to featurize.
        """
        self.df = df
        self.mol_col = mol_col
        # Column used by inspect_features_by_smiles to look molecules up.
        self.smi_col = 'SMILES'
        # Only set by `load`; kept for backward compatibility.
        self.features = None

    def featurize(self, method: str, **kwargs) -> None:
        """
        Featurizes the molecules in the DataFrame using the specified method and
        stores the resulting vectors in a `features` column of `self.df`.

        Args:
            method (str): The featurization method to use. Supported methods are 'morgan' and 'topological'.
            **kwargs: Additional keyword arguments to pass to the featurization method.

        Raises:
            ValueError: If `method` is not supported, or if no DataFrame / molecule
                column has been set on this instance.
        """
        if method == 'morgan':
            make_fingerprint = AllChem.GetMorganFingerprintAsBitVect
        elif method == 'topological':
            make_fingerprint = FingerprintMols.FingerprintMol
        else:
            raise ValueError(f"Unsupported featurization method: {method}")

        if self.df is None or self.mol_col is None:
            raise ValueError("Featurizer needs a DataFrame and a molecule column before featurizing.")

        self.df['features'] = self.df[self.mol_col].apply(
            lambda mol: self._convert_to_np_array(make_fingerprint(mol, **kwargs))
        )

    def _convert_to_np_array(self, bit_vect) -> np.ndarray:
        """
        Converts an RDKit explicit bit vector to a numpy array.

        Args:
            bit_vect: The bit vector to convert.

        Returns:
            np.ndarray: An int8 array with one entry per bit of `bit_vect`.
        """
        np_array = np.zeros((bit_vect.GetNumBits(),), dtype=np.int8)
        DataStructs.ConvertToNumpyArray(bit_vect, np_array)
        return np_array

    def save(self, filename: str) -> None:
        """
        Saves the featurized DataFrame to a pickle file.

        Args:
            filename (str): The name of the file to save the featurized DataFrame to.
        """
        with open(filename, 'wb') as f:
            pickle.dump(self.df, f)

    @classmethod
    def load(cls, filename: str, mol_col: str = None) -> 'Featurizer':
        """
        Loads a featurized DataFrame from a pickle file written by `save`.

        Args:
            filename (str): The name of the file to load the featurized DataFrame from.
            mol_col (str): Optional name of the molecule column in the loaded
                DataFrame; only needed if `featurize` will be called again.

        Returns:
            Featurizer: A Featurizer instance whose `df` is the loaded DataFrame.
        """
        # SECURITY: pickle.load can execute arbitrary code from the file.
        # Only load pickles that you created yourself or otherwise trust.
        with open(filename, 'rb') as f:
            loaded = pickle.load(f)

        # Restore into `df` so the inspect_* methods work on a loaded instance.
        instance = cls(df=loaded, mol_col=mol_col)
        # `features` previously held the loaded object; keep it for existing callers.
        instance.features = loaded
        return instance

    def _has_features(self) -> bool:
        """Returns True when a DataFrame is set and it contains a `features` column."""
        return self.df is not None and 'features' in self.df.columns

    def inspect_features(self):
        """
        Returns the featurized molecules for inspection.

        Returns:
            The DataFrame including its `features` column, or None (after printing
            a hint) if no features have been computed yet.
        """
        if self._has_features():
            return self.df

        print("No features available. Please run the featurize method first.")
        return None

    def inspect_features_by_smiles(self, smiles: str) -> Optional[np.ndarray]:
        """
        Inspects the features for a specific molecule based on its SMILES representation.

        The lookup is an exact string comparison against the `SMILES` column; if
        several rows match, the first one is returned.

        Args:
            smiles (str): The SMILES string for the molecule to inspect.

        Returns:
            np.ndarray: The feature vector for the molecule, or None if the molecule
            is not found or no features have been computed yet.
        """
        if not self._has_features():
            print("No features available. Please run the featurize method first.")
            return None

        # .loc with a boolean mask avoids chained indexing and stays correct
        # even when index labels are duplicated.
        matches = self.df.loc[self.df[self.smi_col] == smiles, 'features']
        if matches.empty:
            print(f"No molecule with SMILES {smiles} found in the DataFrame.")
            return None

        return matches.iloc[0]

In [211]:
# The Featurizer stores loader.df by reference (no copy), so the `features`
# column added by featurize() will also appear on loader.df.
my_features = Featurizer(loader.df, 'molecules')

In [223]:
# radius and nBits are forwarded unchanged to AllChem.GetMorganFingerprintAsBitVect;
# nBits=64 gives the 64-element vectors shown further down.
my_features.featurize('morgan', radius=2, nBits=64)

In [229]:
# Returns the full DataFrame now that the `features` column exists.
my_features.inspect_features()

Unnamed: 0.1,Unnamed: 0,i_i_glide_lignum,scores,SMILES,molecules,features
0,0,13477,-13.126800,C[C@@H](NC(=O)N1C[C@H](c2ccccc2)[C@H]2COCC[C@H...,<rdkit.Chem.rdchem.Mol object at 0x00000266F67...,"[1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, ..."
1,1,516,-12.662900,O=C(Nc1cccc(C(=O)N2CCC(c3c[nH]c4ncccc34)CC2)c1...,<rdkit.Chem.rdchem.Mol object at 0x00000266F67...,"[1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, ..."
2,2,9425,-12.487000,Cc1nn(-c2ccccc2)c2nc(C(=O)N3CCC([C@H]4C(=O)Nc5...,<rdkit.Chem.rdchem.Mol object at 0x00000266F67...,"[1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, ..."
3,3,14512,-12.483500,Cc1cc(C)cc(C(=O)N2CCC[C@H](C(=O)NCc3cccc([C@@]...,<rdkit.Chem.rdchem.Mol object at 0x00000266F67...,"[1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, ..."
4,4,9615,-12.478500,CS(=O)(=O)c1ccc(F)c(C(=O)Nc2ccc(-c3nc(-c4ccccc...,<rdkit.Chem.rdchem.Mol object at 0x00000266F67...,"[1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, ..."
...,...,...,...,...,...,...
9893,9893,2451,-0.560302,O=C([O-])C(F)(F)F,<rdkit.Chem.rdchem.Mol object at 0x00000266EC4...,"[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, ..."
9894,9894,7813,-0.384887,Cc1cnn(C)c1S(=O)(=O)NC[C@@H](Cc1ccccn1)C(=O)[O-],<rdkit.Chem.rdchem.Mol object at 0x00000266EC4...,"[1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, ..."
9895,9895,11442,-0.269109,Cn1cccc(C(=O)N2CCN(C3CC3)c3ccc(Cl)cc32)c1=O,<rdkit.Chem.rdchem.Mol object at 0x00000266EC4...,"[1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, ..."
9896,9896,13971,-0.150473,O=C([O-])[C@H]1CC[C@@H](C(=O)N2CCCc3ccccc32)CC1,<rdkit.Chem.rdchem.Mol object at 0x00000266EC4...,"[1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, ..."


In [225]:
# Lookup is an exact string match against the SMILES column; returns that row's feature vector.
my_features.inspect_features_by_smiles("Cc1nc(NC(CO)CO)ccc1Br")

array([1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1,
       1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1],
      dtype=int8)