In [1]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import Descriptors

In [2]:
class MoleculeLoader:
    """
    Lazily loads a CSV of molecules with docking scores and exposes chainable filters.

    The file is read the first time the data is needed, i.e. on the first access
    to the `df` property or the first call to a `filter_by_*` method. After
    processing, the DataFrame holds one molecule object per row in a 'molecules'
    column and the docking scores in a 'scores' column.

    Filters narrow the held DataFrame cumulatively (there is no way to undo a
    filter short of creating a new loader) and return the loader itself so that
    calls can be chained.
    """
    def __init__(self, datafile: str, smi_col: str, scores_col: str) -> None:
        """
        Initializes the MoleculeLoader with a data file. Nothing is read yet.

        Args:
            - datafile (str): The name of the CSV file to load (passed to `pd.read_csv`).
            - smi_col (str): The name of the column containing SMILES strings in the `DataFrame`.
            - scores_col (str): The name of the column containing the docking scores in the `DataFrame`.
        """
        self.datafile = datafile
        self.smi_col = smi_col
        self.scores_col = scores_col
        self._df = None  # populated lazily by the `df` property

    def _load_data(self) -> pd.DataFrame:
        """
        Loads data from the data file into a DataFrame and stores it on the loader.

        This method is called automatically when accessing the `df` property
        if the data has not been loaded yet.

        Returns:
            - pd.DataFrame: The loaded (not yet processed) data.
        """
        self._df = pd.read_csv(self.datafile)
        return self._df

    def _process_data(self) -> None:
        """
        Processes the loaded DataFrame in place.

        If the 'molecules' column does not exist in the DataFrame, it is created
        by applying `Chem.MolFromSmiles` to the column named by `smi_col`.
        SMILES strings that RDKit cannot parse yield `None` in that column.

        The column named by `scores_col` is then moved to a column called
        'scores' (placed last when 'scores' did not exist before).
        """
        if 'molecules' not in self._df.columns:
            self._df['molecules'] = self._df[self.smi_col].apply(Chem.MolFromSmiles)

        # When the score column is already called 'scores' there is nothing to
        # move; copying and then dropping `scores_col` would delete the data.
        if self.scores_col != 'scores':
            self._df['scores'] = self._df.pop(self.scores_col)

    @property
    def df(self):
        """
        Provides access to the loaded and processed DataFrame.

        If the data has not been loaded yet, it is loaded and processed
        automatically the first time this property is accessed.

        Returns:
            - pd.DataFrame: The loaded DataFrame, reflecting any filters applied so far.
        """
        if self._df is None:
            self._load_data()
            self._process_data()
        return self._df

    def _filter_molecules(self, keep) -> "MoleculeLoader":
        """
        Keeps only the rows whose molecule satisfies `keep`.

        Rows whose 'molecules' entry is `None` (unparseable SMILES) never
        satisfy the predicate and are therefore removed.

        Args:
            - keep (callable): Takes a molecule object and returns a truthy value to keep the row.

        Returns:
            - MoleculeLoader: This loader, to allow chaining.
        """
        frame = self.df  # triggers the lazy load if nothing has been read yet
        mask = frame['molecules'].apply(
            lambda mol: mol is not None and bool(keep(mol))
        ).astype(bool)  # an empty frame yields an object Series; force a boolean mask
        self._df = frame[mask]
        return self

    def filter_by_mol_wt(self, MolWt: float):
        """
        Keeps molecules whose molecular weight is strictly below `MolWt`.

        The weight is computed with `Descriptors.MolWt`. The data is loaded
        first if necessary.

        Args:
            - MolWt (float): Exclusive upper bound on the molecular weight.

        Returns:
            - MoleculeLoader: This loader, to allow chaining.
        """
        return self._filter_molecules(lambda mol: Descriptors.MolWt(mol) < MolWt)

    def filter_by_num_atoms(self, max_atoms: int):
        """
        Keeps molecules with at most `max_atoms` atoms.

        The count is whatever `mol.GetNumAtoms()` reports. The data is loaded
        first if necessary.

        Args:
            - max_atoms (int): Inclusive upper bound on the atom count.

        Returns:
            - MoleculeLoader: This loader, to allow chaining.
        """
        return self._filter_molecules(lambda mol: mol.GetNumAtoms() <= max_atoms)

In [3]:
# Describe the dataset by keyword; nothing is read from disk until the data is first needed.
loader = MoleculeLoader(
    datafile="10K.csv",
    smi_col='SMILES',
    scores_col='r_i_docking_score',
)

In [4]:
# First access to `.df` reads the CSV, builds the 'molecules' column and moves the scores to 'scores'.
loader.df

Unnamed: 0.1,Unnamed: 0,i_i_glide_lignum,SMILES,molecules,scores
0,0,13477,C[C@@H](NC(=O)N1C[C@H](c2ccccc2)[C@H]2COCC[C@H...,<rdkit.Chem.rdchem.Mol object at 0x00000250BA6...,-13.126800
1,1,516,O=C(Nc1cccc(C(=O)N2CCC(c3c[nH]c4ncccc34)CC2)c1...,<rdkit.Chem.rdchem.Mol object at 0x00000250BA6...,-12.662900
2,2,9425,Cc1nn(-c2ccccc2)c2nc(C(=O)N3CCC([C@H]4C(=O)Nc5...,<rdkit.Chem.rdchem.Mol object at 0x00000250BA6...,-12.487000
3,3,14512,Cc1cc(C)cc(C(=O)N2CCC[C@H](C(=O)NCc3cccc([C@@]...,<rdkit.Chem.rdchem.Mol object at 0x00000250BA6...,-12.483500
4,4,9615,CS(=O)(=O)c1ccc(F)c(C(=O)Nc2ccc(-c3nc(-c4ccccc...,<rdkit.Chem.rdchem.Mol object at 0x00000250BA6...,-12.478500
...,...,...,...,...,...
9893,9893,2451,O=C([O-])C(F)(F)F,<rdkit.Chem.rdchem.Mol object at 0x00000250C54...,-0.560302
9894,9894,7813,Cc1cnn(C)c1S(=O)(=O)NC[C@@H](Cc1ccccn1)C(=O)[O-],<rdkit.Chem.rdchem.Mol object at 0x00000250C54...,-0.384887
9895,9895,11442,Cn1cccc(C(=O)N2CCN(C3CC3)c3ccc(Cl)cc32)c1=O,<rdkit.Chem.rdchem.Mol object at 0x00000250C54...,-0.269109
9896,9896,13971,O=C([O-])[C@H]1CC[C@@H](C(=O)N2CCCc3ccccc32)CC1,<rdkit.Chem.rdchem.Mol object at 0x00000250C54...,-0.150473


In [5]:
# Keep molecules with molecular weight strictly below 300; this narrows the loader's frame and returns the loader.
loader.filter_by_mol_wt(300)

<__main__.MoleculeLoader at 0x250ba43baf0>

In [6]:
# Further keep only molecules with at most 15 atoms (as counted by mol.GetNumAtoms()); applied on top of the previous filter.
loader.filter_by_num_atoms(15)

<__main__.MoleculeLoader at 0x250ba43baf0>

In [7]:
# Show what is left after both filters (MolWt < 300, then at most 15 atoms).
loader.df

Unnamed: 0.1,Unnamed: 0,i_i_glide_lignum,SMILES,molecules,scores
7134,7134,8866,Cc1nc(NC(CO)CO)ccc1Br,<rdkit.Chem.rdchem.Mol object at 0x00000250BA5...,-7.02347
7162,7162,11230,[NH3+]CCc1c[nH]c2cc(Br)ccc12,<rdkit.Chem.rdchem.Mol object at 0x00000250BA5...,-7.01087
7764,7764,6645,Brc1c[nH]c(-c2nnc3n2CCCC3)c1,<rdkit.Chem.rdchem.Mol object at 0x00000250BA5...,-6.65524
7927,7927,1044,Cn1c(CO)nnc1[C@H]1CCC[NH2+]C1,<rdkit.Chem.rdchem.Mol object at 0x00000250BA5...,-6.52726
7968,7968,8297,CNC(=O)c1n[nH]c2ccc(Br)cc12,<rdkit.Chem.rdchem.Mol object at 0x00000250BA5...,-6.49883
8009,8009,18990,[NH3+][C@@H]1CCCN(C(=O)C2CCC2)C1,<rdkit.Chem.rdchem.Mol object at 0x00000250BA5...,-6.4763
8029,8029,1043,Cn1c(CO)nnc1[C@@H]1CCC[NH2+]C1,<rdkit.Chem.rdchem.Mol object at 0x00000250BA5...,-6.45889
8407,8407,18837,Cn1cnc2c1CC[NH2+]C21CCOCC1,<rdkit.Chem.rdchem.Mol object at 0x00000250BA5...,-6.19413
8428,8428,3079,CC(C)([NH3+])c1nc(-c2ccc(Br)o2)no1,<rdkit.Chem.rdchem.Mol object at 0x00000250BA5...,-6.17567
8566,8566,19574,C[C@@H]1C(=O)NCCN1c1ccc(Br)cn1,<rdkit.Chem.rdchem.Mol object at 0x00000250BA5...,-6.05745


In [8]:
# A second, independent loader over the same file, used below to demonstrate chained filters.
loader2 = MoleculeLoader(
    datafile="10K.csv",
    smi_col='SMILES',
    scores_col='r_i_docking_score',
)

In [9]:
# Accessing `.df` loads the data; the filter methods raise ValueError if this has not happened yet.
loader2.df

Unnamed: 0.1,Unnamed: 0,i_i_glide_lignum,SMILES,molecules,scores
0,0,13477,C[C@@H](NC(=O)N1C[C@H](c2ccccc2)[C@H]2COCC[C@H...,<rdkit.Chem.rdchem.Mol object at 0x00000250C56...,-13.126800
1,1,516,O=C(Nc1cccc(C(=O)N2CCC(c3c[nH]c4ncccc34)CC2)c1...,<rdkit.Chem.rdchem.Mol object at 0x00000250C56...,-12.662900
2,2,9425,Cc1nn(-c2ccccc2)c2nc(C(=O)N3CCC([C@H]4C(=O)Nc5...,<rdkit.Chem.rdchem.Mol object at 0x00000250C56...,-12.487000
3,3,14512,Cc1cc(C)cc(C(=O)N2CCC[C@H](C(=O)NCc3cccc([C@@]...,<rdkit.Chem.rdchem.Mol object at 0x00000250C56...,-12.483500
4,4,9615,CS(=O)(=O)c1ccc(F)c(C(=O)Nc2ccc(-c3nc(-c4ccccc...,<rdkit.Chem.rdchem.Mol object at 0x00000250C56...,-12.478500
...,...,...,...,...,...
9893,9893,2451,O=C([O-])C(F)(F)F,<rdkit.Chem.rdchem.Mol object at 0x00000250C55...,-0.560302
9894,9894,7813,Cc1cnn(C)c1S(=O)(=O)NC[C@@H](Cc1ccccn1)C(=O)[O-],<rdkit.Chem.rdchem.Mol object at 0x00000250C55...,-0.384887
9895,9895,11442,Cn1cccc(C(=O)N2CCN(C3CC3)c3ccc(Cl)cc32)c1=O,<rdkit.Chem.rdchem.Mol object at 0x00000250C55...,-0.269109
9896,9896,13971,O=C([O-])[C@H]1CC[C@@H](C(=O)N2CCCc3ccccc32)CC1,<rdkit.Chem.rdchem.Mol object at 0x00000250C55...,-0.150473


In [10]:
# Each filter returns the loader, so calls chain: at most 17 atoms, then molecular weight strictly below 300.
loader2.filter_by_num_atoms(17).filter_by_mol_wt(300)

<__main__.MoleculeLoader at 0x250ba43bcd0>

In [11]:
# Show the frame after the chained filters were applied.
loader2.df

Unnamed: 0.1,Unnamed: 0,i_i_glide_lignum,SMILES,molecules,scores
3888,3888,2813,O=C(NS(=O)(=O)[C@@H]1CCOC1)[C@H]1C[C@@H]1C1CC1,<rdkit.Chem.rdchem.Mol object at 0x00000250C55...,-8.358960
3974,3974,9256,O=S(=O)(C[C@@H]1CCCO1)Nc1ccccc1O,<rdkit.Chem.rdchem.Mol object at 0x00000250C55...,-8.329660
4543,4543,9255,O=C(c1ccc(Cl)cc1)N1CCCC[C@H]1CO,<rdkit.Chem.rdchem.Mol object at 0x00000250C55...,-8.122710
4678,4678,4358,CC(C)(C)c1nsc(NCC2(O)CCCC2)n1,<rdkit.Chem.rdchem.Mol object at 0x00000250C55...,-8.072930
5182,5182,1584,[NH3+]Cc1ccc(-c2cccc(C(=O)[O-])c2)cc1,<rdkit.Chem.rdchem.Mol object at 0x00000250C55...,-7.886170
...,...,...,...,...,...
9865,9865,11634,CN(C(C)(C)C(=O)[O-])S(=O)(=O)c1ccnn1C,<rdkit.Chem.rdchem.Mol object at 0x00000250C55...,-2.339150
9877,9877,8714,Cn1cc(S(=O)(=O)N2CC([NH3+])C2)cn1,<rdkit.Chem.rdchem.Mol object at 0x00000250C55...,-2.049620
9889,9889,5148,C[NH+]1CCN(c2cnc(C(=O)[O-])cn2)CC1,<rdkit.Chem.rdchem.Mol object at 0x00000250C55...,-1.058180
9893,9893,2451,O=C([O-])C(F)(F)F,<rdkit.Chem.rdchem.Mol object at 0x00000250C55...,-0.560302
