NVIDIA · rilango · Feb 18, 2021 · Feb 13, 2021 · Feb 16, 2021 · Feb 17, 2021
diff --git a/.dockerignore b/.dockerignore
@@ -1,3 +1,25 @@
 ./data
 ./cache
 ./cddd
+
+.bash_history
+.cache
+.chembl_ws_client__0.10.2.sqlite
+.cheminf_local_environment
+.config
+.cudf
+.cupy
+dask-worker-space
+.flake8
+.git
+.gitignore
+.ipynb_checkpoints
+.ipython
+.jupyter
+.keras
+.local
+.npm
+.nv
+.pytest_cache
+.python_history
+.vscode
diff --git a/Dockerfile b/Dockerfile
@@ -1,30 +1,34 @@
 # Copyright 2020 NVIDIA Corporation
 # SPDX-License-Identifier: Apache-2.0
 FROM nvidia/cuda:11.0-base
-RUN apt update && DEBIAN_FRONTEND=noninteractive apt-get install -y wget git
+
+RUN apt update && DEBIAN_FRONTEND=noninteractive apt-get install -y wget git\
+    && rm -rf /var/lib/apt/lists/*
 
 SHELL ["/bin/bash", "-c"]
 RUN  wget --quiet -O /tmp/miniconda.sh \
     https://repo.anaconda.com/miniconda/Miniconda3-py37_4.9.2-Linux-x86_64.sh \
     && /bin/bash /tmp/miniconda.sh -b -p /opt/conda \
     && rm /tmp/miniconda.sh \
     && ln -s /opt/conda/etc/profile.d/conda.sh /etc/profile.d/conda.sh
+
 ENV PATH /opt/conda/bin:$PATH
 
 # Copy conda env spec.
 COPY setup/cuchem_rapids_0.17.yml /tmp
 
-RUN conda env create --name cuchem -f /tmp/cuchem_rapids_0.17.yml
+RUN conda env create --name cuchem -f /tmp/cuchem_rapids_0.17.yml \
+    && rm /tmp/cuchem_rapids_0.17.yml\
+    && conda clean -ay
 ENV PATH /opt/conda/envs/cuchem/bin:$PATH
-RUN conda clean -afy
-RUN rm /tmp/cuchem_rapids_0.17.yml
 
 RUN source activate cuchem && python3 -m ipykernel install --user --name=cuchem
 RUN echo "source activate cuchem" > /etc/bash.bashrc
 
-COPY launch.sh /opt/nvidia/cheminfomatics/
-COPY *.py /opt/nvidia/cheminfomatics/
-COPY nbs/*.ipynb /opt/nvidia/cheminfomatics/
+RUN mkdir -p /opt/nvidia/ \
+    && cd /opt/nvidia/ \
+    && git clone https://github.com/NVIDIA/cheminformatics.git cheminfomatics \
+    && rm -rf /opt/nvidia/cheminfomatics/.git
 
 ENV UCX_LOG_LEVEL error
 

diff --git a/nvidia/cheminformatics/data/__init__.py b/nvidia/cheminformatics/data/__init__.py
@@ -5,16 +5,29 @@ class ClusterWfDAO(object):
     Base class for all DAO for fetching data for Clustering Workflows
     """
 
+    def meta_df(self):
+        """
+        Return a df with dtypes set and no column filter; subclasses override.
+        """
+        raise NotImplementedError
+
     def fetch_molecular_embedding(self, n_molecules:int, cache_directory:str=None):
         """
         Fetch molecular properties from database/cache into a dask array.
         """
-        pass
+        raise NotImplementedError
+
+    def fetch_molecular_embedding_by_id(self, molecule_id:List):
+        """
+        Fetch molecular properties from database for the given ids. The id
+        depends on the backend database; for the ChEMBL DB it is molregno.
+        """
+        raise NotImplementedError
 
-    def fetch_new_molecules(self, new_molecules: List):
+    def fetch_id_from_smile(self, new_molecules: List):
         """
         Fetch molecular details for a list of molecules. The values in the list
         of molecules depends on database/service used. For e.g. it could be
         ChemblId or molreg_id for Chemble database.
         """
-        pass
+        raise NotImplementedError
diff --git a/nvidia/cheminformatics/data/cluster_wf.py b/nvidia/cheminformatics/data/cluster_wf.py
@@ -1,7 +1,11 @@
+from nvidia.cheminformatics.config import Context
+from nvidia.cheminformatics.utils.singleton import Singleton
 import os
+import dask_cudf
 import dask
 import logging
-import dask_cudf
+import sqlite3
+from contextlib import closing
 
 from typing import List
 
@@ -13,9 +17,23 @@
 FINGER_PRINT_FILES = 'filter_*.h5'
 
 
-class ChemblClusterWfDao(ClusterWfDAO):
+class ChemblClusterWfDao(ClusterWfDAO, metaclass=Singleton):
+
+    def __init__(self):
+        """Build the read-only SQLite URI for the ChEMBL DB under the data mount."""
+        # 'data_mount_path' is read from Context; '/data' is passed as default.
+        mount = Context().get_config('data_mount_path', default='/data')
+        uri = 'file:%s/db/chembl_27.db?mode=ro' % mount
+        self.chembl_db = uri
+        logger.info('Reading ChEMBL database at %s...' % self.chembl_db)
+
+    def meta_df(self):
+        """Return the dataframe produced by ChEmblData._meta_df()."""
+        return ChEmblData()._meta_df()
 
-    def fetch_molecular_embedding(self, n_molecules:int, cache_directory:str=None):
+    def fetch_molecular_embedding(self,
+                                  n_molecules:int,
+                                  cache_directory:str=None):
         chem_data = ChEmblData()
         if cache_directory:
             hdf_path = os.path.join(cache_directory, FINGER_PRINT_FILES)
@@ -30,5 +48,26 @@ def fetch_molecular_embedding(self, n_molecules:int, cache_directory:str=None):
 
         return mol_df
 
-    def fetch_new_molecules(self, new_molecules: List):
-        pass
+    def fetch_molecular_embedding_by_id(self, molecule_id:List):
+        """Fetch embeddings for the given molregnos as a one-partition dask_cudf df."""
+        # NOTE(review): _fetch_mol_embedding keeps its default batch_size=5000
+        # LIMIT here, so longer id lists look truncated -- confirm.
+        helper = ChEmblData()
+        dtypes = helper._meta_df().dtypes
+        pdf = helper._fetch_mol_embedding(molregnos=molecule_id).astype(dtypes)
+        ddf = dask.dataframe.from_pandas(pdf, npartitions=1)
+        return dask_cudf.from_dask_dataframe(ddf).reset_index()
+
+    def fetch_id_from_smile(self, new_molecules: List):
+        """Return a list of (molregno,) rows, one per compound_structures row
+        whose canonical_smiles is in new_molecules (a list of SMILES str)."""
+        logger.debug('Fetch ChEMBL ID using molregno...')
+        # Bind SMILES as '?' parameters: hand-quoting broke on "'" and allowed
+        # SQL injection. NOTE: SQLite caps bound variables per statement.
+        select_stmt = '''
+            SELECT cs.molregno as molregno
+            FROM compound_structures cs
+            WHERE cs.canonical_smiles in (%s)''' % ','.join('?' * len(new_molecules))
+        with closing(sqlite3.connect(self.chembl_db, uri=True)) as con, con, \
+                closing(con.cursor()) as cur:
+            return cur.execute(select_stmt, list(new_molecules)).fetchall()
diff --git a/nvidia/cheminformatics/data/helper/chembldata.py b/nvidia/cheminformatics/data/helper/chembldata.py
@@ -46,6 +46,8 @@
 ADDITIONAL_FEILD_TYPE = [pandas.Series([], dtype='object'),
                          pandas.Series([], dtype='object')]
 
+
+# DEPRECATED. Please add code to DAO classes.
 class ChEmblData(object, metaclass=Singleton):
 
     def __init__(self,
@@ -118,11 +120,21 @@ def fetch_molecule_cnt(self):
 
             return cur.fetchone()[0]
 
-    @delayed
+    def _meta_df(self, **transformation_kwargs):
+        """Return a pandas DataFrame template of column names and dtypes."""
+        width = len(self.fp_type(**transformation_kwargs))
+        columns = {'id': pandas.Series([], dtype='int64')}
+        # Property columns first, then one float32 column per index < width.
+        columns.update(zip(IMP_PROPS + ADDITIONAL_FEILD,
+                           IMP_PROPS_TYPE + ADDITIONAL_FEILD_TYPE))
+        columns.update((i, pandas.Series([], dtype='float32')) for i in range(width))
+        return pandas.DataFrame(columns)
+
     def _fetch_mol_embedding(self,
-                             start,
+                             start=0,
                              batch_size=5000,
                              smiles_transforms=SMILES_TRANSFORMS,
+                             molregnos=None,
                              **transformation_kwargs):
         """
         Returns compound properties and structure for the first N number of
@@ -132,15 +144,29 @@ def _fetch_mol_embedding(self,
         logger.info('Fetching %d records starting %d...' % (batch_size, start))
 
         imp_cols = [ 'cp.' + col for col in IMP_PROPS]
-        select_stmt = '''
-            SELECT md.molregno, %s, cs.canonical_smiles
-            FROM compound_properties cp,
-                 molecule_dictionary md,
-                 compound_structures cs
-            WHERE cp.molregno = md.molregno
-                  AND md.molregno = cs.molregno
-            LIMIT %d, %d
-        ''' % (', '.join(imp_cols), start, batch_size)
+
+        if molregnos is None:
+            select_stmt = '''
+                SELECT md.molregno, %s, cs.canonical_smiles
+                FROM compound_properties cp,
+                    molecule_dictionary md,
+                    compound_structures cs
+                WHERE cp.molregno = md.molregno
+                    AND md.molregno = cs.molregno
+                LIMIT %d, %d
+            ''' % (', '.join(imp_cols), start, batch_size)
+        else:
+            select_stmt = '''
+                SELECT md.molregno, %s, cs.canonical_smiles
+                FROM compound_properties cp,
+                    molecule_dictionary md,
+                    compound_structures cs
+                WHERE cp.molregno = md.molregno
+                    AND md.molregno = cs.molregno
+                    AND md.molregno in (%s)
+                LIMIT %d, %d
+            ''' % (', '.join(imp_cols), " ,".join(list(map(str, molregnos))), start, batch_size)
+
         df = pandas.read_sql(select_stmt,
                             sqlite3.connect(self.chembl_db, uri=True))
 
@@ -168,6 +194,7 @@ def _fetch_mol_embedding(self,
     def fetch_mol_embedding(self,
                             num_recs=None,
                             batch_size=5000,
+                            molregnos=None,
                             **transformation_kwargs):
         """
         Returns compound properties and structure for the first N number of
@@ -178,25 +205,18 @@ def fetch_mol_embedding(self,
         if not num_recs or num_recs < 0:
             num_recs = self.fetch_molecule_cnt()
 
-        transformation = self.fp_type(**transformation_kwargs)
-
-        prop_meta = {'id': pandas.Series([], dtype='int64')}
-        prop_meta.update(dict(zip(IMP_PROPS + ADDITIONAL_FEILD,
-                              IMP_PROPS_TYPE + ADDITIONAL_FEILD_TYPE)))
-        prop_meta.update({i: pandas.Series([], dtype='float32') for i in range(len(transformation))})
+        meta_df = self._meta_df(**transformation_kwargs)
 
-        meta_df = pandas.DataFrame(prop_meta)
         dls = []
         for start in range(0, num_recs, batch_size):
             bsize = min(num_recs - start, batch_size)
-            dls.append(self._fetch_mol_embedding(
-                    start,
-                    batch_size=bsize,
-                    **transformation_kwargs))
+            dl_data = delayed(self._fetch_mol_embedding) \
+                             (start=start, batch_size=bsize, molregnos=molregnos, **transformation_kwargs)
+            dls.append(dl_data)
 
         return dataframe.from_delayed(dls, meta=meta_df)
 
-    def save_fingerprints(self, hdf_path='data/filter_*.h5', num_recs=None,):
+    def save_fingerprints(self, hdf_path='data/filter_*.h5', num_recs=None):
         """
         Generates fingerprints for all ChEMBL ID's in the database
         """