Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Bugfix: Multiple bugs from first round of feedback... #27

Merged
merged 6 commits into from Feb 18, 2021
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
22 changes: 22 additions & 0 deletions .dockerignore
@@ -1,3 +1,25 @@
./data
./cache
./cddd

.bash_history
.cache
.chembl_ws_client__0.10.2.sqlite
.cheminf_local_environment
.config
.cudf
.cupy
dask-worker-space
.flake8
.git
.gitignore
.ipynb_checkpoints
.ipython
.jupyter
.keras
.local
.npm
.nv
.pytest_cache
.python_history
.vscode
18 changes: 11 additions & 7 deletions Dockerfile
@@ -1,30 +1,34 @@
# Copyright 2020 NVIDIA Corporation
# SPDX-License-Identifier: Apache-2.0
FROM nvidia/cuda:11.0-base
RUN apt update && DEBIAN_FRONTEND=noninteractive apt-get install -y wget git

RUN apt update && DEBIAN_FRONTEND=noninteractive apt-get install -y wget git\
&& rm -rf /var/lib/apt/lists/*

SHELL ["/bin/bash", "-c"]
RUN wget --quiet -O /tmp/miniconda.sh \
https://repo.anaconda.com/miniconda/Miniconda3-py37_4.9.2-Linux-x86_64.sh \
&& /bin/bash /tmp/miniconda.sh -b -p /opt/conda \
&& rm /tmp/miniconda.sh \
&& ln -s /opt/conda/etc/profile.d/conda.sh /etc/profile.d/conda.sh

ENV PATH /opt/conda/bin:$PATH

# Copy conda env spec.
COPY setup/cuchem_rapids_0.17.yml /tmp

RUN conda env create --name cuchem -f /tmp/cuchem_rapids_0.17.yml
RUN conda env create --name cuchem -f /tmp/cuchem_rapids_0.17.yml \
&& rm /tmp/cuchem_rapids_0.17.yml\
&& conda clean -ay
ENV PATH /opt/conda/envs/cuchem/bin:$PATH
RUN conda clean -afy
RUN rm /tmp/cuchem_rapids_0.17.yml

RUN source activate cuchem && python3 -m ipykernel install --user --name=cuchem
rilango marked this conversation as resolved.
Show resolved Hide resolved
RUN echo "source activate cuchem" > /etc/bash.bashrc

COPY launch.sh /opt/nvidia/cheminfomatics/
COPY *.py /opt/nvidia/cheminfomatics/
COPY nbs/*.ipynb /opt/nvidia/cheminfomatics/
RUN mkdir -p /opt/nvidia/ \
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Perhaps this was already known, but the image still seems to be missing most of the library files:

# ls /opt/nvidia/cheminfomatics/
Dockerfile  LICENSE  README.md  chemvisualize.py  demo.ipynb  icla.pdf  launch.sh  screenshot.jpg  startdash.py

&& cd /opt/nvidia/ \
&& git clone https://github.com/NVIDIA/cheminformatics.git cheminfomatics \
&& rm -rf /opt/nvidia/cheminfomatics/.git

ENV UCX_LOG_LEVEL error

Expand Down
19 changes: 16 additions & 3 deletions nvidia/cheminformatics/data/__init__.py
Expand Up @@ -5,16 +5,29 @@ class ClusterWfDAO(object):
Base class for all DAO for fetching data for Clustering Workflows
"""

def meta_df(self):
    """
    Returns df with dtype set for structure without any column filter.

    This is the abstract declaration; concrete DAO classes must override it.

    Raises:
        NotImplementedError: always, when called on the base class.
    """
    # A bare `NotImplemented` expression statement does nothing and lets the
    # call fall through returning None. Raise instead so that a missing
    # override is reported where it is called.
    raise NotImplementedError

def fetch_molecular_embedding(self, n_molecules:int, cache_directory:str=None):
"""
Fetch molecular properties from database/cache into a dask array.
"""
pass
NotImplemented

def fetch_molecular_embedding_by_id(self, molecule_id: List):
    """
    Fetch molecular properties from the database for the given ids. The kind
    of id depends on the backend database. For the ChEMBL DB it should be
    molregno.

    This is the abstract declaration; concrete DAO classes must override it.

    Args:
        molecule_id: list of backend-specific molecule identifiers.

    Raises:
        NotImplementedError: always, when called on the base class.
    """
    # A bare `NotImplemented` expression statement does nothing and lets the
    # call fall through returning None. Raise instead so that a missing
    # override is reported where it is called.
    raise NotImplementedError

def fetch_new_molecules(self, new_molecules: List):
def fetch_id_from_smile(self, new_molecules: List):
"""
Fetch molecular details for a list of molecules. The values in the list
of molecules depends on database/service used. For e.g. it could be
ChemblId or molreg_id for Chemble database.
"""
pass
NotImplemented
49 changes: 44 additions & 5 deletions nvidia/cheminformatics/data/cluster_wf.py
@@ -1,7 +1,11 @@
from nvidia.cheminformatics.config import Context
from nvidia.cheminformatics.utils.singleton import Singleton
import os
import dask_cudf
import dask
import logging
import dask_cudf
import sqlite3
from contextlib import closing

from typing import List

Expand All @@ -13,9 +17,23 @@
FINGER_PRINT_FILES = 'filter_*.h5'


class ChemblClusterWfDao(ClusterWfDAO):
class ChemblClusterWfDao(ClusterWfDAO, metaclass=Singleton):

def __init__(self):
    """
    Resolve the location of the ChEMBL SQLite database.

    The data mount path is read from the application config under
    'data_mount_path' (falling back to '/data') and stored, as a SQLite URI
    carrying ``mode=ro``, in ``self.chembl_db``.
    """
    data_root = Context().get_config('data_mount_path', default='/data')

    # Stored as a URI so it can be handed to sqlite3.connect(..., uri=True).
    self.chembl_db = 'file:%s/db/chembl_27.db?mode=ro' % data_root
    logger.info('Reading ChEMBL database at %s...' % self.chembl_db)

def meta_df(self):
    """
    Return the metadata dataframe produced by ``ChEmblData._meta_df``.
    """
    return ChEmblData()._meta_df()

def fetch_molecular_embedding(self, n_molecules:int, cache_directory:str=None):
def fetch_molecular_embedding(self,
n_molecules:int,
cache_directory:str=None):
chem_data = ChEmblData()
if cache_directory:
hdf_path = os.path.join(cache_directory, FINGER_PRINT_FILES)
Expand All @@ -30,5 +48,26 @@ def fetch_molecular_embedding(self, n_molecules:int, cache_directory:str=None):

return mol_df

def fetch_new_molecules(self, new_molecules: List):
pass
def fetch_molecular_embedding_by_id(self, molecule_id:List):
    """
    Fetch the embedding rows for the given molregnos as a dask_cudf dataframe.

    The rows returned by ``ChEmblData._fetch_mol_embedding`` are cast to the
    dtypes of ``ChEmblData._meta_df()``, wrapped in a single-partition dask
    dataframe, converted to dask_cudf, and returned with the index reset.
    """
    source = ChEmblData()
    target_dtypes = source._meta_df().dtypes

    # presumably a pandas DataFrame, since it is passed to from_pandas below
    embedding_df = source._fetch_mol_embedding(molregnos=molecule_id)
    embedding_df = embedding_df.astype(target_dtypes)

    dask_df = dask.dataframe.from_pandas(embedding_df, npartitions=1)
    return dask_cudf.from_dask_dataframe(dask_df).reset_index()

def fetch_id_from_smile(self, new_molecules: List):
    """
    Look up the molregno of every molecule whose canonical SMILES is listed.

    Args:
        new_molecules: list of canonical SMILES strings to look up.

    Returns:
        The rows from ``cursor.fetchall()``: one 1-tuple ``(molregno,)`` per
        matching row in ``compound_structures``. An empty input yields an
        empty list without touching the database.
    """
    logger.debug('Fetch ChEMBL ID using molregno...')

    smiles = list(new_molecules)
    if not smiles:
        return []

    # Bind the SMILES strings as parameters instead of splicing them into
    # the SQL text: the values come from callers, and a quote character in
    # any of them would break the statement or allow SQL injection.
    placeholders = ','.join('?' * len(smiles))
    select_stmt = '''
        SELECT cs.molregno as molregno
        FROM compound_structures cs
        WHERE cs.canonical_smiles in (%s)
        ''' % placeholders

    with closing(sqlite3.connect(self.chembl_db, uri=True)) as con, con, \
            closing(con.cursor()) as cur:
        cur.execute(select_stmt, smiles)
        return cur.fetchall()
66 changes: 43 additions & 23 deletions nvidia/cheminformatics/data/helper/chembldata.py
Expand Up @@ -46,6 +46,8 @@
ADDITIONAL_FEILD_TYPE = [pandas.Series([], dtype='object'),
pandas.Series([], dtype='object')]


# DEPRECATED. Please add code to DAO classes.
class ChEmblData(object, metaclass=Singleton):

def __init__(self,
Expand Down Expand Up @@ -118,11 +120,21 @@ def fetch_molecule_cnt(self):

return cur.fetchone()[0]

@delayed
def _meta_df(self, **transformation_kwargs):
    """
    Build an empty pandas DataFrame describing the embedding columns.

    Column order is: 'id' (int64), then the names in
    IMP_PROPS + ADDITIONAL_FEILD paired with the empty series in
    IMP_PROPS_TYPE + ADDITIONAL_FEILD_TYPE, then one float32 column per
    integer in ``range(len(transformation))``, where ``transformation`` is
    ``self.fp_type(**transformation_kwargs)``.
    """
    n_features = len(self.fp_type(**transformation_kwargs))

    columns = {'id': pandas.Series([], dtype='int64')}

    prop_names = IMP_PROPS + ADDITIONAL_FEILD
    prop_series = IMP_PROPS_TYPE + ADDITIONAL_FEILD_TYPE
    for name, empty_series in zip(prop_names, prop_series):
        columns[name] = empty_series

    # Feature columns are keyed by their integer position.
    for position in range(n_features):
        columns[position] = pandas.Series([], dtype='float32')

    return pandas.DataFrame(columns)

def _fetch_mol_embedding(self,
start,
start=0,
batch_size=5000,
smiles_transforms=SMILES_TRANSFORMS,
molregnos=None,
**transformation_kwargs):
"""
Returns compound properties and structure for the first N number of
Expand All @@ -132,15 +144,29 @@ def _fetch_mol_embedding(self,
logger.info('Fetching %d records starting %d...' % (batch_size, start))

imp_cols = [ 'cp.' + col for col in IMP_PROPS]
select_stmt = '''
SELECT md.molregno, %s, cs.canonical_smiles
FROM compound_properties cp,
molecule_dictionary md,
compound_structures cs
WHERE cp.molregno = md.molregno
AND md.molregno = cs.molregno
LIMIT %d, %d
''' % (', '.join(imp_cols), start, batch_size)

if molregnos is None:
select_stmt = '''
SELECT md.molregno, %s, cs.canonical_smiles
FROM compound_properties cp,
molecule_dictionary md,
compound_structures cs
WHERE cp.molregno = md.molregno
AND md.molregno = cs.molregno
LIMIT %d, %d
''' % (', '.join(imp_cols), start, batch_size)
else:
select_stmt = '''
SELECT md.molregno, %s, cs.canonical_smiles
FROM compound_properties cp,
molecule_dictionary md,
compound_structures cs
WHERE cp.molregno = md.molregno
AND md.molregno = cs.molregno
AND md.molregno in (%s)
LIMIT %d, %d
''' % (', '.join(imp_cols), " ,".join(list(map(str, molregnos))), start, batch_size)

df = pandas.read_sql(select_stmt,
sqlite3.connect(self.chembl_db, uri=True))

Expand Down Expand Up @@ -168,6 +194,7 @@ def _fetch_mol_embedding(self,
def fetch_mol_embedding(self,
num_recs=None,
batch_size=5000,
molregnos=None,
**transformation_kwargs):
"""
Returns compound properties and structure for the first N number of
Expand All @@ -178,25 +205,18 @@ def fetch_mol_embedding(self,
if not num_recs or num_recs < 0:
num_recs = self.fetch_molecule_cnt()

transformation = self.fp_type(**transformation_kwargs)

prop_meta = {'id': pandas.Series([], dtype='int64')}
prop_meta.update(dict(zip(IMP_PROPS + ADDITIONAL_FEILD,
IMP_PROPS_TYPE + ADDITIONAL_FEILD_TYPE)))
prop_meta.update({i: pandas.Series([], dtype='float32') for i in range(len(transformation))})
meta_df = self._meta_df(**transformation_kwargs)

meta_df = pandas.DataFrame(prop_meta)
dls = []
for start in range(0, num_recs, batch_size):
bsize = min(num_recs - start, batch_size)
dls.append(self._fetch_mol_embedding(
start,
batch_size=bsize,
**transformation_kwargs))
dl_data = delayed(self._fetch_mol_embedding) \
(start=start, batch_size=bsize, molregnos=molregnos, **transformation_kwargs)
dls.append(dl_data)

return dataframe.from_delayed(dls, meta=meta_df)

def save_fingerprints(self, hdf_path='data/filter_*.h5', num_recs=None,):
def save_fingerprints(self, hdf_path='data/filter_*.h5', num_recs=None):
"""
Generates fingerprints for all ChEMBL ID's in the database
"""
Expand Down