Add a base dataset that can use multiprocess preprocessing. (#184)

Signed-off-by: cfujitsang <cfujitsang@nvidia.com>
NVIDIAGameWorks · Mar 24, 2020 · fa17f79 · fa17f79
1 parent 1530b35
commit fa17f79
Show file tree

Hide file tree

Showing 5 changed files with 271 additions and 45 deletions.
diff --git a/.flake8 b/.flake8
@@ -17,7 +17,11 @@ exclude = .git, tests/, build/,
           examples/renderers/NMR,
           examples/SuperResolution,
           kaolin/cuda,
-          kaolin/datasets,
+          kaolin/datasets/modelnet.py,
+          kaolin/datasets/scannet.py,
+          kaolin/datasets/shapenet.py,
+          kaolin/datasets/shrec.py,
+          kaolin/datasets/usdfile.py,
           kaolin/engine,
           kaolin/graphics/dib_renderer,
           kaolin/graphics/DIBRenderer.py,

diff --git a/kaolin/datasets/base.py b/kaolin/datasets/base.py
@@ -0,0 +1,156 @@
+# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from abc import abstractmethod
+from tqdm import tqdm
+
+import torch
+from torch.multiprocessing import Pool
+from torch.utils.data import Dataset
+
+from kaolin import helpers
+
+def _preprocess_task(args):
+    """Preprocess and cache a single dataset item (worker-side task).
+
+    Args:
+        args (tuple): ``(idx, get_data, get_attributes, cache_transform)`` packed
+            into one tuple, where ``get_attributes(idx)['name']`` is the cache id,
+            ``get_data(idx)`` returns the tuple of inputs to preprocess and
+            ``cache_transform`` is the caching callable exposing ``cached_ids``.
+    """
+    # Limit torch to one intra-op CPU thread in the calling process.
+    torch.set_num_threads(1)
+    index, fetch_data, fetch_attributes, cache = args
+    with torch.no_grad():
+        item_name = fetch_attributes(index)['name']
+        if item_name in cache.cached_ids:
+            # Already cached: do not even load the raw data.
+            return
+        cache(item_name, *fetch_data(index))
+
+class KaolinDatasetMeta(type):
+    """Metaclass appending the shared constructor-argument docs to class docstrings.
+
+    Every class created through this metaclass, except the one named
+    ``KaolinDataset``, gets an "Additional args" section appended to its
+    ``__doc__``.
+    """
+    def __new__(metacls, cls_name, base_cls, class_dict):
+        """Create the class, extending its docstring unless it is ``KaolinDataset``.
+
+        Args:
+            cls_name (str): name of the class being created.
+            base_cls (tuple): base classes of the class being created.
+            class_dict (dict): namespace of the class being created.
+        """
+        if cls_name != "KaolinDataset":
+            # A class written without a docstring has no '__doc__' entry in its
+            # namespace (or it may hold None): start from an empty string then,
+            # instead of failing on the concatenation.
+            class_dict['__doc__'] = (class_dict.get('__doc__') or '') + \
+                """Additional args:
+        preprocessing_params (dict): parameters for the preprocessing:
+            - 'cache_dir': path to the cached preprocessed data.
+            - 'num_workers': number of process used in parallel for preprocessing (default: number of cores)
+        preprocessing_transform (Callable): Called on the outputs of _get_data over the indices
+                                            from 0 to len(self) during the construction of the dataset,
+                                            the preprocessed outputs are then cached to 'cache_dir'.
+        transform (Callable): Called on the preprocessed data at __getitem__.
+        no_progress (bool): disable tqdm progress bar for preprocessing."""
+        return type.__new__(metacls, cls_name, base_cls, class_dict)
+
+class KaolinDataset(Dataset, metaclass=KaolinDatasetMeta):
+    """
+    Abstract base class for datasets with cached, optionally multiprocess, preprocessing.
+
+    A KaolinDataset child class must implement the following methods:
+       1) initialize:
+           Initialization function called at the beginning of the constructor,
+           with the constructor's positional and remaining keyword arguments.
+       2) _get_data:
+           Data getter that will be preprocessed => cached => transformed, takes an index as input.
+           When preprocessing is used its result is unpacked into the preprocessing
+           transform, so it must be a tuple.
+       3) _get_attributes:
+           Attributes getter that will be preprocess / transform independent.
+           When preprocessing is used its result must provide a 'name' entry,
+           which serves as the cache id of the item.
+       4) __len__:
+           Return the size of the dataset
+    """
+    def __init__(self, *args, preprocessing_transform=None, preprocessing_params: dict = None,
+                 transform=None, no_progress: bool = False, **kwargs):
+        """
+        Args:
+            positional and keyword arguments for initialize(*args, **kwargs) (see class and initialize documentation)
+            preprocessing_params (dict): parameters for the preprocessing:
+                - 'cache_dir': path to the cached preprocessed data (required when
+                  preprocessing_transform is given).
+                - 'num_workers': number of process used in parallel for preprocessing
+                  (default: number of cores); 0 runs the preprocessing in the current process.
+            preprocessing_transform (Callable): Called on the outputs of _get_data over the indices
+                                                from 0 to len(self) during the construction of the dataset,
+                                                the preprocessed outputs are then cached to 'cache_dir'.
+            transform (Callable): Called on the preprocessed data at __getitem__.
+            no_progress (bool): disable tqdm progress bar for preprocessing.
+        """
+        self.initialize(*args, **kwargs)
+        self.transform = transform
+        if preprocessing_transform is None:
+            self.cache_convert = None
+            return
+
+        # The caller's dict is only read, never modified.
+        if preprocessing_params is None:
+            preprocessing_params = {}
+        assert preprocessing_params.get('cache_dir') is not None, \
+            "preprocessing_params['cache_dir'] is required when preprocessing_transform is set"
+        self.cache_convert = helpers.Cache(
+            preprocessing_transform, preprocessing_params['cache_dir'],
+            cache_key=helpers._get_hash(repr(preprocessing_transform)))
+
+        desc = 'applying preprocessing'
+        num_workers = preprocessing_params.get('num_workers')
+        if num_workers == 0:
+            # In-process preprocessing.
+            with torch.no_grad():
+                for idx in tqdm(range(len(self)), desc=desc, disable=no_progress):
+                    name = self._get_attributes(idx)['name']
+                    if name not in self.cache_convert.cached_ids:
+                        data = self._get_data(idx)
+                        self.cache_convert(name, *data)
+        else:
+            tasks = [(idx, self._get_data, self._get_attributes, self.cache_convert)
+                     for idx in range(len(self))]
+            # The context manager shuts the workers down once every result has
+            # been consumed, and also when a task raises.
+            with Pool(num_workers) as pool:
+                results = pool.imap_unordered(_preprocess_task, tasks)
+                for _ in tqdm(results, total=len(tasks), desc=desc, disable=no_progress):
+                    pass
+
+    def __getitem__(self, index):
+        """Return the item at ``index`` as ``{'data': ..., 'attributes': ...}``.
+
+        'data' is read through the cache when a preprocessing transform was given,
+        otherwise it is the raw output of _get_data; ``transform`` is then applied
+        to it if set.
+        """
+        attributes = self._get_attributes(index)
+        data = (self.cache_convert(attributes['name']) if self.cache_convert is not None else
+                self._get_data(index))
+        if self.transform is not None:
+            data = self.transform(data)
+        return {'data': data, 'attributes': attributes}
+
+    # NOTE(review): KaolinDatasetMeta derives from `type`, not ABCMeta, so
+    # @abstractmethod does not appear to block instantiation here; a hook that
+    # is not overridden silently runs the empty stub below. Confirm this is intended.
+    @abstractmethod
+    def initialize(self, *args, **kwargs):
+        """Set up the dataset state from the constructor arguments."""
+        pass
+
+    @abstractmethod
+    def _get_attributes(self, index):
+        """Return the attributes of the item at ``index``."""
+        pass
+
+    @abstractmethod
+    def _get_data(self, index):
+        """Return the raw (not yet preprocessed) data of the item at ``index``."""
+        pass
+
+    @abstractmethod
+    def __len__(self):
+        """Return the number of items in the dataset."""
+        pass
+
+class CombinationDataset(KaolinDataset):
+    """Dataset combining a list of datasets into a unified dataset object.
+    Useful when multiple output representations are needed from a common base representation
+    (Eg. when a mesh is to be served as both a pointcloud and a voxelgrid, etc.)
+    The output of _get_attributes is a tuple of all the _get_attributes of the dataset list.
+    The output of _get_data is a tuple of all the 'data' of the __getitem__ of the dataset list.
+
+    Args:
+        datasets: list or tuple of KaolinDataset
+    """
+    def initialize(self, datasets):
+        """Store the datasets after checking that they all have the same length.
+
+        Args:
+            datasets: non-empty list or tuple of KaolinDataset.
+
+        Raises:
+            ValueError: if ``datasets`` is empty.
+        """
+        if len(datasets) == 0:
+            raise ValueError("CombinationDataset needs at least one dataset.")
+        self.len = len(datasets[0])
+        for i, d in enumerate(datasets):
+            assert len(d) == self.len, \
+                f"All datasets must have the same length. Invalid length at index {i} (expected: {self.len}, got: {len(d)})"
+        self.datasets = datasets
+
+    # Former name of the hook; the base constructor calls `initialize`, the
+    # alias keeps direct calls to the old name working.
+    _initialize = initialize
+
+    def __len__(self):
+        """Return the common length of the combined datasets."""
+        return self.len
+
+    # NOTE(review): attributes are a tuple here, while the cached-preprocessing
+    # path of KaolinDataset looks up attributes['name']; preprocessing_transform
+    # therefore looks unusable with this class as written -- confirm.
+    def _get_attributes(self, index):
+        """Return a tuple with the attributes of every dataset at ``index``."""
+        # A real tuple (not a generator): it can be indexed, re-read and pickled.
+        return tuple(d._get_attributes(index) for d in self.datasets)
+
+    def _get_data(self, index):
+        """Return a tuple with the 'data' entry of every dataset's item at ``index``."""
+        return tuple(d[index]['data'] for d in self.datasets)
diff --git a/kaolin/datasets/shapenet.py b/kaolin/datasets/shapenet.py
@@ -43,6 +43,7 @@
 from kaolin import helpers
 import kaolin.conversions.meshconversions as mesh_cvt
 
+from .base import KaolinDataset
 
 # Synset to Label mapping (for ShapeNet core classes)
 synset_to_label = {'04379243': 'table', '03211117': 'monitor', '04401088': 'phone',
@@ -73,37 +74,6 @@ def __exit__(self, *args):
         self.logger("\t[done]\n")
 
 
-def tqdm_hook(t, timeout=1):
-    """Taken from https://github.com/tqdm/tqdm/blob/master/examples/tqdm_wget.py
-
-    Wraps tqdm instance.
-    Don't forget to close() or __exit__()
-    the tqdm instance once you're done with it (easiest using `with` syntax).
-    Example
-    -------
-    >>> with tqdm(...) as t:
-    ...     reporthook = my_hook(t)
-    ...     urllib.request.urlretrieve(..., reporthook=reporthook)
-    """
-    last_b = [0]
-
-    def update_to(b=1, bsize=1, tsize=None):
-        """
-        b  : int, optional
-            Number of blocks transferred so far [default: 1].
-        bsize  : int, optional
-            Size of each block (in tqdm units) [default: 1].
-        tsize  : int, optional
-            Total size (in tqdm units). If [default: None] remains unchanged.
-        """
-        if tsize is not None:
-            t.total = tsize
-        t.update((b - last_b[0]) * bsize)
-        last_b[0] = b
-
-    return update_to
-
-
 def _convert_categories(categories):
     assert categories is not None, 'List of categories cannot be empty!'
     if not (c in synset_to_label.keys() + label_to_synset.keys()
@@ -117,24 +87,21 @@ def _convert_categories(categories):
 
 class ShapeNet_Meshes(data.Dataset):
     r"""ShapeNet Dataset class for meshes.
-
     Args:
         root (str): Path to the root directory of the ShapeNet dataset.
         categories (str): List of categories to load from ShapeNet. This list may
                 contain synset ids, class label names (for ShapeNetCore classes),
                 or a combination of both.
-        train (bool): return the training set else the test set
-        split (float): amount of dataset that is training out of 1
+        train (bool): If True, return the training set, otherwise the test set
+        split (float): fraction of the dataset to be used for training (>=0 and <=1)
         no_progress (bool): if True, disables progress bar
-
     Returns:
         .. code-block::
 
-        dict: {
-            attributes: {name: str, path: str, synset: str, label: str},
-            data: {vertices: torch.Tensor, faces: torch.Tensor}
-        }
-
+           dict: {
+               attributes: {name: str, path: str, synset: str, label: str},
+               data: {vertices: torch.Tensor, faces: torch.Tensor}
+           }
     Example:
         >>> meshes = ShapeNet_Meshes(root='../data/ShapeNet/')
         >>> obj = next(iter(meshes))
@@ -143,7 +110,6 @@ class ShapeNet_Meshes(data.Dataset):
         >>> obj['data']['faces'].shape
         torch.Size([1910, 3])
     """
-
     def __init__(self, root: str, categories: list = ['chair'], train: bool = True,
                  split: float = .7, no_progress: bool = False):
         self.root = Path(root)
@@ -193,6 +159,86 @@ def __getitem__(self, index):
         return {'data': data, 'attributes': attributes}
 
 
+class ShapeNet(KaolinDataset):
+    r"""ShapeNetV1 Dataset class for meshes.
+    Args:
+        root (str): path to ShapeNet root directory
+        categories (list): List of categories to load from ShapeNet. This list may
+                           contain synset ids, class label names (for ShapeNetCore classes),
+                           or a combination of both.
+        train (bool): If True, return the training set, otherwise the test set
+        split (float): fraction of the dataset to be used for training (>=0 and <=1)
+    Returns:
+        .. code-block::
+
+           dict: {
+                attributes: {name: str, path: pathlib.Path, synset: str, label: str},
+                data: (TriangleMesh,)  # without preprocessing_transform / transform
+           }
+    Example:
+        >>> meshes = ShapeNet(root='../data/ShapeNet/', categories=['chair'])
+        >>> obj = meshes[0]
+        >>> obj['data'][0].vertices.shape
+        torch.Size([2133, 3])
+        >>> obj['data'][0].faces.shape
+        torch.Size([1910, 3])
+    """
+
+    def initialize(self, root: str, categories: list, train: bool = True, split: float = .7):
+        """Initialize the dataset
+        Args:
+            root (str): path to ShapeNet root directory
+            categories (list): List of categories to load from ShapeNet. This list may
+                               contain synset ids, class label names (for ShapeNetCore classes),
+                               or a combination of both.
+            train (bool): If True, return the training set, otherwise the test set
+            split (float): fraction of the dataset to be used for training (>=0 and <=1)
+        Raises:
+            ValueError: if the directory of a requested class does not exist under root."""
+        self.root = Path(root)
+        self.paths = []
+        self.synset_idxs = []
+        self.synsets = _convert_categories(categories)
+        self.labels = [synset_to_label[s] for s in self.synsets]
+
+        # loops through desired classes
+        for i, (syn, label) in enumerate(zip(self.synsets, self.labels)):
+            class_target = self.root / syn
+            if not class_target.exists():
+                raise ValueError('Class {0} ({1}) was not found at location {2}.'.format(
+                    syn, label, str(class_target)))
+
+            # find all objects in the class; sorting keeps the train / test
+            # partition identical from one run to the next
+            models = sorted(class_target.glob('*'))
+            stop = int(len(models) * split)
+            models = models[:stop] if train else models[stop:]
+            self.paths += models
+            # remember, for every model, the index of its class in self.synsets
+            self.synset_idxs += [i] * len(models)
+
+        self.names = [p.name for p in self.paths]
+
+    def __len__(self):
+        """Returns the length of the dataset. """
+        return len(self.paths)
+
+    def _get_data(self, index):
+        """Load the mesh at ``index`` from its 'model.obj' file, as a 1-tuple."""
+        obj_location = self.paths[index] / 'model.obj'
+        mesh = TriangleMesh.from_obj(str(obj_location))
+        return (mesh,)
+
+    def _get_attributes(self, index):
+        """Return name, obj path, synset id and label of the model at ``index``."""
+        synset_idx = self.synset_idxs[index]
+        attributes = {
+            'name': self.names[index],
+            'path': self.paths[index] / 'model.obj',
+            'synset': self.synsets[synset_idx],
+            'label': self.labels[synset_idx]
+        }
+        return attributes
+
 class ShapeNet_Images(data.Dataset):
     r"""ShapeNet Dataset class for images.
 

diff --git a/kaolin/helpers.py b/kaolin/helpers.py
@@ -21,7 +21,6 @@
 from typing import Callable
 import numpy as np
 
-
 def _composedecorator(*decs):
     """Returns a composition of several decorators.
 
@@ -193,7 +192,7 @@ def __init__(self, func: Callable, cache_dir: [str, Path], cache_key: str):
         self.cache_dir.mkdir(parents=True, exist_ok=True)
         self.cached_ids = [p.stem for p in self.cache_dir.glob('*')]
 
-    def __call__(self, unique_id: str, **kwargs):
+    def __call__(self, unique_id: str, *args, **kwargs):
         """Execute self.func if not cached, otherwise, read data from disk.
 
             Args:
@@ -207,7 +206,7 @@ def __call__(self, unique_id: str, **kwargs):
         fpath = self.cache_dir / f'{unique_id}.p'
 
         if not fpath.exists():
-            output = self.func(**kwargs)
+            output = self.func(*args, **kwargs)
             self._write(output, fpath)
             self.cached_ids.append(unique_id)
         else:

diff --git a/kaolin/rep/Mesh.py b/kaolin/rep/Mesh.py
@@ -967,3 +967,24 @@ def compute_interior_angles_per_edge(self):
 
     def compute_dihedral_angles_per_edge(self):
         raise NotImplementedError
+
+    def __getstate__(self):
+        """Return the state to pickle: vertices, faces and any non-None optional field.
+
+        'uvs', 'face_textures' and 'textures' are included only when they are
+        not None.
+        """
+        state = {'vertices': self.vertices,
+                 'faces': self.faces}
+        for key in ('uvs', 'face_textures', 'textures'):
+            value = getattr(self, key)
+            if value is not None:
+                state[key] = value
+        return state
+
+    def __setstate__(self, args):
+        """Restore the mesh from a state dict as produced by ``__getstate__``.
+
+        Args:
+            args (dict): must contain 'vertices' and 'faces'; may contain 'uvs',
+                'face_textures' and 'textures'.
+        """
+        self.vertices = args['vertices']
+        self.faces = args['faces']
+        # Unpickling normally bypasses __init__, so an optional field missing
+        # from the state must still be created (as None); otherwise a later
+        # `self.uvs is not None` check, e.g. in __getstate__, raises AttributeError.
+        for key in ('uvs', 'face_textures', 'textures'):
+            setattr(self, key, args.get(key))
+        # NOTE(review): no other instance attribute is restored here -- confirm
+        # that none is needed on an unpickled mesh.