

Add a base dataset that can use multiprocessed preprocessing. (#184)
Signed-off-by: cfujitsang <cfujitsang@nvidia.com>
Caenorst committed Mar 24, 2020
1 parent 1530b35 commit fa17f79
Showing 5 changed files with 271 additions and 45 deletions.
6 changes: 5 additions & 1 deletion .flake8
@@ -17,7 +17,11 @@ exclude = .git, tests/, build/,
examples/renderers/NMR,
examples/SuperResolution,
kaolin/cuda,
kaolin/datasets,
kaolin/datasets/modelnet.py,
kaolin/datasets/scannet.py,
kaolin/datasets/shapenet.py,
kaolin/datasets/shrec.py,
kaolin/datasets/usdfile.py,
kaolin/engine,
kaolin/graphics/dib_renderer,
kaolin/graphics/DIBRenderer.py,
156 changes: 156 additions & 0 deletions kaolin/datasets/base.py
@@ -0,0 +1,156 @@
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from abc import abstractmethod
from tqdm import tqdm

import torch
from torch.multiprocessing import Pool
from torch.utils.data import Dataset

from kaolin import helpers

def _preprocess_task(args):
    torch.set_num_threads(1)
    with torch.no_grad():
        idx, get_data, get_attributes, cache_transform = args
        name = get_attributes(idx)['name']
        if name not in cache_transform.cached_ids:
            data = get_data(idx)
            cache_transform(name, *data)

class KaolinDatasetMeta(type):
    def __new__(metacls, cls_name, base_cls, class_dict):
        if cls_name != "KaolinDataset":
            class_dict['__doc__'] += \
                """Additional args:
    preprocessing_params (dict): parameters for the preprocessing:
        - 'cache_dir': path to the cached preprocessed data.
        - 'num_workers': number of processes used in parallel for preprocessing (default: number of cores).
    preprocessing_transform (Callable): Called on the outputs of _get_data over the indices
        from 0 to len(self) during the construction of the dataset;
        the preprocessed outputs are then cached to 'cache_dir'.
    transform (Callable): Called on the preprocessed data at __getitem__.
    no_progress (bool): disable the tqdm progress bar for preprocessing."""
        return type.__new__(metacls, cls_name, base_cls, class_dict)

class KaolinDataset(Dataset, metaclass=KaolinDatasetMeta):
    """
    Abstract class for datasets that handle multiprocess or CUDA preprocessing.
    A KaolinDataset child class needs to implement the following:
        1) initialize:
            Initialization function called at the beginning of the constructor.
        2) _get_data:
            Data getter taking an index as input; its output is preprocessed => cached => transformed.
        3) _get_attributes:
            Attributes getter that is independent of the preprocessing and transform.
        4) __len__:
            Returns the size of the dataset.
    """
    def __init__(self, *args, preprocessing_transform=None, preprocessing_params: dict = None,
                 transform=None, no_progress: bool = False, **kwargs):
        """
        Args:
            *args, **kwargs: positional and keyword arguments passed to initialize (see the class and initialize documentation).
            preprocessing_params (dict): parameters for the preprocessing:
                - 'cache_dir': path to the cached preprocessed data.
                - 'num_workers': number of processes used in parallel for preprocessing (default: number of cores).
            preprocessing_transform (Callable): Called on the outputs of _get_data over the indices
                from 0 to len(self) during the construction of the dataset;
                the preprocessed outputs are then cached to 'cache_dir'.
            transform (Callable): Called on the preprocessed data at __getitem__.
            no_progress (bool): disable the tqdm progress bar for preprocessing.
        """
        self.initialize(*args, **kwargs)
        if preprocessing_transform is not None:
            desc = 'applying preprocessing'
            if preprocessing_params is None:
                preprocessing_params = {}
            assert preprocessing_params.get('cache_dir') is not None
            self.cache_convert = helpers.Cache(
                preprocessing_transform, preprocessing_params['cache_dir'],
                cache_key=helpers._get_hash(repr(preprocessing_transform)))
            if preprocessing_params.get('use_cuda') is None:
                preprocessing_params['use_cuda'] = False
            num_workers = preprocessing_params.get('num_workers')
            if num_workers == 0:
                with torch.no_grad():
                    for idx in tqdm(range(len(self)), desc=desc, disable=no_progress):
                        name = self._get_attributes(idx)['name']
                        if name not in self.cache_convert.cached_ids:
                            data = self._get_data(idx)
                            self.cache_convert(name, *data)
            else:
                p = Pool(num_workers)
                iterator = p.imap_unordered(
                    _preprocess_task,
                    [(idx, self._get_data, self._get_attributes, self.cache_convert)
                     for idx in range(len(self))])
                for i in tqdm(range(len(self)), desc=desc, disable=no_progress):
                    next(iterator)
        else:
            self.cache_convert = None
        self.transform = transform

    def __getitem__(self, index):
        """Returns the item at the given index."""
        attributes = self._get_attributes(index)
        data = (self.cache_convert(attributes['name']) if self.cache_convert is not None else
                self._get_data(index))
        if self.transform is not None:
            data = self.transform(data)
        return {'data': data, 'attributes': attributes}

    @abstractmethod
    def initialize(self, *args, **kwargs):
        pass

    @abstractmethod
    def _get_attributes(self, index):
        pass

    @abstractmethod
    def _get_data(self, index):
        pass

    @abstractmethod
    def __len__(self):
        pass

class CombinationDataset(KaolinDataset):
    """Dataset combining a list of datasets into a unified dataset object.
    Useful when multiple output representations are needed from a common base representation,
    e.g. when a mesh is to be served as both a pointcloud and a voxelgrid.
    The output of _get_attributes is a tuple of the _get_attributes of every dataset in the list;
    the output of _get_data is a tuple of the 'data' fields of the __getitem__ of every dataset in the list.
    Args:
        datasets: list or tuple of KaolinDataset
    """
    def initialize(self, datasets):
        self.len = len(datasets[0])
        for i, d in enumerate(datasets):
            assert len(d) == self.len, \
                f"All datasets must have the same length. Invalid length at index {i} (expected: {self.len}, got: {len(d)})"
        self.datasets = datasets

    def __len__(self):
        return self.len

    def _get_attributes(self, index):
        return tuple(d._get_attributes(index) for d in self.datasets)

    def _get_data(self, index):
        return tuple(d[index]['data'] for d in self.datasets)
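For orientation, here is a minimal usage sketch of the new base class (not part of this commit; ToyPointclouds, normalize_pointcloud, and the cache path are hypothetical names chosen for illustration). It shows the hooks a subclass implements and how the construction-time preprocessing cache is keyed by the 'name' attribute.

```python
# Hypothetical sketch, not part of the commit: a toy KaolinDataset subclass.
import torch

from kaolin.datasets.base import CombinationDataset, KaolinDataset


def normalize_pointcloud(points):
    # Toy preprocessing step; its output is what gets cached to 'cache_dir'
    # and later returned by __getitem__ (before the optional `transform`).
    return points - points.mean(dim=0)


class ToyPointclouds(KaolinDataset):
    """Serves random pointclouds (illustration only)."""

    def initialize(self, num_samples=8, num_points=1024):
        self.num_samples = num_samples
        self.num_points = num_points

    def __len__(self):
        return self.num_samples

    def _get_data(self, index):
        # Returns a tuple; its elements are unpacked into preprocessing_transform.
        torch.manual_seed(index)
        return (torch.rand(self.num_points, 3),)

    def _get_attributes(self, index):
        # 'name' is required: it becomes the on-disk cache key.
        return {'name': f'sample_{index}'}


if __name__ == '__main__':
    dataset = ToyPointclouds(
        num_samples=8,
        preprocessing_transform=normalize_pointcloud,
        preprocessing_params={'cache_dir': '/tmp/kaolin_cache', 'num_workers': 0})
    item = dataset[0]
    print(item['attributes']['name'], item['data'].shape)

    # CombinationDataset zips datasets of equal length;
    # combined[i]['data'] is a tuple with one entry per wrapped dataset.
    combined = CombinationDataset((dataset, dataset))
    print(len(combined), len(combined[0]['data']))
```

When 'num_workers' is omitted or positive, the same preprocessing runs through _preprocess_task in a torch.multiprocessing Pool, so everything returned by _get_data has to be picklable; that is what the Mesh.__getstate__/__setstate__ additions further down provide for meshes.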
128 changes: 87 additions & 41 deletions kaolin/datasets/shapenet.py
@@ -43,6 +43,7 @@
from kaolin import helpers
import kaolin.conversions.meshconversions as mesh_cvt

from .base import KaolinDataset

# Synset to Label mapping (for ShapeNet core classes)
synset_to_label = {'04379243': 'table', '03211117': 'monitor', '04401088': 'phone',
@@ -73,37 +74,6 @@ def __exit__(self, *args):
        self.logger("\t[done]\n")


def tqdm_hook(t, timeout=1):
    """Taken from https://github.com/tqdm/tqdm/blob/master/examples/tqdm_wget.py
    Wraps tqdm instance.
    Don't forget to close() or __exit__()
    the tqdm instance once you're done with it (easiest using `with` syntax).
    Example
    -------
    >>> with tqdm(...) as t:
    ...     reporthook = my_hook(t)
    ...     urllib.request.urlretrieve(..., reporthook=reporthook)
    """
    last_b = [0]

    def update_to(b=1, bsize=1, tsize=None):
        """
        b : int, optional
            Number of blocks transferred so far [default: 1].
        bsize : int, optional
            Size of each block (in tqdm units) [default: 1].
        tsize : int, optional
            Total size (in tqdm units). If [default: None] remains unchanged.
        """
        if tsize is not None:
            t.total = tsize
        t.update((b - last_b[0]) * bsize)
        last_b[0] = b

    return update_to


def _convert_categories(categories):
    assert categories is not None, 'List of categories cannot be empty!'
    if not (c in synset_to_label.keys() + label_to_synset.keys()
@@ -117,24 +87,21 @@ def _convert_categories(categories):

class ShapeNet_Meshes(data.Dataset):
    r"""ShapeNet Dataset class for meshes.
    Args:
        root (str): Path to the root directory of the ShapeNet dataset.
        categories (str): List of categories to load from ShapeNet. This list may
            contain synset ids, class label names (for ShapeNetCore classes),
            or a combination of both.
        train (bool): return the training set else the test set
        split (float): amount of dataset that is training out of 1
        train (bool): If True, return the training set, otherwise the test set
        split (float): fraction of the dataset to be used for training (>=0 and <=1)
        no_progress (bool): if True, disables progress bar
    Returns:
        .. code-block::
            dict: {
                attributes: {name: str, path: str, synset: str, label: str},
                data: {vertices: torch.Tensor, faces: torch.Tensor}
            }
            dict: {
                attributes: {name: str, path: str, synset: str, label: str},
                data: {vertices: torch.Tensor, faces: torch.Tensor}
            }
    Example:
        >>> meshes = ShapeNet_Meshes(root='../data/ShapeNet/')
        >>> obj = next(iter(meshes))
@@ -143,7 +110,6 @@ class ShapeNet_Meshes(data.Dataset):
        >>> obj['data']['faces'].shape
        torch.Size([1910, 3])
    """

    def __init__(self, root: str, categories: list = ['chair'], train: bool = True,
                 split: float = .7, no_progress: bool = False):
        self.root = Path(root)
@@ -193,6 +159,86 @@ def __getitem__(self, index):
        return {'data': data, 'attributes': attributes}


class ShapeNet(KaolinDataset):
    r"""ShapeNetV1 Dataset class for meshes.
    Args:
        root (str): path to ShapeNet root directory
        categories (list): List of categories to load from ShapeNet. This list may
            contain synset ids, class label names (for ShapeNetCore classes),
            or a combination of both.
        train (bool): If True, return the training set, otherwise the test set
        split (float): fraction of the dataset to be used for training (>=0 and <=1)
    Returns:
        .. code-block::
            dict: {
                attributes: {name: str, path: str, synset: str, label: str},
                data: {vertices: torch.Tensor, faces: torch.Tensor}
            }
    Example:
        >>> meshes = ShapeNet(root='../data/ShapeNet/')
        >>> obj = meshes[0]
        >>> obj['data'][0].vertices.shape
        torch.Size([2133, 3])
        >>> obj['data'][0].faces.shape
        torch.Size([1910, 3])
    """

    def initialize(self, root: str, categories: list, train: bool = True, split: float = .7):
        """Initialize the dataset
        Args:
            root (str): path to ShapeNet root directory
            categories (list): List of categories to load from ShapeNet. This list may
                contain synset ids, class label names (for ShapeNetCore classes),
                or a combination of both.
            train (bool): If True, return the training set, otherwise the test set
            split (float): fraction of the dataset to be used for training (>=0 and <=1)"""
        self.root = Path(root)
        self.paths = []
        self.synset_idxs = []
        self.synsets = _convert_categories(categories)
        self.labels = [synset_to_label[s] for s in self.synsets]

        # loops through desired classes
        for i in range(len(self.synsets)):
            syn = self.synsets[i]
            class_target = self.root / syn
            if not class_target.exists():
                raise ValueError('Class {0} ({1}) was not found at location {2}.'.format(
                    syn, self.labels[i], str(class_target)))

            # find all objects in the class
            models = sorted(class_target.glob('*'))
            stop = int(len(models) * split)
            if train:
                models = models[:stop]
            else:
                models = models[stop:]
            self.paths += models
            self.synset_idxs += [i] * len(models)

        self.names = [p.name for p in self.paths]

    def __len__(self):
        """Returns the length of the dataset. """
        return len(self.paths)

    def _get_data(self, index):
        synset_idx = self.synset_idxs[index]
        obj_location = self.paths[index] / 'model.obj'
        mesh = TriangleMesh.from_obj(str(obj_location))
        return (mesh,)

    def _get_attributes(self, index):
        synset_idx = self.synset_idxs[index]
        attributes = {
            'name': self.names[index],
            'path': self.paths[index] / 'model.obj',
            'synset': self.synsets[synset_idx],
            'label': self.labels[synset_idx]
        }
        return attributes

class ShapeNet_Images(data.Dataset):
r"""ShapeNet Dataset class for images.
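For reference, a short usage sketch of the new ShapeNet class (illustrative only: the root path, the cache directory, and the count_vertices helper are assumptions, not part of the commit):

```python
# Illustrative usage of the new ShapeNet class with cached preprocessing.
from kaolin.datasets.shapenet import ShapeNet


def count_vertices(mesh):
    # Toy preprocessing_transform; whatever it returns is cached to disk and
    # served as sample['data'] by __getitem__.
    return mesh.vertices.shape[0]


dataset = ShapeNet(
    root='../data/ShapeNet/',  # assumed local copy of ShapeNetCore v1
    categories=['chair'],
    train=True,
    split=.7,
    preprocessing_transform=count_vertices,
    preprocessing_params={'cache_dir': '/tmp/shapenet_cache', 'num_workers': 0})

sample = dataset[0]
print(sample['attributes']['synset'], sample['attributes']['label'], sample['data'])
```

Without a preprocessing_transform, sample['data'] is simply the tuple returned by _get_data, i.e. (mesh,).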
5 changes: 2 additions & 3 deletions kaolin/helpers.py
@@ -21,7 +21,6 @@
from typing import Callable
import numpy as np


def _composedecorator(*decs):
"""Returns a composition of several decorators.
@@ -193,7 +192,7 @@ def __init__(self, func: Callable, cache_dir: [str, Path], cache_key: str):
        self.cache_dir.mkdir(parents=True, exist_ok=True)
        self.cached_ids = [p.stem for p in self.cache_dir.glob('*')]

    def __call__(self, unique_id: str, **kwargs):
    def __call__(self, unique_id: str, *args, **kwargs):
        """Execute self.func if not cached, otherwise, read data from disk.
        Args:
@@ -207,7 +206,7 @@ def __call__(self, unique_id: str, **kwargs):
        fpath = self.cache_dir / f'{unique_id}.p'

        if not fpath.exists():
            output = self.func(**kwargs)
            output = self.func(*args, **kwargs)
            self._write(output, fpath)
            self.cached_ids.append(unique_id)
        else:
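A small sketch of what the *args change enables (the scale function and paths are made up; it relies only on the behavior visible above: a cache miss forwards the positional arguments to the wrapped function, a cache hit reads the stored result back):

```python
# Illustrative sketch of the new positional-argument support in helpers.Cache.
import torch

from kaolin import helpers


def scale(points, factor=2.0):
    return points * factor


cache = helpers.Cache(scale, cache_dir='/tmp/cache_demo', cache_key='scale_v1')

points = torch.rand(16, 3)
cache('sample_0', points)    # miss: scale(points) is computed and written to sample_0.p
cached = cache('sample_0')   # hit: the stored tensor is read back from disk
assert torch.equal(cached, scale(points))
```

This is exactly how KaolinDataset forwards the tuple returned by _get_data: self.cache_convert(name, *data).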
21 changes: 21 additions & 0 deletions kaolin/rep/Mesh.py
@@ -967,3 +967,24 @@ def compute_interior_angles_per_edge(self):

    def compute_dihedral_angles_per_edge(self):
        raise NotImplementedError

    def __getstate__(self):
        outputs = {'vertices': self.vertices,
                   'faces': self.faces}
        if self.uvs is not None:
            outputs['uvs'] = self.uvs
        if self.face_textures is not None:
            outputs['face_textures'] = self.face_textures
        if self.textures is not None:
            outputs['textures'] = self.textures
        return outputs

    def __setstate__(self, args):
        self.vertices = args['vertices']
        self.faces = args['faces']
        if 'uvs' in args:
            self.uvs = args['uvs']
        if 'face_textures' in args:
            self.face_textures = args['face_textures']
        if 'textures' in args:
            self.textures = args['textures']
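These two hooks make meshes picklable, which is what lets TriangleMesh objects travel to and from the torch.multiprocessing workers used for preprocessing. A minimal round-trip sketch (it assumes TriangleMesh.from_tensors for construction, which is not part of this diff):

```python
# Sketch: __getstate__/__setstate__ let a mesh survive a pickle round trip.
# TriangleMesh.from_tensors is assumed here; only vertices/faces (plus any
# uvs/face_textures/textures) are carried across, as defined above.
import pickle

import torch
from kaolin.rep import TriangleMesh

vertices = torch.rand(4, 3)
faces = torch.tensor([[0, 1, 2], [0, 2, 3]])
mesh = TriangleMesh.from_tensors(vertices, faces)

restored = pickle.loads(pickle.dumps(mesh))
assert torch.equal(restored.vertices, mesh.vertices)
assert torch.equal(restored.faces, mesh.faces)
```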

