Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Pytorch wrapper #58

Merged
merged 63 commits into from Aug 2, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
63 commits
Select commit Hold shift + click to select a range
b0c6c4d
Create initial version of wrappers for models (#1)
abigailgold Feb 10, 2022
9f4d649
add generic dataset components (#7)
ron-shm Feb 22, 2022
fd9f134
using dataset wrapper on anonymizer
olasaadi Mar 1, 2022
3d82db8
Remove redundant code.
abigailgold Mar 6, 2022
f2df2fc
Renaming
abigailgold Mar 7, 2022
300e391
fix bug and update test_model
olasaadi Mar 10, 2022
c0dbb20
add dataset wrapper to docstring
olasaadi Mar 10, 2022
1280b84
using dataset wrapper on anonymizer
olasaadi Mar 1, 2022
eefad20
fix bug and update test_model
olasaadi Mar 10, 2022
6afb175
add dataset wrapper to docstring
olasaadi Mar 10, 2022
b4eddab
apply changes after rebase with wrappers
olasaadi Mar 10, 2022
c5d3be2
Merge remote-tracking branch 'origin/dataset_wrapper_anonimizer' into…
olasaadi Mar 10, 2022
a2b5609
apply changes
olasaadi Mar 15, 2022
45cc918
Add more to wrappers
abigailgold Mar 15, 2022
f99bf31
add pytorch Dataset
ron-shm Mar 15, 2022
a9162fb
Add more to wrappers
abigailgold Mar 15, 2022
a432b8f
add pytorch Dataset
ron-shm Mar 15, 2022
7b788b9
using dataset wrapper on anonymizer
olasaadi Mar 1, 2022
3263f92
anonymizer works with numpy and return numpy/pandas as original dataset
olasaadi Mar 19, 2022
8aa7bb8
categorical features and QI passed by indexes
olasaadi Mar 21, 2022
5b34760
fix
olasaadi Mar 21, 2022
3124692
fix docstring and fix assert in test
olasaadi Mar 22, 2022
06158c8
update
olasaadi Mar 23, 2022
137167f
update notebooks
olasaadi Mar 23, 2022
66c86dc
fix notebook and add features_names to ArrayDataset
olasaadi Mar 24, 2022
b54f0a2
fix tests
olasaadi Mar 24, 2022
3bc1341
update docstring
olasaadi Mar 27, 2022
6172385
update docstring
olasaadi Mar 27, 2022
31e278f
fix bug
olasaadi Mar 27, 2022
5f6a258
Merge branch 'wrappers' into dataset_wrapper_anonimizer
olasaadi Mar 28, 2022
8290be0
Merge pull request #15 from HRLDataSecurityAndPrivacy/dataset_wrapper…
olasaadi Mar 28, 2022
6b04fd5
Remove failing assert
abigailgold Apr 5, 2022
ac5d82a
Wrapper minimizer (#20)
olasaadi Apr 18, 2022
fb2413c
Fix boolean property return types
abigailgold Apr 19, 2022
a37ff06
Squashed commit of the following:
abigailgold Apr 25, 2022
b8d2535
Fix handling of categorical features
abigailgold Apr 25, 2022
f484135
add classes and implement some functions
olasaadi May 8, 2022
521c8ce
fix
olasaadi May 17, 2022
7539ca0
save checkpoints
olasaadi May 19, 2022
e0385b0
score
olasaadi May 19, 2022
019f498
fix
olasaadi May 23, 2022
59d8b16
fix
olasaadi May 23, 2022
8459d69
fix
olasaadi May 23, 2022
023f876
update
olasaadi May 30, 2022
8de77f9
update
olasaadi May 30, 2022
a3fb68f
update
olasaadi May 30, 2022
302d0c4
update
olasaadi Jun 2, 2022
c954f53
fix
olasaadi Jun 6, 2022
21cba95
fix
olasaadi Jun 6, 2022
af7d615
fix
olasaadi Jul 4, 2022
07e64b1
fix
olasaadi Jul 4, 2022
4973fbe
fix
olasaadi Jul 19, 2022
3bf26b6
fix
olasaadi Jul 20, 2022
6f69f55
fix bug
olasaadi Jul 20, 2022
c2c7a01
fix bug
olasaadi Jul 20, 2022
65388da
fix docstring
olasaadi Jul 20, 2022
fdc6005
add validation set
olasaadi Jul 21, 2022
c77e34e
update pytorch wrapper to use torch loaders
ron-shm Jul 24, 2022
15d7008
remove self from array2numpy and array2torch_tensor functions
ron-shm Jul 24, 2022
521a2cc
add art to requirements.txt
ron-shm Jul 24, 2022
74ce92a
fix
olasaadi Jul 26, 2022
dc5cc79
Merge with main
abigailgold Aug 1, 2022
64038f7
Merge with main
abigailgold Aug 1, 2022
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
116 changes: 57 additions & 59 deletions apt/utils/datasets/datasets.py
Expand Up @@ -19,9 +19,42 @@
logger = logging.getLogger(__name__)


# Array-like types accepted as input throughout the datasets API.
# NOTE: pd.Series is kept in the Union because the conversion helpers
# (array2numpy / array2torch_tensor) and ArrayDataset explicitly handle it.
INPUT_DATA_ARRAY_TYPE = Union[np.ndarray, pd.DataFrame, pd.Series, List, Tensor]
# All getters normalize their output to a plain numpy array.
OUTPUT_DATA_ARRAY_TYPE = np.ndarray
# Input types that may round-trip back to their original container.
DATA_PANDAS_NUMPY_TYPE = Union[np.ndarray, pd.DataFrame, pd.Series]


def array2numpy(arr: INPUT_DATA_ARRAY_TYPE) -> OUTPUT_DATA_ARRAY_TYPE:
    """
    Convert ``arr`` to a numpy array.

    :param arr: numpy array, pandas DataFrame/Series, list or pytorch Tensor
    :return: the data as a numpy array (the same object when already a
             numpy array)
    :raises ValueError: if ``arr`` is of an unsupported type
    """
    # isinstance instead of `type(x) ==` so subclasses are accepted too.
    if isinstance(arr, np.ndarray):
        return arr
    if isinstance(arr, (pd.DataFrame, pd.Series)):
        return arr.to_numpy()
    if isinstance(arr, list):
        return np.array(arr)
    if isinstance(arr, Tensor):
        # Detach from the autograd graph and move to CPU before converting.
        return arr.detach().cpu().numpy()

    # Single formatted message (the original passed a tuple to ValueError).
    raise ValueError(f"Non supported type: {type(arr).__name__}")


def array2torch_tensor(arr: INPUT_DATA_ARRAY_TYPE) -> Tensor:
    """
    Convert ``arr`` to a pytorch Tensor.

    :param arr: numpy array, pandas DataFrame/Series, list or pytorch Tensor
    :return: the data as a pytorch Tensor (the same object when already a
             Tensor)
    :raises ValueError: if ``arr`` is of an unsupported type
    """
    # isinstance instead of `type(x) ==` so subclasses are accepted too.
    if isinstance(arr, Tensor):
        return arr
    if isinstance(arr, np.ndarray):
        # from_numpy shares memory with the source array (no copy).
        return torch.from_numpy(arr)
    if isinstance(arr, (pd.DataFrame, pd.Series)):
        return torch.from_numpy(arr.to_numpy())
    if isinstance(arr, list):
        return torch.tensor(arr)

    # Single formatted message (the original passed a tuple to ValueError).
    raise ValueError(f"Non supported type: {type(arr).__name__}")


class Dataset(metaclass=ABCMeta):
Expand Down Expand Up @@ -58,46 +91,6 @@ def get_predictions(self) -> OUTPUT_DATA_ARRAY_TYPE:
"""
raise NotImplementedError

def _array2numpy(self, arr: INPUT_DATA_ARRAY_TYPE) -> OUTPUT_DATA_ARRAY_TYPE:
    """
    Converts from INPUT_DATA_ARRAY_TYPE to numpy array

    Side effect: sets ``self.is_pandas`` to True when ``arr`` is a pandas
    DataFrame or Series, so the dataset can later report/restore pandas form.

    :param arr: the array to transform
    :type arr: numpy array or pandas DataFrame or list or pytorch Tensor
    :return: the array transformed into a numpy array
    :raises ValueError: if ``arr`` is none of the supported types
    """
    if type(arr) == np.ndarray:
        return arr
    if type(arr) == pd.DataFrame or type(arr) == pd.Series:
        # Remember the pandas origin before losing it in the conversion.
        self.is_pandas = True
        return arr.to_numpy()
    if isinstance(arr, list):
        return np.array(arr)
    if type(arr) == Tensor:
        # Detach from the autograd graph and move to CPU before converting.
        return arr.detach().cpu().numpy()

    raise ValueError('Non supported type: ', type(arr).__name__)

def _array2torch_tensor(self, arr: INPUT_DATA_ARRAY_TYPE) -> Tensor:
    """
    Converts from INPUT_DATA_ARRAY_TYPE to torch tensor array

    Side effect: sets ``self.is_pandas`` to True when ``arr`` is a pandas
    DataFrame or Series, so the dataset can later report/restore pandas form.

    :param arr: the array to transform
    :type arr: numpy array or pandas DataFrame or list or pytorch Tensor
    :return: the array transformed into a pytorch Tensor
    :raises ValueError: if ``arr`` is none of the supported types
    """
    if type(arr) == np.ndarray:
        # from_numpy shares memory with the source array (no copy).
        return torch.from_numpy(arr)
    if type(arr) == pd.DataFrame or type(arr) == pd.Series:
        # Remember the pandas origin before losing it in the conversion.
        self.is_pandas = True
        return torch.from_numpy(arr.to_numpy())
    if isinstance(arr, list):
        return torch.tensor(arr)
    if type(arr) == Tensor:
        return arr

    raise ValueError('Non supported type: ', type(arr).__name__)


class StoredDataset(Dataset):
"""Abstract Class for a Dataset that can be downloaded from a URL and stored in a file"""
Expand Down Expand Up @@ -146,7 +139,7 @@ def download(url: str, dest_path: str, filename: str, unzip: Optional[bool] = Fa
os.makedirs(dest_path, exist_ok=True)
logger.info("Downloading the dataset...")
urllib.request.urlretrieve(url, file_path)
logger.info('Dataset Downloaded')
logger.info("Dataset Downloaded")

if unzip:
StoredDataset.extract_archive(zip_path=file_path, dest_path=dest_path, remove_archive=False)
Expand Down Expand Up @@ -205,7 +198,7 @@ def split_debug(datafile: str, dest_datafile: str, ratio: int, shuffle: Optional
logger.info("Shuffling data")
np.random.shuffle(data)

debug_data = data[:int(len(data) * ratio)]
debug_data = data[: int(len(data) * ratio)]
logger.info(f"Saving {ratio} of the data to {dest_datafile}")
np.savetxt(dest_datafile, debug_data, delimiter=delimiter, fmt=fmt)

Expand All @@ -224,17 +217,19 @@ class ArrayDataset(Dataset):

def __init__(self, x: INPUT_DATA_ARRAY_TYPE, y: Optional[INPUT_DATA_ARRAY_TYPE] = None,
             features_names: Optional[list] = None, **kwargs):
    """
    Create an in-memory dataset from array-like data.

    :param x: samples (numpy array, pandas DataFrame/Series, list or Tensor)
    :param y: labels, optional; must have the same length as ``x``
    :param features_names: optional feature names; when ``x`` is pandas they
                           must match ``x.columns``
    :raises ValueError: if ``features_names`` disagrees with ``x.columns``,
                        or if ``x`` and ``y`` have different lengths
    """
    # Remember whether the input was pandas so feature names can be taken
    # from the DataFrame columns. (Fixes the duplicated
    # `self.is_pandas = self.is_pandas = ...` assignment typo.)
    self.is_pandas = isinstance(x, (pd.DataFrame, pd.Series))

    self.features_names = features_names
    self._y = array2numpy(y) if y is not None else None
    self._x = array2numpy(x)

    if self.is_pandas:
        if features_names and not np.array_equal(features_names, x.columns):
            raise ValueError("The supplied features are not the same as in the data features")
        self.features_names = x.columns.to_list()

    if self._y is not None and len(self._x) != len(self._y):
        raise ValueError("Non equivalent lengths of x and y")

def get_samples(self) -> OUTPUT_DATA_ARRAY_TYPE:
"""
Expand Down Expand Up @@ -278,9 +273,9 @@ def __init__(self, pred: INPUT_DATA_ARRAY_TYPE, x: Optional[INPUT_DATA_ARRAY_TYP
y: Optional[INPUT_DATA_ARRAY_TYPE] = None, features_names: Optional[list] = None, **kwargs):
self.is_pandas = False
self.features_names = features_names
self._pred = self._array2numpy(pred)
self._y = self._array2numpy(y) if y is not None else None
self._x = self._array2numpy(x) if x is not None else None
self._pred = array2numpy(pred)
self._y = array2numpy(y) if y is not None else None
self._x = array2numpy(x) if x is not None else None
if self.is_pandas and x is not None:
if features_names and not np.array_equal(features_names, x.columns):
raise ValueError("The supplied features are not the same as in the data features")
Expand Down Expand Up @@ -327,14 +322,16 @@ class PytorchData(Dataset):
:type y: numpy array or pandas DataFrame or list or pytorch Tensor, optional
"""
def __init__(self, x: INPUT_DATA_ARRAY_TYPE, y: Optional[INPUT_DATA_ARRAY_TYPE] = None, **kwargs):
self.is_pandas = False
self._y = self._array2torch_tensor(y) if y is not None else None
self._x = self._array2torch_tensor(x)
self._y = array2torch_tensor(y) if y is not None else None
self._x = array2torch_tensor(x)

self.is_pandas = type(x) == pd.DataFrame or type(x) == pd.Series

if self.is_pandas:
self.features_names = x.columns

if self._y is not None and len(self._x) != len(self._y):
raise ValueError('Non equivalent lengths of x and y')
raise ValueError("Non equivalent lengths of x and y")

if self._y is not None:
self.__getitem__ = self.get_item
Expand All @@ -347,15 +344,15 @@ def get_samples(self) -> OUTPUT_DATA_ARRAY_TYPE:

:return: samples as numpy array
"""
return self._array2numpy(self._x)
return array2numpy(self._x)

def get_labels(self) -> OUTPUT_DATA_ARRAY_TYPE:
    """
    Get labels.

    :return: labels as a numpy array, or None if no labels were supplied
    """
    # Single conversion path via the module-level helper (the duplicated
    # legacy `self._array2numpy` return made this line dead code).
    return array2numpy(self._y) if self._y is not None else None

def get_predictions(self) -> OUTPUT_DATA_ARRAY_TYPE:
"""
Expand Down Expand Up @@ -392,6 +389,7 @@ def __len__(self):

class DatasetFactory:
"""Factory class for dataset creation"""

registry = {}

@classmethod
Expand All @@ -406,7 +404,7 @@ def register(cls, name: str) -> Callable:

def inner_wrapper(wrapped_class: Type[Dataset]) -> Any:
    """Register ``wrapped_class`` in the factory under ``name`` and return it."""
    if name in cls.registry:
        # Re-registration is allowed but replaces the previous class, so warn
        # once (the duplicated warning call that logged twice was removed).
        logger.warning("Dataset %s already exists. Will replace it", name)
    cls.registry[name] = wrapped_class
    return wrapped_class

Expand All @@ -428,7 +426,7 @@ def create_dataset(cls, name: str, **kwargs) -> Dataset:
:return: An instance of the dataset that is created.
"""
if name not in cls.registry:
msg = f'Dataset {name} does not exist in the registry'
msg = f"Dataset {name} does not exist in the registry"
logger.error(msg)
raise ValueError(msg)

Expand Down