Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Data and Model wrappers #26

Merged
merged 3 commits into from Apr 27, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
104 changes: 46 additions & 58 deletions apt/anonymization/anonymizer.py
Expand Up @@ -8,6 +8,7 @@
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.preprocessing import OneHotEncoder
from apt.utils.datasets import ArrayDataset, DATA_PANDAS_NUMPY_TYPE

from typing import Union, Optional

Expand Down Expand Up @@ -49,61 +50,64 @@ def __init__(self, k: int, quasi_identifiers: Union[np.ndarray, list], categoric
self.categorical_features = categorical_features
self.is_regression = is_regression
self.train_only_QI = train_only_QI
self.features_names = None
self.features = None

def anonymize(self, x: Union[np.ndarray, pd.DataFrame], y: Union[np.ndarray, pd.DataFrame]) \
-> Union[np.ndarray, pd.DataFrame]:
def anonymize(self, dataset: ArrayDataset) -> DATA_PANDAS_NUMPY_TYPE:
"""
Method for performing model-guided anonymization.

:param x: The training data for the model. If provided as a pandas dataframe, may contain both numeric and
categorical data.
:param y: The predictions of the original model on the training data.
:param dataset: Data wrapper containing the training data for the model and the predictions of the
original model on the training data.
:return: An array containing the anonymized training dataset.
"""
if type(x) == np.ndarray:
self.features = [i for i in range(x.shape[1])]
return self._anonymize_ndarray(x.copy(), y)
else: # pandas
self.features = x.columns
if not self.categorical_features:
raise ValueError('When supplying a pandas dataframe, categorical_features must be defined')
return self._anonymize_pandas(x.copy(), y)

def _anonymize_ndarray(self, x, y):
if x.shape[0] != y.shape[0]:
raise ValueError("x and y should have same number of rows")
x_anonymizer_train = x
if self.train_only_QI:
# build DT just on QI features
x_anonymizer_train = x[:, self.quasi_identifiers]
if x.dtype.kind not in 'iufc':
x_prepared = self._modify_categorical_features(x_anonymizer_train)
if dataset.get_samples().shape[1] != 0:
self.features = [i for i in range(dataset.get_samples().shape[1])]
else:
x_prepared = x_anonymizer_train
if self.is_regression:
self.anonymizer = DecisionTreeRegressor(random_state=10, min_samples_split=2, min_samples_leaf=self.k)
raise ValueError('No data provided')

if dataset.features_names is not None:
self.features_names = dataset.features_names
else: # if no names provided, use numbers instead
self.features_names = self.features

if not set(self.quasi_identifiers).issubset(set(self.features_names)):
raise ValueError('Quasi identifiers should bs a subset of the supplied features or indexes in range of '
'the data columns')
if self.categorical_features and not set(self.categorical_features).issubset(set(self.features_names)):
raise ValueError('Categorical features should bs a subset of the supplied features or indexes in range of '
'the data columns')
self.quasi_identifiers = [i for i, v in enumerate(self.features_names) if v in self.quasi_identifiers]
if self.categorical_features:
self.categorical_features = [i for i, v in enumerate(self.features_names) if v in self.categorical_features]

transformed = self._anonymize(dataset.get_samples().copy(), dataset.get_labels())
if dataset.is_pandas:
return pd.DataFrame(transformed, columns=self.features_names)
else:
self.anonymizer = DecisionTreeClassifier(random_state=10, min_samples_split=2, min_samples_leaf=self.k)
self.anonymizer.fit(x_prepared, y)
cells_by_id = self._calculate_cells(x, x_prepared)
return self._anonymize_data_numpy(x, x_prepared, cells_by_id)
return transformed

def _anonymize_pandas(self, x, y):
def _anonymize(self, x, y):
if x.shape[0] != y.shape[0]:
raise ValueError("x and y should have same number of rows")
x_anonymizer_train = x
if x.dtype.kind not in 'iufc':
if not self.categorical_features:
raise ValueError('when supplying an array with non-numeric data, categorical_features must be defined')
x_prepared = self._modify_categorical_features(x)
else:
x_prepared = x
x_anonymizer_train = x_prepared
if self.train_only_QI:
# build DT just on QI features
x_anonymizer_train = x.loc[:, self.quasi_identifiers]
# need to one-hot encode before training the decision tree
x_prepared = self._modify_categorical_features(x_anonymizer_train)
x_anonymizer_train = x_prepared[:, self.quasi_identifiers]
if self.is_regression:
self.anonymizer = DecisionTreeRegressor(random_state=10, min_samples_split=2, min_samples_leaf=self.k)
else:
self.anonymizer = DecisionTreeClassifier(random_state=10, min_samples_split=2, min_samples_leaf=self.k)
self.anonymizer.fit(x_prepared, y)
cells_by_id = self._calculate_cells(x, x_prepared)
return self._anonymize_data_pandas(x, x_prepared, cells_by_id)

self.anonymizer.fit(x_anonymizer_train, y)
cells_by_id = self._calculate_cells(x, x_anonymizer_train)
return self._anonymize_data(x, x_anonymizer_train, cells_by_id)

def _calculate_cells(self, x, x_anonymizer_train):
# x is original data, x_anonymizer_train is only QIs + 1-hot encoded
Expand All @@ -130,15 +134,9 @@ def _find_representatives(self, x, x_anonymizer_train, cells):
# get all rows in cell
indexes = [index for index, node_id in enumerate(node_ids) if node_id == cell['id']]
# TODO: should we filter only those with majority label? (using hist)
if type(x) == np.ndarray:
rows = x[indexes]
else: # pandas
rows = x.iloc[indexes]
rows = x[indexes]
for feature in self.quasi_identifiers:
if type(x) == np.ndarray:
values = rows[:, feature]
else: # pandas
values = rows.loc[:, feature]
values = rows[:, feature]
if self.categorical_features and feature in self.categorical_features:
# find most common value
cell['representative'][feature] = Counter(values).most_common(1)[0][0]
Expand All @@ -163,7 +161,7 @@ def _find_sample_cells(self, samples, cells_by_id):
node_ids = self._find_sample_nodes(samples)
return [cells_by_id[node_id] for node_id in node_ids]

def _anonymize_data_numpy(self, x, x_anonymizer_train, cells_by_id):
def _anonymize_data(self, x, x_anonymizer_train, cells_by_id):
cells = self._find_sample_cells(x_anonymizer_train, cells_by_id)
index = 0
for row in x:
Expand All @@ -173,22 +171,12 @@ def _anonymize_data_numpy(self, x, x_anonymizer_train, cells_by_id):
row[feature] = cell['representative'][feature]
return x

def _anonymize_data_pandas(self, x, x_anonymizer_train, cells_by_id):
cells = self._find_sample_cells(x_anonymizer_train, cells_by_id)
index = 0
for i, row in x.iterrows():
cell = cells[index]
index += 1
for feature in cell['representative']:
x.at[i, feature] = cell['representative'][feature]
return x

def _modify_categorical_features(self, x):
# prepare data for DT
used_features = self.features
if self.train_only_QI:
used_features = self.quasi_identifiers
numeric_features = [f for f in x.columns if f in used_features and f not in self.categorical_features]
numeric_features = [f for f in self.features if f in used_features and f not in self.categorical_features]
categorical_features = [f for f in self.categorical_features if f in used_features]
numeric_transformer = Pipeline(
steps=[('imputer', SimpleImputer(strategy='constant', fill_value=0))]
Expand Down