# Data Generator - by Meike Zehlike

In [1]:
'''
Created on Oct 3, 2017
@author: meike.zehlike
'''
import numpy as np
import pandas as pd
import random, uuid
import itertools


class SyntheticDatasetCreator(object):
    """
    Creates a synthetic dataset of candidates. Each candidate (row) gets a randomly chosen
    category for every protected attribute and a normally distributed score for every
    non-protected attribute. All candidates that share the same combination of protected
    categories (i.e. belong to the same group) draw their scores from the same distribution.
    """

    @property
    def dataset(self):
        """
        a dataframe that contains protected and non-protected features in columns. Each row
        represents a candidate with their feature values
        """
        return self.__dataset

    @property
    def groups(self):
        """
        refers to possible combinations of protected attributes. Each group is an element of the
        Cartesian product of the element set per protected attribute.
        example:   attribute gender has two possible elements {0, 1}, attribute ethnicity has
                   three possible elements {0, 1, 2} --> there are six groups
                   a group is determined by one of the tuples (0, 0), (0, 1), (0, 2), ..., (1, 2)
        the non-protected group is always represented by the tuple with only zeros. The positions
        inside a tuple follow the iteration order of the attribute dictionary.
        """
        return self.__groups

    def __init__(self, size, attributeNamesAndCategories, nonProtectedAttributes):
        """
        @param size:                            number of candidates (rows) to generate
        @param attributeNamesAndCategories:     a dictionary that contains the names of the
                                                protected attributes as keys and the number of
                                                categories as values
                                                (e.g. {'ethnicity': 5, 'gender': 2})
        @param nonProtectedAttributes:          a string array that contains the names of the
                                                non-protected features (score columns)

        TODO: mu and sigma as parameters (currently drawn at random per group)
        """
        self.__dataset = pd.DataFrame()

        # remember which columns are protected: scores must be grouped by these columns only
        self.__protectedAttributes = list(attributeNamesAndCategories.keys())

        # determine groups of candidates
        self.__determineGroups(attributeNamesAndCategories)

        # generate distribution of protected attributes
        self.__createCategoricalProtectedAttributes(attributeNamesAndCategories, size)

        # generate scores per group
        self.__createScoresNormalDistribution(nonProtectedAttributes)

        # TODO: generate ID column (needs one distinct uuid per row)

    def writeToJSON(self, path):
        """
        writes the dataset to 'path' as JSON lines, i.e. one JSON record per candidate and line
        """
        self.__dataset.to_json(path, orient='records', lines=True)

    def __determineGroups(self, attributeNamesAndCategories):
        """
        stores the Cartesian product of the category ranges of all protected attributes
        """
        elementSets = [list(range(0, cardinality))
                       for cardinality in attributeNamesAndCategories.values()]

        self.__groups = list(itertools.product(*elementSets))

    def __createScoresNormalDistribution(self, nonProtectedAttributes):
        """
        adds one score column per non-protected attribute. For each column and each group of
        candidates (same values in all protected columns) a mean and a standard deviation are
        drawn uniformly from [0, 1); the scores of that group are then sampled from the
        resulting normal distribution.

        @param nonProtectedAttributes:     a string array that contains the names of the
                                           non-protected features
        """
        # Group by the protected columns only and compute the groups once. Grouping by all
        # columns would include previously generated float scores, which turns every row into
        # its own group as soon as a second non-protected attribute is generated.
        if self.__protectedAttributes:
            grouped = self.__dataset.groupby(self.__protectedAttributes, sort=False)
            groupRowLabels = [frame.index for _, frame in grouped]
        else:
            # no protected attributes: all candidates form one single group
            groupRowLabels = [self.__dataset.index]

        for attr in nonProtectedAttributes:
            scores = pd.Series(np.nan, index=self.__dataset.index, dtype=float)
            for rowLabels in groupRowLabels:
                mu = np.random.uniform()
                sigma = np.random.uniform()
                scores.loc[rowLabels] = np.random.normal(mu, sigma, size=len(rowLabels))
            self.__dataset[attr] = scores

    def __createCategoricalProtectedAttributes(self, attributeNamesAndCategories, numItems):
        """
        fills the dataset with one column per protected attribute, each candidate gets a
        uniformly chosen category per attribute

        @param attributeNamesAndCategories:         a dictionary that contains the names of the
                                                    protected attributes as keys and the number of
                                                    categories as values
                                                    (e.g. {'ethnicity': 5, 'gender': 2})
        @param numItems:                            number of items in entire created dataset (all
                                                    protection status)
        @return category zero is assumed to be the non-protected
        """
        # fixing the index up front keeps the row count even without any protected attribute
        newData = pd.DataFrame(index=range(numItems))

        for attributeName, cardinality in attributeNamesAndCategories.items():
            categories = range(0, cardinality)
            newData[attributeName] = [random.choice(categories) for _ in range(numItems)]

        # the dataset holds no other columns yet, so the protected columns become the dataset
        # (DataFrame.append is not available anymore in pandas >= 2.0)
        self.__dataset = newData

In [10]:
# Sample run: 500 candidates, one protected attribute "Group" with four categories and one
# non-protected score column "Quality". The result is stored as JSON lines and printed.
test = SyntheticDatasetCreator(500, {"Group":4}, ["Quality"])
test.writeToJSON('test_data_set_sample_500.json')
# call form of print behaves the same on Python 2 for a single argument and also runs on Python 3
print(test.dataset)

     Group   Quality
0        3 -0.039840
1        0  0.731639
2        3  0.373630
3        3  0.328663
4        2  0.206999
5        3  0.824956
6        0  0.500127
7        0  1.742901
8        0  0.675217
9        3  0.373685
10       1  0.773616
11       1  0.388889
12       0  0.384533
13       1  0.388438
14       0  0.920718
15       3  0.521755
16       2  0.429529
17       1  0.616314
18       1  0.630154
19       2  0.099669
20       2  0.004991
21       0  1.047730
22       3  0.353288
23       3  0.355112
24       2  0.051228
25       3  0.561300
26       2  0.151556
27       1  0.523969
28       1  0.700359
29       3  0.248716
..     ...       ...
470      1  1.057143
471      3  0.709258
472      3  0.618109
473      2  0.413723
474      0  1.851140
475      1  0.685880
476      1  0.845164
477      3  0.187097
478      2  0.269223
479      2  0.320761
480      3  0.221130
481      2  0.011889
482      3  0.192882
483      1  0.986052
484      1  0.630295
485      0  2