## General imports

In [77]:
import pandas as pd
import numpy as np
import copy
from copy import deepcopy
from sklearn import preprocessing

## Defining the structural equations of the synthetic model

In [45]:
def generate_synthetic_data(p, n, *, seed=None):
    """Sample ``n`` rows from the synthetic structural model.

    Structural equations:

    * ``C ~ Bernoulli(p)``
    * ``A ~ Bernoulli(0.55)`` when ``C == 0``, else ``Bernoulli(0.75)``
    * ``M`` takes values in ``{0, 1, 2}`` with probabilities
      ``[0.5, 0.4, 0.1]`` when ``A == 0``, else ``[0.35, 0.4, 0.25]``
    * ``Y = 0.5*C + 0.75*A + 0.6*M + eps`` with ``eps ~ Normal(0, 1)``

    Args:
        p: Probability that ``C`` equals 1.
        n: Number of rows to generate.
        seed: Optional seed. When given, a dedicated
            ``numpy.random.default_rng(seed)`` generator is used so the
            result is reproducible; when ``None`` the global ``numpy.random``
            state is used.

    Returns:
        A ``pandas.DataFrame`` with columns ``C``, ``A``, ``M`` and ``Y``
        and ``n`` rows.
    """
    # Both the numpy.random module and a Generator expose binomial/choice/normal.
    rng = np.random if seed is None else np.random.default_rng(seed)

    C = rng.binomial(1, p, n)
    # binomial broadcasts over an array of per-row success probabilities.
    A = rng.binomial(1, np.where(C == 0, 0.55, 0.75))
    # choice() takes a single probability vector, so draw one candidate per
    # regime for every row and keep the one matching that row's A.
    M = np.where(
        A == 0,
        rng.choice(3, n, p=[0.5, 0.4, 0.1]),
        rng.choice(3, n, p=[0.35, 0.4, 0.25]),
    )
    Y = 0.5 * C + 0.75 * A + 0.6 * M + rng.normal(size=n)
    return pd.DataFrame({'C': C, 'A': A, 'M': M, 'Y': Y})

In [58]:
# Draw the synthetic population: 100,000 rows with P(C = 1) = 0.35.
df = generate_synthetic_data(p=0.35, n=100_000)

In [59]:
# Min-max scale the continuous target Y into [0, 1].
# minmax_scale's defaults (feature_range=(0, 1), axis=0, copy=True) are used.
df['Y'] = preprocessing.minmax_scale(df['Y'])

## Binarizing the target and generating the different datasets

In [None]:
# Binarize the scaled target at each of its three quartiles and save one
# dataset per threshold.
import os

# setting the three thresholds (quartiles of the scaled target)
th1 = df.Y.quantile(0.25)
th2 = df.Y.quantile(0.5)
th3 = df.Y.quantile(0.75)
# important note: if we choose a threshold th1-0.1 or th3+0.1 --> fair results --> LDP has no impact on fairness
list_th = [th1, th2, th3]
# Snapshot of the continuous data; kept under its original name in case
# other cells refer to it.
synthCopy = deepcopy(df)

out_dir = 'Datasets/Draft'
# Make sure the target directory exists so the first write cannot fail on it.
os.makedirs(out_dir, exist_ok=True)

for th in list_th:
    # Build the binarized frame separately instead of overwriting df['Y'] and
    # restoring it afterwards: df keeps its continuous Y even if a write fails.
    binarized = df.assign(Y=np.where(df['Y'] > th, 1, 0))
    # saving the generated dataset; the file name carries the rounded threshold
    binarized.to_csv(out_dir + '/S1_' + str(round(th, 2)) + '.csv', index=False)