In [2]:
from data_synthesizer.DataDescriber import DataDescriber
from data_synthesizer.DataGenerator import DataGenerator
from data_synthesizer.ModelInspector import ModelInspector
from data_synthesizer.lib.utils import read_json_file, display_bayesian_network
from data_synthesizer.lib.PrivBayes import construct_noisy_conditional_distributions
from pathlib import Path
from sklearn.utils import shuffle

import pandas as pd

# JSON data-description files that the scenarios below start from.
description = '/vol/kiakademie/stream_fairness_exp/adult_from_survey_description_preprocessed.json'
clean_description = '/vol/kiakademie/stream_fairness_exp/adult_from_survey_description_debiased_preprocessed.json'

# Number of tuples requested from the generator for each synthetic dataset.
num_tuples_to_generate = 50_000

from DataManipulator import DataManipulator

In [3]:
# Load the stored data description into a fresh DataDescriber and keep a
# handle on its Bayesian network: a list of [child, [parent, ...]] entries
# (see the printed output below).
describer = DataDescriber()
describer.data_description = read_json_file(description)
bayesian_net = describer.data_description['bayesian_network']

print(bayesian_net)

[['relationship', ['sex']], ['age', ['relationship']], ['hours-per-week', ['sex', 'age', 'relationship']], ['workclass', ['hours-per-week', 'age', 'relationship']], ['occupation', ['hours-per-week', 'age', 'sex', 'workclass']], ['education', ['occupation', 'age', 'sex', 'workclass']], ['native-country', ['education', 'age', 'workclass']], ['marital-status', ['native-country', 'age', 'relationship']], ['race', ['native-country', 'age', 'sex', 'relationship']], ['capital-gain', ['relationship', 'occupation', 'education']], ['capital-loss', ['relationship', 'occupation', 'capital-gain']], ['income', ['relationship', 'occupation', 'capital-gain', 'capital-loss', 'age', 'education']]]


In [4]:
def get_parent_values(description, parent):
    """Map every distribution bin of an attribute to its position.

    Parameters
    ----------
    description : dict
        Data description; must contain
        ``description['attribute_description'][parent]['distribution_bins']``.
    parent : str
        Name of the attribute whose bins are looked up.

    Returns
    -------
    dict
        ``{bin_value: index}``. If a bin value occurs more than once, the
        last index wins.
    """
    attribute_bins = description['attribute_description'][parent]['distribution_bins']
    return {bin_value: position for position, bin_value in enumerate(attribute_bins)}

def get_parent_indices(bn, child):
    """Return the position of each parent of ``child`` in the Bayesian network.

    Parameters
    ----------
    bn : list
        Network as a list of ``[node, [parent, ...]]`` entries.
    child : str
        Node whose parents are looked up.

    Returns
    -------
    dict
        ``{parent_name: position_in_parent_list}``; empty when ``child`` has
        no entry in ``bn``.
    """
    positions = {}

    for entry in bn:
        if entry[0] != child:
            continue
        for position, parent_name in enumerate(entry[1]):
            positions[parent_name] = position

    return positions

In [6]:
# Position of each parent of 'hours-per-week' within its parent list; the
# scenarios below refer to parents by these positions.
parent_indices = get_parent_indices(bayesian_net, 'hours-per-week')
print(parent_indices)

{'sex': 0, 'age': 1, 'relationship': 2}


In [7]:
# Bin -> index mapping for 'hours-per-week'.
print(get_parent_values(description=describer.data_description, parent='hours-per-week'))

{'40-60': 0, '<40': 1, '>60': 2}


In [12]:
# Bin -> index mapping for 'sex'.
print(get_parent_values(description=describer.data_description, parent='sex'))

{'Female': 0, 'Male': 1}


In [9]:
# Bin -> index mapping for 'marital-status'.
print(get_parent_values(description=describer.data_description, parent='marital-status'))

{'Divorced': 0, 'Married-AF-spouse': 1, 'Married-civ-spouse': 2, 'Married-spouse-absent': 3, 'Never-married': 4, 'Separated': 5, 'Widowed': 6}


In [10]:
# Bin -> index mapping for 'income'.
print(get_parent_values(description=describer.data_description, parent='income'))

{0.0: 0, 1.0: 1}


In [38]:
# Scenario "women_work_full": shift the working hours of women.
manipulator = DataManipulator()

out_file = './data/adult/women_work_full_on_unmodified_scenario_description.json'

# Factors per 'hours-per-week' bin, in the bin order printed above:
# 2.0 * '40-60', 0.0 * '<40', 1.5 * '>60' (Note: American workhours are long...)
scenario_women_work_full = {
    'hours-per-week': {
        'dist': [2.0, 0.0, 1.5],
        'parent_indices': [0],      # position 0 in the parent list is 'sex'
        'parent_values': {0: [0]},  # only for sex = Female (bin index 0)
    }
}
changing_attributes = ['hours-per-week']

manipulator.induce_data_drift(
    description_file=description,
    out_file=out_file,
    changing_attributes=changing_attributes,
    add_dist=scenario_women_work_full,
)

Inducing drift for hours-per-week
0 [0]
0 [0]
0 [0]
0 [0]
0 [0]
0 [0]
0 [0]
0 [0]
0 [0]
0 [0]
0 [0]
0 [0]
0 [0]
0 [0]
0 [0]
0 [0]
0 [0]
0 [0]
0 [0]
1 is not used for drift because parent values are [0]
0 [0]
1 is not used for drift because parent values are [0]
0 [0]
1 is not used for drift because parent values are [0]
0 [0]
1 is not used for drift because parent values are [0]
0 [0]
1 is not used for drift because parent values are [0]
0 [0]
1 is not used for drift because parent values are [0]
0 [0]
1 is not used for drift because parent values are [0]
0 [0]
1 is not used for drift because parent values are [0]
0 [0]
1 is not used for drift because parent values are [0]
0 [0]
1 is not used for drift because parent values are [0]
0 [0]
1 is not used for drift because parent values are [0]
0 [0]
1 is not used for drift because parent values are [0]
0 [0]
1 is not used for drift because parent values are [0]
0 [0]
1 is not used for drift because parent values are [0]
0 [0]
1 is not use

In [39]:
# Scenario "men_work_full": shift the working hours of men.
manipulator = DataManipulator()

out_file = './data/adult/men_work_full_on_unmodified_scenario_description.json'

# Factors per 'hours-per-week' bin for men, in the bin order printed above:
# 2.0 * '40-60', 0.5 * '<40', 1.5 * '>60' (Note: American workhours are long...)
scenario_men_work_full = {
    'hours-per-week': {
        'dist': [2.0, 0.5, 1.5],
        'parent_indices': [0],      # position 0 in the parent list is 'sex'
        'parent_values': {0: [1]},  # only for sex = Male (bin index 1)
    }
}
changing_attributes = ['hours-per-week']

manipulator.induce_data_drift(
    description_file=description,
    out_file=out_file,
    changing_attributes=changing_attributes,
    add_dist=scenario_men_work_full,
)

Inducing drift for hours-per-week
0 [1]
0 is not used for drift because parent values are [1]
0 [1]
0 is not used for drift because parent values are [1]
0 [1]
0 is not used for drift because parent values are [1]
0 [1]
0 is not used for drift because parent values are [1]
0 [1]
0 is not used for drift because parent values are [1]
0 [1]
0 is not used for drift because parent values are [1]
0 [1]
0 is not used for drift because parent values are [1]
0 [1]
0 is not used for drift because parent values are [1]
0 [1]
0 is not used for drift because parent values are [1]
0 [1]
0 is not used for drift because parent values are [1]
0 [1]
0 is not used for drift because parent values are [1]
0 [1]
0 is not used for drift because parent values are [1]
0 [1]
0 is not used for drift because parent values are [1]
0 [1]
0 is not used for drift because parent values are [1]
0 [1]
0 is not used for drift because parent values are [1]
0 [1]
0 is not used for drift because parent values are [1]
0 [1]


In [None]:
# Scenario "women_stem": change the occupation distribution of women.
manipulator = DataManipulator()

out_file = './data/adult/women_stem_on_unmodified_scenario_description.json'

# Women become more likely to be in Armed-Forces, Craft-repair and Tech-support
# and less likely to hold Adm-clerical positions.
# NOTE(review): the list carries one more factor than there are named
# occupation bins ("one distribution bin not listed - no entry?") - confirm
# what the trailing entry applies to.
scenario_women_stem = {
    'occupation': {
        'dist': [1.0, 0.25, 1.75, 1.75, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.75, 1.0, 1.0],
        'parent_indices': [2],      # position 2 in the parent list of 'occupation' is 'sex'
        'parent_values': {2: [0]},  # only for sex = Female (bin index 0)
    }
}
changing_attributes = ['occupation']

manipulator.induce_data_drift(
    description_file=description,
    out_file=out_file,
    changing_attributes=changing_attributes,
    add_dist=scenario_women_stem,
)

In [16]:
# Scenario "less_marriage": fewer married, more unmarried/divorced/separated people.
manipulator = DataManipulator()

out_file = './data/adult/less_marriage_on_unmodified_scenario_description.json'

# Bin order of 'marital-status' (from get_parent_values above):
# {'Divorced': 0, 'Married-AF-spouse': 1, 'Married-civ-spouse': 2, 'Married-spouse-absent': 3,
#  'Never-married': 4, 'Separated': 5, 'Widowed': 6}
# The factors below are applied to that distribution.
# NOTE(review): 8 factors are given for the 7 bins listed - confirm what the
# trailing entry applies to.
scenario_less_marriage = {
    'marital-status': {
        'dist': [1.5, 0.8, 0.8, 0.8, 1.5, 1.5, 1.0, 1.0],
    }
}
changing_attributes = ['marital-status']

manipulator.induce_data_drift(
    description_file=description,
    out_file=out_file,
    changing_attributes=changing_attributes,
    add_dist=scenario_less_marriage,
    non_disc=True,
)

Inducing drift for marital-status


In [17]:
# Scenario "inflation": generally increase everything relating to the value of
# money - income as well as capital gains and losses.
manipulator = DataManipulator()

out_file = './data/adult/inflation_on_unmodified_scenario_description.json'

# All three attributes get the same pair of factors. The 'capital-gain' entry
# used to read `[0.5, 1,5]`, a comma typo that produced the three-element list
# [0.5, 1, 5] instead of [0.5, 1.5].
scenario_inflation = {
    'income': {'dist': [0.5, 1.5]},
    'capital-gain': {'dist': [0.5, 1.5]},
    'capital-loss': {'dist': [0.5, 1.5]},
}
changing_attributes = ['income', 'capital-gain', 'capital-loss']

manipulator.induce_data_drift(
    description_file=description,
    out_file=out_file,
    changing_attributes=changing_attributes,
    add_dist=scenario_inflation,
    non_disc=True,
)

Inducing drift for income
Inducing drift for capital-gain
Inducing drift for capital-loss


In [18]:
def generate_arff_head(description):
    """Print one ARFF ``@attribute`` line per non-key attribute.

    Every attribute in ``description['meta']['all_attributes']`` that is not
    listed in ``description['meta']['candidate_keys']`` is printed as
    ``@attribute <name> { <bin>, <bin>, ...}`` using the attribute's
    ``distribution_bins``. Nothing is returned.
    """
    meta = description['meta']
    for attribute in meta['all_attributes']:
        if attribute in meta['candidate_keys']:
            continue

        bins = description['attribute_description'][attribute]['distribution_bins']

        # All bins but the last are followed by ", "; the last one closes the set.
        leading = "".join(str(val) + ", " for val in bins[:-1])
        val_string = "{ " + leading + str(bins[-1]) + "}"

        print("@attribute " + attribute + " " + val_string)

In [19]:
# Print the @attribute lines for the loaded description.
generate_arff_head(describer.data_description)

@attribute age { 25-60, <25, >60}
@attribute workclass { ?, Federal-gov, Local-gov, Never-worked, Private, Self-emp-inc, Self-emp-not-inc, State-gov, Without-pay}
@attribute fnlwgt { 12285.0, 86190.75, 160096.5, 234002.25, 307908.0, 381813.75, 455719.5, 529625.25, 603531.0, 677436.75, 751342.5, 825248.25, 899154.0, 973059.75, 1046965.5, 1120871.25, 1194777.0, 1268682.75, 1342588.5, 1416494.25}
@attribute education { 10th, 11th, 12th, 1st-4th, 5th-6th, 7th-8th, 9th, Assoc-acdm, Assoc-voc, Bachelors, Doctorate, HS-grad, Masters, Preschool, Prof-school, Some-college}
@attribute education-num { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}
@attribute marital-status { Divorced, Married-AF-spouse, Married-civ-spouse, Married-spouse-absent, Never-married, Separated, Widowed}
@attribute occupation { ?, Adm-clerical, Armed-Forces, Craft-repair, Exec-managerial, Farming-fishing, Handlers-cleaners, Machine-op-inspct, Other-service, Priv-house-serv, Prof-specialty, Protective-serv, Sales,

In [20]:
#transform data for arff format - note: head needs to be once manually perfected with data from above ;)
#also, all the inputs  except for the arff_head are INCOMPLETE paths, as this gets done for multiple repetitions in here (maybe I should do it in an outer loop? Whatever...)

def create_arffs(in_csv, arff_head, temp_middle_csv, out_arff, threshold=10, repetitions=10):
    """Convert per-repetition CSV files into ARFF files.

    For every repetition ``r`` in ``range(repetitions)`` this

    1. reads ``f"{in_csv}{r}.csv"``,
    2. replaces the grade columns ``G1``/``G2``/``G3`` (if present) by a binary
       class (0 below ``threshold``, 1 otherwise) and every missing value by
       the ARFF missing marker ``?``,
    3. writes the result to ``f"{temp_middle_csv}{r}.csv"``,
    4. writes ``f"{out_arff}{r}.arff"`` as the ARFF head followed by the CSV
       rows without the CSV column header.

    Parameters
    ----------
    in_csv, temp_middle_csv, out_arff : str
        INCOMPLETE paths (prefixes); the repetition number and the file
        extension are appended here.
    arff_head : str
        Complete path of the file holding the ARFF header.
    threshold : int, default 10
        Grades below this value become class 0, all others class 1.
    repetitions : int, default 10
        Number of ``run_<r>`` files to process.
    """
    grade_columns = ('G1', 'G2', 'G3')

    for repetition in range(repetitions):
        df = pd.read_csv(f"{in_csv}{repetition}.csv", index_col=False)

        new_data = []
        for entry in df.to_dict(orient='records'):
            new_entry = {}
            for k, raw in entry.items():
                # read_csv yields NaN (not None or '') for empty cells, so the
                # missing check has to use pd.isna; it runs first so that a
                # missing grade becomes '?' instead of crashing in int().
                if pd.isna(raw) or raw == '':
                    value = '?'
                elif k in grade_columns:
                    value = 0 if int(raw) < threshold else 1
                else:
                    value = raw
                new_entry[k] = value
            new_data.append(new_entry)

        # Passing the columns keeps the header even when the input has no rows.
        new_df = pd.DataFrame(new_data, columns=df.columns)
        middle_csv = f"{temp_middle_csv}{repetition}.csv"
        new_df.to_csv(middle_csv, index=False)

        with open(f"{out_arff}{repetition}.arff", 'w') as outfile:
            with open(arff_head) as head_file:
                head_text = head_file.read()
            outfile.write(head_text)
            # Keep the first data row off the last header line.
            if head_text and not head_text.endswith('\n'):
                outfile.write('\n')

            with open(middle_csv) as csv_file:
                next(csv_file, None)  # skip the CSV column header
                outfile.writelines(csv_file)

In [23]:
scenario = 'unmodified'

# Generation is disabled for this scenario; the CSV read below must already
# exist on disk. To (re)generate it, run:
#
#   generator = DataGenerator()
#   generator.generate_dataset_in_correlated_attribute_mode(num_tuples_to_generate, description)
#   generator.save_synthetic_data(f"./data/adult/{scenario}.csv")

df = pd.read_csv(f'./data/adult/{scenario}.csv', index_col=False)
print(df.columns)
# NOTE(review): the printed column list also contains 'Unnamed: 0.1', which is
# not dropped here - confirm that the ARFF head accounts for it.
# Rows with missing values are removed once, before shuffling.
df = df.drop(columns=["Unnamed: 0", "fnlwgt", "education-num"]).dropna()

# Make sure both output directories exist; writing into a missing directory
# would otherwise fail on a fresh checkout.
Path(f"./data/adult/{scenario}").mkdir(parents=True, exist_ok=True)
Path(f"./data/adult/{scenario}_preprocessed").mkdir(parents=True, exist_ok=True)

for repetition in range(10):
    # Seeded with the repetition number so every run file is reproducible.
    df = shuffle(df, random_state=repetition)
    df.to_csv(f"./data/adult/{scenario}/run_{repetition}.csv", index=False)


# All of these except arff_head are path prefixes; create_arffs appends the
# repetition number and the file extension.
in_csv = f"./data/adult/{scenario}/run_"
arff_head = "./data/adult/arff_head.arff"
tem_middle_csv = f"./data/adult/{scenario}_preprocessed/run_"
out_arff = f"./data/adult/{scenario}_preprocessed/run_"

create_arffs(in_csv, arff_head=arff_head, temp_middle_csv=tem_middle_csv, out_arff=out_arff)

Index(['Unnamed: 0.1', 'Unnamed: 0', 'age', 'workclass', 'fnlwgt', 'education',
       'education-num', 'marital-status', 'occupation', 'relationship', 'race',
       'sex', 'capital-gain', 'capital-loss', 'hours-per-week',
       'native-country', 'income'],
      dtype='object')


In [41]:

scenario = 'men_work_full'

# Generate the synthetic dataset from the drifted description of this scenario.
generator = DataGenerator()
generator.generate_dataset_in_correlated_attribute_mode(num_tuples_to_generate, f'./data/adult/{scenario}_on_unmodified_scenario_description.json')
generator.save_synthetic_data(f"./data/adult/{scenario}.csv")

df = pd.read_csv(f'./data/adult/{scenario}.csv', index_col=False)
print(df.columns)
# NOTE(review): the printed column list also contains 'Unnamed: 0.1', which is
# not dropped here - confirm that the ARFF head accounts for it.
# Rows with missing values are removed once, before shuffling.
df = df.drop(columns=["Unnamed: 0", "fnlwgt", "education-num"]).dropna()

# Make sure both output directories exist; writing into a missing directory
# would otherwise fail on a fresh checkout.
Path(f"./data/adult/{scenario}").mkdir(parents=True, exist_ok=True)
Path(f"./data/adult/{scenario}_preprocessed").mkdir(parents=True, exist_ok=True)

for repetition in range(10):
    # Seeded with the repetition number so every run file is reproducible.
    df = shuffle(df, random_state=repetition)
    df.to_csv(f"./data/adult/{scenario}/run_{repetition}.csv", index=False)


# All of these except arff_head are path prefixes; create_arffs appends the
# repetition number and the file extension.
in_csv = f"./data/adult/{scenario}/run_"
arff_head = "./data/adult/arff_head.arff"
tem_middle_csv = f"./data/adult/{scenario}_preprocessed/run_"
out_arff = f"./data/adult/{scenario}_preprocessed/run_"

create_arffs(in_csv, arff_head=arff_head, temp_middle_csv=tem_middle_csv, out_arff=out_arff)

Unnamed: 0.1 gets processed
Unnamed: 0 gets processed
age gets processed
workclass gets processed
fnlwgt gets processed
education gets processed
education-num gets processed
marital-status gets processed
occupation gets processed
relationship gets processed
race gets processed
sex gets processed
capital-gain gets processed
capital-loss gets processed
hours-per-week gets processed
native-country gets processed
income gets processed
Index(['Unnamed: 0.1', 'Unnamed: 0', 'age', 'workclass', 'fnlwgt', 'education',
       'education-num', 'marital-status', 'occupation', 'relationship', 'race',
       'sex', 'capital-gain', 'capital-loss', 'hours-per-week',
       'native-country', 'income'],
      dtype='object')


In [21]:
#transform data for arff format - note: head needs to be once manually perfected with data from above ;)
#also, all the inputs  except for the arff_head are INCOMPLETE paths, as this gets done for multiple repetitions in here (maybe I should do it in an outer loop? Whatever...)

#MODIFIED - High class only for 14+ instead of 10+ !!!

def create_arffs_diff_class(in_csv, arff_head, temp_middle_csv, out_arff, threshold=14, repetitions=10):
    """Convert per-repetition CSV files into ARFF files with a stricter class.

    For every repetition ``r`` in ``range(repetitions)`` this

    1. reads ``f"{in_csv}{r}.csv"``,
    2. replaces the grade columns ``G1``/``G2``/``G3`` (if present) by a binary
       class (0 below ``threshold``, 1 otherwise) and every missing value by
       the ARFF missing marker ``?``,
    3. writes the result to ``f"{temp_middle_csv}{r}.csv"``,
    4. writes ``f"{out_arff}{r}.arff"`` as the ARFF head followed by the CSV
       rows without the CSV column header.

    Parameters
    ----------
    in_csv, temp_middle_csv, out_arff : str
        INCOMPLETE paths (prefixes); the repetition number and the file
        extension are appended here.
    arff_head : str
        Complete path of the file holding the ARFF header.
    threshold : int, default 14
        Grades below this value become class 0, all others class 1
        (the high class starts at 14 here instead of 10).
    repetitions : int, default 10
        Number of ``run_<r>`` files to process.
    """
    grade_columns = ('G1', 'G2', 'G3')

    for repetition in range(repetitions):
        df = pd.read_csv(f"{in_csv}{repetition}.csv", index_col=False)

        new_data = []
        for entry in df.to_dict(orient='records'):
            new_entry = {}
            for k, raw in entry.items():
                # read_csv yields NaN (not None or '') for empty cells, so the
                # missing check has to use pd.isna; it runs first so that a
                # missing grade becomes '?' instead of crashing in int().
                if pd.isna(raw) or raw == '':
                    value = '?'
                elif k in grade_columns:
                    value = 0 if int(raw) < threshold else 1
                else:
                    value = raw
                new_entry[k] = value
            new_data.append(new_entry)

        # Passing the columns keeps the header even when the input has no rows.
        new_df = pd.DataFrame(new_data, columns=df.columns)
        middle_csv = f"{temp_middle_csv}{repetition}.csv"
        new_df.to_csv(middle_csv, index=False)

        with open(f"{out_arff}{repetition}.arff", 'w') as outfile:
            with open(arff_head) as head_file:
                head_text = head_file.read()
            outfile.write(head_text)
            # Keep the first data row off the last header line.
            if head_text and not head_text.endswith('\n'):
                outfile.write('\n')

            with open(middle_csv) as csv_file:
                next(csv_file, None)  # skip the CSV column header
                outfile.writelines(csv_file)

In [22]:

scenario = 'grade_category_adjusted'

# Re-label the unmodified student-performance runs with the adjusted class
# boundary. All paths except arff_head are prefixes that
# create_arffs_diff_class completes with the repetition number and extension.
in_csv = "./data/student_performance/unmodified/run_"
arff_head = "./data/student_performance/arff_head.arff"
tem_middle_csv = f"./data/student_performance/{scenario}_preprocessed/run_"
out_arff = f"./data/student_performance/{scenario}_preprocessed/run_"

create_arffs_diff_class(
    in_csv,
    arff_head=arff_head,
    temp_middle_csv=tem_middle_csv,
    out_arff=out_arff,
)

In [31]:
def merge_datasets(pre_data_path, post_data_path, name, driftpoint, arff_head, repetitions=10):
    """Build drifting datasets by splicing a pre-drift and a post-drift run.

    For every repetition ``r`` in ``range(repetitions)`` the first
    ``driftpoint`` rows of ``<pre_data_path>/run_<r>.csv`` are followed by the
    rows from position ``driftpoint`` onwards of
    ``<post_data_path>/run_<r>.csv``. The result is written to
    ``<name>/run_<r>.csv`` and, prefixed with the ARFF head and without the
    CSV column header, to ``<name>/run_<r>.arff``.

    Parameters
    ----------
    pre_data_path, post_data_path : str
        Directories holding the ``run_<r>.csv`` files before / after the drift.
    name : str
        Output directory; created (including parents) if it does not exist.
    driftpoint : int
        Row position at which the data switches from pre- to post-drift.
    arff_head : str
        Path of the file holding the ARFF header.
    repetitions : int, default 10
        Number of ``run_<r>`` files to merge.
    """
    Path(name).mkdir(parents=True, exist_ok=True)

    with open(arff_head) as head_file:
        head_text = head_file.read()
    # Keep the first data row off the last header line.
    if head_text and not head_text.endswith('\n'):
        head_text += '\n'

    for repetition in range(repetitions):
        pre_drift = pd.read_csv(f"{pre_data_path}/run_{repetition}.csv")
        post_drift = pd.read_csv(f"{post_data_path}/run_{repetition}.csv")

        pre_df = pre_drift.iloc[:driftpoint, :]
        post_df = post_drift.iloc[driftpoint:, :]

        merged_csv = f"{name}/run_{repetition}.csv"
        pd.concat([pre_df, post_df]).to_csv(merged_csv, index=False)

        with open(f"{name}/run_{repetition}.arff", 'w') as outfile:
            outfile.write(head_text)
            with open(merged_csv) as csv_file:
                next(csv_file, None)  # skip the CSV column header
                outfile.writelines(csv_file)



In [43]:
scenario = 'women_work_full'

arff_head = "./data/adult/arff_head.arff"

# Directories with the preprocessed runs before and after the drift.
pre = "./data/adult/unmodified_preprocessed"
post = f"./data/adult/{scenario}_preprocessed"

# Row position at which the stream switches from pre- to post-drift data.
driftpoint = 25000
name = f"./data/adult/unmodified_to_{scenario}_{driftpoint}"

# Path.mkdir is an instance method, so it has to be called on a Path object,
# not on the class with a plain string. exist_ok=True makes re-running the
# cell harmless; any other failure is raised instead of being printed and
# ignored, because the merge below cannot succeed without the directory.
Path(name).mkdir(parents=True, exist_ok=True)

merge_datasets(pre_data_path=pre, post_data_path=post, name=name, driftpoint=driftpoint, arff_head=arff_head)