In [1]:
from data_synthesizer.DataDescriber import DataDescriber
from data_synthesizer.DataGenerator import DataGenerator
from data_synthesizer.ModelInspector import ModelInspector
from data_synthesizer.lib.utils import read_json_file, display_bayesian_network
from data_synthesizer.lib.PrivBayes import construct_noisy_conditional_distributions
from pathlib import Path
from sklearn.utils import shuffle

import pandas as pd

# Data-description JSON that the cells below load and use as the basis for every drift scenario.
description = '/vol/kiakademie/stream_fairness_exp/student_performance_from_survey_description_full_por.json'
# Debiased variant of the same description (not referenced by the cells visible here).
clean_description = '/vol/kiakademie/stream_fairness_exp/student_performance_from_survey_description_full_pordebiased.json'

# Number of synthetic rows requested from the DataGenerator per dataset.
num_tuples_to_generate = 5000

from DataManipulator import DataManipulator

In [2]:
# Load the stored data description into a DataDescriber and pull out its Bayesian
# network, which is a list of [child, [parents]] entries (see the printed output).
describer = DataDescriber()
describer.data_description = read_json_file(description)
bayesian_net = describer.data_description['bayesian_network']

print(bayesian_net)

[['sex', ['activities', 'famsup']], ['schoolsup', ['sex']], ['school', ['schoolsup']], ['G1', ['school', 'sex']], ['address', ['school']], ['reason', ['school']], ['failures', ['G1']], ['G2', ['G1']], ['G3', ['G1', 'G2']], ['age', ['failures']], ['higher', ['G1', 'age']], ['guardian', ['age']], ['romantic', ['age']], ['Pstatus', ['guardian']], ['famsize', ['Pstatus']], ['nursery', ['famsize']], ['traveltime', ['address']], ['Mjob', ['internet']], ['Medu', ['Mjob']], ['Fedu', ['Medu']], ['Fjob', ['Fedu']], ['internet', ['school', 'address']]]


In [3]:
def get_parent_values(description, parent):
    """Map every distribution bin of ``parent`` to its position in the bin list.

    Args:
        description: Data description dict; must provide
            ``description['attribute_description'][parent]['distribution_bins']``.
        parent: Name of the attribute whose bins are looked up.

    Returns:
        dict of ``{bin_value: index}``. If a bin value occurs more than once,
        the last position is kept.
    """
    bin_values = description['attribute_description'][parent]['distribution_bins']
    return {bin_value: position for position, bin_value in enumerate(bin_values)}

def get_parent_indices(bn, child):
    """Return the position of each parent inside ``child``'s Bayesian-network entry.

    Args:
        bn: Bayesian network as a sequence of entries whose first item is the
            child attribute and whose second item is the list of its parents.
        child: Attribute whose parents are looked up.

    Returns:
        dict of ``{parent_name: position}``; empty when ``child`` has no entry.
        Should several entries share the same child, they are merged and later
        entries overwrite earlier ones.
    """
    positions = {}

    for entry in bn:
        if entry[0] != child:
            continue
        for position, parent_name in enumerate(entry[1]):
            positions[parent_name] = position

    return positions

In [4]:
# Position of each parent inside the 'schoolsup' entry of the Bayesian network.
parent_indices = get_parent_indices(bayesian_net, 'schoolsup')
print(parent_indices)

{'sex': 0}


In [None]:
# Bin value -> bin index for 'schoolsup'.
print(get_parent_values(description=describer.data_description, parent='schoolsup'))

{'no': 0, 'yes': 1}


In [5]:
# Bin value -> bin index for 'sex', the only parent of 'schoolsup' in the network.
print(get_parent_values(description=describer.data_description, parent='sex'))

{'F': 0, 'M': 1}


In [28]:
# Bin value -> bin index for 'internet'.
print(get_parent_values(description=describer.data_description, parent='internet'))

{'no': 0, 'yes': 1}


In [13]:
# Bin value -> bin index for 'G1'; note that the grade bins are not contiguous (no bins for 1-3).
print(get_parent_values(description=describer.data_description, parent='G1'))

{0: 0, 4: 1, 5: 2, 6: 3, 7: 4, 8: 5, 9: 6, 10: 7, 11: 8, 12: 9, 13: 10, 14: 11, 15: 12, 16: 13, 17: 14, 18: 15, 19: 16}


In [None]:
# Scenario "grade inflation": increase the probability of good grades across the board for G2 and G1.
manipulator = DataManipulator()

out_file = './data/student_performance/student_grade_inflation_on_unmodified_scenario_description.json'

# Per-bin factors: probability * 0.5 for grades 0-7, unchanged for the middle grades,
# probability * 1.5 for grades 14-19.
# NOTE(review): each list ends with one extra 1.0 after the 14-19 block; presumably this
# covers an additional (e.g. missing-value) slot - confirm against DataManipulator.
scenario_grade_inflation = {
    'G2': {'dist': [0.5] * 4 + [1.0] * 6 + [1.5] * 6 + [1.0]},
    'G1': {'dist': [0.5] * 5 + [1.0] * 6 + [1.5] * 6 + [1.0]},
}
changing_attributes = ['G1', 'G2']

manipulator.induce_data_drift(
    description_file=description,
    out_file=out_file,
    changing_attributes=changing_attributes,
    add_dist=scenario_grade_inflation,
    non_disc=True,
)

Inducing drift for G1
Inducing drift for G2


In [20]:
# Scenario "males more support": increase school support for male students.
manipulator = DataManipulator()

out_file = './data/student_performance/student_males_more_support_on_unmodified_scenario_description.json'

scenario_male_support = {
    'schoolsup': {
        'dist': [0.0, 2.0],          # one entry per 'schoolsup' bin ('no', 'yes')
        'parent_indices': [0],       # parent 0 of 'schoolsup' is 'sex'
        'parent_values': {0: [1]},   # presumably restricts the drift to sex bin 1 ('M') - see the log output
    }
}
changing_attributes = ['schoolsup']

manipulator.induce_data_drift(
    description_file=description,
    out_file=out_file,
    changing_attributes=changing_attributes,
    add_dist=scenario_male_support,
)

Inducing drift for schoolsup
0 [1]
0 is not used for drift because parent values are [1]
0 [1]


In [26]:
# Scenario "females more support": increase school support for female students.
manipulator = DataManipulator()

out_file = './data/student_performance/student_females_more_support_on_unmodified_scenario_description.json'

scenario_female_support = {
    'schoolsup': {
        'dist': [0.0, 2.0],          # one entry per 'schoolsup' bin ('no', 'yes')
        'parent_indices': [0],       # parent 0 of 'schoolsup' is 'sex'
        'parent_values': {0: [0]},   # presumably restricts the drift to sex bin 0 ('F') - see the log output
    }
}
changing_attributes = ['schoolsup']

manipulator.induce_data_drift(
    description_file=description,
    out_file=out_file,
    changing_attributes=changing_attributes,
    add_dist=scenario_female_support,
)

Inducing drift for schoolsup
0 [0]
0 [0]
1 is not used for drift because parent values are [0]


In [4]:
# Scenario "internet era": everyone gets internet.
manipulator = DataManipulator()

out_file = './data/student_performance/student_internet_era_on_unmodified_scenario_description.json'

scenario_internet_era = {
    'internet': {'dist': [0.0, 1.0]},   # one entry per 'internet' bin ('no', 'yes')
}
changing_attributes = ['internet']

manipulator.induce_data_drift(
    description_file=description,
    out_file=out_file,
    changing_attributes=changing_attributes,
    add_dist=scenario_internet_era,
    non_disc=True,
)

Inducing drift for internet


In [15]:
def generate_arff_head(description):
    """Print one ARFF ``@attribute`` line per attribute that is not a candidate key.

    Each line has the form ``@attribute <name> { v1, v2, ..., vn}``, where the
    values are the attribute's ``distribution_bins`` converted with ``str``.

    Args:
        description: Data description dict providing ``meta.all_attributes``,
            ``meta.candidate_keys`` and, for every listed non-key attribute,
            ``attribute_description[<name>]['distribution_bins']``.

    Raises:
        IndexError: If a non-key attribute has an empty bin list.
    """
    for name in description['meta']['all_attributes']:
        if name in description['meta']['candidate_keys']:
            continue

        bins = description['attribute_description'][name]['distribution_bins']

        # All values but the last are followed by ", "; the last one closes the set.
        leading = "".join(str(value) + ", " for value in bins[:-1])
        nominal_spec = "{ " + leading + str(bins[-1]) + "}"

        print("@attribute " + name + " " + nominal_spec)

In [16]:
# Print the @attribute lines for the loaded description.
generate_arff_head(describer.data_description)

@attribute school { GP, MS}
@attribute sex { F, M}
@attribute age { 15, 16, 17, 18, 19, 20, 21, 22}
@attribute address { R, U}
@attribute famsize { GT3, LE3}
@attribute Pstatus { A, T}
@attribute Medu { 0, 1, 2, 3, 4}
@attribute Fedu { 0, 1, 2, 3, 4}
@attribute Mjob { at_home, health, other, services, teacher}
@attribute Fjob { at_home, health, other, services, teacher}
@attribute reason { course, home, other, reputation}
@attribute guardian { father, mother, other}
@attribute traveltime { 1, 2, 3, 4}
@attribute studytime { 1, 2, 3, 4}
@attribute failures { 0, 1, 2, 3}
@attribute schoolsup { no, yes}
@attribute famsup { no, yes}
@attribute paid { no, yes}
@attribute activities { no, yes}
@attribute nursery { no, yes}
@attribute higher { no, yes}
@attribute internet { no, yes}
@attribute romantic { no, yes}
@attribute famrel { 1, 2, 3, 4, 5}
@attribute freetime { 1, 2, 3, 4, 5}
@attribute goout { 1, 2, 3, 4, 5}
@attribute Dalc { 1, 2, 3, 4, 5}
@attribute Walc { 1, 2, 3, 4, 5}
@attribute

In [7]:
# Transform the data into ARFF format. Note: the ARFF head file has to be completed by hand once, using the @attribute lines printed above.
# All inputs except arff_head are INCOMPLETE paths (prefixes): the repetition number and file extension are appended inside the function, because the transformation is done for multiple repetitions in here (this could be moved to an outer loop instead).

def create_arffs(in_csv, arff_head, temp_middle_csv, out_arff, repetitions=10, threshold=10):
    """Binarise the grade columns of every run CSV and assemble the ARFF files.

    For each ``repetition`` in ``range(repetitions)``:

    1. ``f"{in_csv}{repetition}.csv"`` is read.
    2. The columns ``G1``, ``G2`` and ``G3`` (where present) are mapped to ``0``
       (grade below ``threshold``) or ``1`` (grade at or above it); every
       missing cell becomes the ARFF missing-value marker ``?``.
    3. The result is written to ``f"{temp_middle_csv}{repetition}.csv"``.
    4. ``f"{out_arff}{repetition}.arff"`` is written as the content of
       ``arff_head`` followed by the data rows of that CSV (its column-header
       line is dropped).

    Args:
        in_csv: Path prefix of the input CSV files.
        arff_head: Complete path of the file holding the ARFF header.
        temp_middle_csv: Path prefix of the intermediate (binarised) CSV files.
        out_arff: Path prefix of the resulting ARFF files.
        repetitions: Number of runs to process (default 10).
        threshold: Lowest grade that is mapped to class ``1`` (default 10).
    """
    grade_columns = ('G1', 'G2', 'G3')

    for repetition in range(repetitions):
        source_path = f"{in_csv}{repetition}.csv"
        middle_path = f"{temp_middle_csv}{repetition}.csv"
        arff_path = f"{out_arff}{repetition}.arff"

        df = pd.read_csv(source_path, index_col=False)

        new_data = []
        for entry in df.to_dict(orient='records'):
            new_entry = {}
            for k, raw in entry.items():
                # read_csv represents empty cells as NaN (never None or ''), so
                # missing values have to be detected with pd.isna. Checking this
                # first also keeps int() from failing on a missing grade.
                if pd.isna(raw) or raw == '':
                    value = '?'
                elif k in grade_columns:
                    value = 1 if int(raw) >= threshold else 0
                else:
                    value = raw
                new_entry[k] = value
            new_data.append(new_entry)

        # Neither to_csv nor open() creates missing directories.
        for target in (middle_path, arff_path):
            Path(target).parent.mkdir(parents=True, exist_ok=True)

        # Passing the columns keeps the header line even when there are no rows.
        pd.DataFrame(new_data, columns=df.columns).to_csv(middle_path, index=False)

        with open(arff_path, 'w') as outfile:
            with open(arff_head) as head_file:
                last_line = ''
                for last_line in head_file:
                    outfile.write(last_line)
                # Keep the first data row from being glued onto the header's last line.
                if last_line and not last_line.endswith('\n'):
                    outfile.write('\n')
            with open(middle_path) as csv_file:
                next(csv_file, None)  # skip the CSV column-header line
                outfile.writelines(csv_file)

In [33]:

scenario = 'grade_inflation'

# Sample a synthetic dataset from the drifted description and store it as CSV.
generator = DataGenerator()
generator.generate_dataset_in_correlated_attribute_mode(num_tuples_to_generate, f'./data/student_performance/student_{scenario}_on_unmodified_scenario_description.json')
generator.save_synthetic_data(f"./data/student_performance/{scenario}.csv")

df = pd.read_csv(f'./data/student_performance/{scenario}.csv', index_col=False)
print(df.columns)
# Drop the saved index column and the intermediate grades G1/G2. Rows with
# missing values are removed once here instead of on every loop iteration.
df = df.drop(["Unnamed: 0", "G1", "G2"], axis=1).dropna()

in_csv = f"./data/student_performance/{scenario}/run_"
arff_head = "./data/student_performance/arff_head.arff"
tem_middle_csv = f"./data/student_performance/{scenario}_preprocessed/run_"
out_arff = f"./data/student_performance/{scenario}_preprocessed/run_"

# DataFrame.to_csv and open() do not create missing directories, so make sure
# every output location exists before anything is written.
for path_prefix in (in_csv, tem_middle_csv, out_arff):
    Path(path_prefix).parent.mkdir(parents=True, exist_ok=True)

# Ten differently shuffled copies of the same data, one per run.
for repetition in range(10):
    df = shuffle(df)
    df.to_csv(f"{in_csv}{repetition}.csv", index=False)

create_arffs(in_csv, arff_head=arff_head, temp_middle_csv=tem_middle_csv, out_arff=out_arff)

Unnamed: 0 gets processed
school gets processed
sex gets processed
age gets processed
address gets processed
famsize gets processed
Pstatus gets processed
Medu gets processed
Fedu gets processed
Mjob gets processed
Fjob gets processed
reason gets processed
guardian gets processed
traveltime gets processed
studytime gets processed
failures gets processed
schoolsup gets processed
famsup gets processed
paid gets processed
activities gets processed
nursery gets processed
higher gets processed
internet gets processed
romantic gets processed
famrel gets processed
freetime gets processed
goout gets processed
Dalc gets processed
Walc gets processed
health gets processed
absences gets processed
G1 gets processed
G2 gets processed
G3 gets processed
Index(['Unnamed: 0', 'school', 'sex', 'age', 'address', 'famsize', 'Pstatus',
       'Medu', 'Fedu', 'Mjob', 'Fjob', 'reason', 'guardian', 'traveltime',
       'studytime', 'failures', 'schoolsup', 'famsup', 'paid', 'activities',
       'nursery', 'hi

In [21]:
# Transform the data into ARFF format. Note: the ARFF head file has to be completed by hand once, using the @attribute lines printed above.
# All inputs except arff_head are INCOMPLETE paths (prefixes): the repetition number and file extension are appended inside the function, because the transformation is done for multiple repetitions in here (this could be moved to an outer loop instead).

# MODIFIED: the high class is assigned only for grades of 14 and above, instead of 10 and above!

def create_arffs_diff_class(in_csv, arff_head, temp_middle_csv, out_arff, repetitions=10, threshold=14):
    """Binarise the grade columns with the stricter cut-off and assemble the ARFF files.

    Same pipeline as ``create_arffs``, but the high class starts at a grade of
    14 by default. For each ``repetition`` in ``range(repetitions)``:

    1. ``f"{in_csv}{repetition}.csv"`` is read.
    2. The columns ``G1``, ``G2`` and ``G3`` (where present) are mapped to ``0``
       (grade below ``threshold``) or ``1`` (grade at or above it); every
       missing cell becomes the ARFF missing-value marker ``?``.
    3. The result is written to ``f"{temp_middle_csv}{repetition}.csv"``.
    4. ``f"{out_arff}{repetition}.arff"`` is written as the content of
       ``arff_head`` followed by the data rows of that CSV (its column-header
       line is dropped).

    Args:
        in_csv: Path prefix of the input CSV files.
        arff_head: Complete path of the file holding the ARFF header.
        temp_middle_csv: Path prefix of the intermediate (binarised) CSV files.
        out_arff: Path prefix of the resulting ARFF files.
        repetitions: Number of runs to process (default 10).
        threshold: Lowest grade that is mapped to class ``1`` (default 14).
    """
    grade_columns = ('G1', 'G2', 'G3')

    for repetition in range(repetitions):
        source_path = f"{in_csv}{repetition}.csv"
        middle_path = f"{temp_middle_csv}{repetition}.csv"
        arff_path = f"{out_arff}{repetition}.arff"

        df = pd.read_csv(source_path, index_col=False)

        new_data = []
        for entry in df.to_dict(orient='records'):
            new_entry = {}
            for k, raw in entry.items():
                # read_csv represents empty cells as NaN (never None or ''), so
                # missing values have to be detected with pd.isna. Checking this
                # first also keeps int() from failing on a missing grade.
                if pd.isna(raw) or raw == '':
                    value = '?'
                elif k in grade_columns:
                    value = 1 if int(raw) >= threshold else 0
                else:
                    value = raw
                new_entry[k] = value
            new_data.append(new_entry)

        # Neither to_csv nor open() creates missing directories.
        for target in (middle_path, arff_path):
            Path(target).parent.mkdir(parents=True, exist_ok=True)

        # Passing the columns keeps the header line even when there are no rows.
        pd.DataFrame(new_data, columns=df.columns).to_csv(middle_path, index=False)

        with open(arff_path, 'w') as outfile:
            with open(arff_head) as head_file:
                last_line = ''
                for last_line in head_file:
                    outfile.write(last_line)
                # Keep the first data row from being glued onto the header's last line.
                if last_line and not last_line.endswith('\n'):
                    outfile.write('\n')
            with open(middle_path) as csv_file:
                next(csv_file, None)  # skip the CSV column-header line
                outfile.writelines(csv_file)

In [22]:

# Scenario "grade category adjusted": re-binarise the unmodified runs with the
# 14+ cut-off of create_arffs_diff_class.
scenario = 'grade_category_adjusted'

arff_head = "./data/student_performance/arff_head.arff"
in_csv = "./data/student_performance/unmodified/run_"
# The intermediate CSVs and the ARFF files share one path prefix; they differ only in extension.
tem_middle_csv = f"./data/student_performance/{scenario}_preprocessed/run_"
out_arff = f"./data/student_performance/{scenario}_preprocessed/run_"

create_arffs_diff_class(
    in_csv,
    arff_head=arff_head,
    temp_middle_csv=tem_middle_csv,
    out_arff=out_arff,
)

In [None]:
def merge_datasets(pre_data_path, post_data_path, name, driftpoint, arff_head, repetitions=10):
    """Splice pre-drift and post-drift runs at ``driftpoint`` and write CSV + ARFF.

    For each run ``r`` in ``range(repetitions)`` the first ``driftpoint`` rows
    of ``{pre_data_path}/run_{r}.csv`` are concatenated with the rows from
    position ``driftpoint`` onwards of ``{post_data_path}/run_{r}.csv``. The
    result is written to ``{name}/run_{r}.csv`` and, with the content of
    ``arff_head`` prepended and the CSV column-header line dropped, to
    ``{name}/run_{r}.arff``.

    Args:
        pre_data_path: Directory holding the pre-drift ``run_<r>.csv`` files.
        post_data_path: Directory holding the post-drift ``run_<r>.csv`` files.
        name: Output directory; created if it does not exist yet.
        driftpoint: Row position at which the data switches from pre to post.
        arff_head: Complete path of the file holding the ARFF header.
        repetitions: Number of runs to merge (default 10).
    """
    # Neither to_csv nor open() creates missing directories.
    Path(name).mkdir(parents=True, exist_ok=True)

    # A dedicated loop variable is used here; the line counters of the file
    # concatenation must not reuse (and thereby clobber) the run index.
    for run in range(repetitions):
        pre_drift = pd.read_csv(f"{pre_data_path}/run_{run}.csv")
        post_drift = pd.read_csv(f"{post_data_path}/run_{run}.csv")

        pre_df = pre_drift.iloc[:driftpoint, :]
        post_df = post_drift.iloc[driftpoint:, :]

        merged_csv = f"{name}/run_{run}.csv"
        pd.concat([pre_df, post_df]).to_csv(merged_csv, index=False)

        with open(f"{name}/run_{run}.arff", 'w') as outfile:
            with open(arff_head) as head_file:
                last_line = ''
                for last_line in head_file:
                    outfile.write(last_line)
                # Keep the first data row from being glued onto the header's last line.
                if last_line and not last_line.endswith('\n'):
                    outfile.write('\n')
            with open(merged_csv) as csv_file:
                next(csv_file, None)  # skip the CSV column-header line
                outfile.writelines(csv_file)



In [32]:
# Build the "unmodified -> grade inflation" streams: rows before the drift point come
# from the unmodified runs, the remaining rows from the preprocessed scenario runs.
scenario = 'grade_inflation'

arff_head = "./data/student_performance/arff_head.arff"

pre = "./data/student_performance/unmodified"
post = f"./data/student_performance/{scenario}_preprocessed"

driftpoint = 2000
name = f"./data/student_performance/unmodified_to_{scenario}_{driftpoint}"

# Path.mkdir is an instance method, so it has to be called on a Path object;
# calling it on the class with a string as `self` only works by accident on some
# Python versions. exist_ok=True makes re-running the cell safe, so errors no
# longer need to be swallowed: a real failure (e.g. missing permissions) should
# surface here rather than later inside merge_datasets.
Path(name).mkdir(parents=True, exist_ok=True)

merge_datasets(pre_data_path=pre, post_data_path=post, name=name, driftpoint=driftpoint, arff_head=arff_head)