### Skeleton Code for Semi-Synthetic datasets in OpenML 
### Example: https://www.openml.org/d/21

In [8]:
import pickle
import gzip
import numpy as np
import scipy as sp
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
import pandas as pd

In [9]:
# Summary of Dataset Information
# One record per OpenML dataset, in the order the datasets are processed:
#   (dataset ID, sensitive-attribute column, label column, CSV path below the download prefix)
_CSV_PREFIX = "https://www.openml.org/data/get_csv/"
_DATASET_RECORDS = [
    (21, 'buying', 'class', "21/dataset_21_car.arff"),
    (23, 'Wifes_education', 'Contraceptive_method_used', "23/dataset_23_cmc.arff"),
    (26, 'parents', 'class', "26/dataset_26_nursery.arff"),
    (31, 'checking_status', 'class', "31/dataset_31_credit-g.arff"),
    (50, 'top-left-square', 'Class', "50/dataset_50_tic-tac-toe.arff"),
    (151, 'day', 'class', "2419/electricity-normalized.arff"),
    (155, 's1', 'class', "2423/pokerhand-normalized.arff"),
    (183, 'Sex', 'Class_number_of_rings', "3620/dataset_187_abalone.arff"),
    (184, 'white_king_row', 'game', "3621/dataset_188_kropt.arff"),
    (292, 'Y', 'X1', "49822/australian.arff"),
    (333, 'class', 'attr1', "52236/phpAyyBys"),
    (334, 'class', 'attr1', "52237/php4fATLZ"),
    (335, 'class', 'attr1', "52238/phphZierv"),
    (351, 'Y', 'X1', "52254/php89ntbG"),
    (354, 'Y', 'X1', "52257/phpQfR7GF"),
    (375, 'speaker', 'utterance', "52415/JapaneseVowels.arff"),
    (469, 'DMFT.Begin', 'Prevention', "52581/analcatdata_dmft.arff"),
    (475, 'Time_of_survey', 'Political_system', "52587/analcatdata_germangss.arff"),
    (679, 'sleep_state', 'temperature', "52979/rmftsa_sleepdata.arff"),
    (720, 'Sex', 'binaryClass', "53254/abalone.arff"),
    (741, 'sleep_state', 'binaryClass', "53275/rmftsa_sleepdata.arff"),
    (825, 'RAD', 'binaryClass', "53359/boston_corrected.arff"),
    (826, 'Occasion', 'binaryClass', "53360/sensory.arff"),
    (872, 'RAD', 'binaryClass', "53406/boston.arff"),
    (881, 'x3', 'binaryClass', "53415/mv.arff"),
    (915, 'SMOKSTAT', 'binaryClass', "53449/plasma_retinol.arff"),
    (923, 'isns', 'binaryClass', "53457/visualizing_soil.arff"),
    (934, 'family_structure', 'binaryClass', "53468/socmob.arff"),
    (959, 'parents', 'binaryClass', "53493/nursery.arff"),
    (983, 'Wifes_education', 'binaryClass', "53517/cmc.arff"),
    (991, 'buying', 'binaryClass', "53525/car.arff"),
    (1014, 'DMFT.Begin', 'binaryClass', "53548/analcatdata_dmft.arff"),
    (1169, 'Airline', 'Delay', "66526/phpvcoG8S"),
    (1216, 'click', 'impression', "183030/phppCF8Zy"),
    (1217, 'click', 'impression', "183039/phpLV1N3m"),
    (1218, 'click', 'impression', "183150/phpqZOQcc"),
    (1235, 'elevel', 'class', "520800/Agrawal1.arff"),
    (1236, 'size', 'class', "520801/Stagger1.arff"),
    (1237, 'size', 'class', "520802/Stagger2.arff"),
    (1470, 'V2', 'Class', "1586239/phpce61nO"),
    (1481, 'V3', 'Class', "1590570/php7zhUPY"),
    (1483, 'V1', 'Class', "1590940/phpH4DHsK"),
    (1498, 'V5', 'Class', "1592290/phpgNaXZe"),
    (1557, 'V1', 'Class', "1593753/phpfUae7X"),
    (1568, 'V1', 'Class', "1675984/phpfrJpBS"),
    (4135, 'RESOURCE', 'target', "1681098/phpmPOD5A"),
    (4552, 'V1', 'V17', "1798821/php0mZlkF"),
]

# Column-oriented view of the records: four parallel lists, index-aligned,
# with the download prefix prepended to every CSV path.
data_info = {
    'ID': [record[0] for record in _DATASET_RECORDS],
    "sensitive attribute": [record[1] for record in _DATASET_RECORDS],
    "label": [record[2] for record in _DATASET_RECORDS],
    "link": [_CSV_PREFIX + record[3] for record in _DATASET_RECORDS],
}

In [10]:
def preprocessing(df, S_name, Y_name):
    """
    Binarize categorical columns and split the dataset by sensitive attribute.

    Every object/string-typed column, plus the sensitive attribute and the
    label regardless of their dtype, is mapped to {0, 1}: 1 where the entry
    equals the column's most frequent non-missing value, 0 everywhere else
    (missing entries included). All other columns are left untouched.

    Note that an attribute already coded as 0/1 is re-coded too, so its
    majority value becomes 1 even if it was 0 in the raw data.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw dataset. It is copied; the caller's frame is not modified.
    S_name : str
        Column name of the sensitive attribute.
    Y_name : str
        Column name of the label.

    Returns
    -------
    X_0, X_1, Y_0, Y_1 : numpy.ndarray
        X_0 / X_1 hold the features (all columns except S_name and Y_name)
        of the rows whose binarized sensitive attribute is 0 / 1;
        Y_0 / Y_1 hold the binarized labels of the same rows.

    Raises
    ------
    KeyError
        If S_name or Y_name is not a column of df.
    """
    # Fail before doing any work if a requested column is absent.
    absent = [name for name in (S_name, Y_name) if name not in df.columns]
    if absent:
        raise KeyError("column(s) not found in dataframe: {}".format(absent))

    df_new = df.copy()

    for feat in df_new.columns.values.tolist():
        column = df_new[feat]
        # Text columns may be stored as ``object`` or as one of pandas'
        # dedicated string dtypes; both count as categorical here.
        is_categorical = (pd.api.types.is_object_dtype(column)
                          or pd.api.types.is_string_dtype(column))
        if not (is_categorical or feat == S_name or feat == Y_name):
            continue

        # value_counts() sorts by frequency and ignores missing entries,
        # so index[0] is the most frequent observed value.
        feat_count = column.value_counts()
        if len(feat_count) == 0:
            # Column holds only missing values: nothing can be the majority.
            df_new[feat] = 0
        else:
            # One membership test instead of replacing every other value by
            # NaN; isin() always yields a plain boolean mask (missing -> False).
            df_new[feat] = column.isin([feat_count.index[0]]).astype(int)

    # split dataset based upon the (now binary) sensitive attribute
    df_0 = df_new[df_new[S_name] == 0]
    df_1 = df_new[df_new[S_name] == 1]

    # get label
    Y_0 = df_0[Y_name].values
    Y_1 = df_1[Y_name].values

    # get features
    X_0 = df_0.drop([S_name, Y_name], axis=1).values
    X_1 = df_1.drop([S_name, Y_name], axis=1).values

    return X_0, X_1, Y_0, Y_1

In [11]:
# One row per dataset: ID, sensitive attribute, label and download link.
datasets = pd.DataFrame(data=data_info)
num_datasets = len(datasets.index)

# Parsed arrays per dataset, keyed by the dataset's position in data_info
# (0 .. num_datasets - 1), not by its OpenML ID.
dataset_dict = {}

rows = zip(datasets["link"], datasets["ID"],
           datasets["sensitive attribute"], datasets["label"])
for i, (url, ID, S_name, Y_name) in enumerate(tqdm(rows, total=num_datasets)):
    # fetch and parse the CSV export of this dataset
    df = pd.read_csv(url)

    # binarize and split into the two sensitive-attribute groups
    X0, X1, Y0, Y1 = preprocessing(df, S_name, Y_name)

    data = dict(X0=X0, X1=X1, Y0=Y0, Y1=Y1)
    dataset_dict[i] = data

100%|██████████| 47/47 [04:00<00:00,  2.50s/it]


In [14]:
# Persist the parsed datasets to disk.
pickle_save_file = 'datasets_parsed.pickle'
# The context manager closes the file even if pickling raises part-way.
# Pickle protocol 2 is kept unchanged so existing readers of the file still work.
with open(pickle_save_file, 'wb') as f:
    pickle.dump(dataset_dict, f, 2)

In [15]:
# Spot-check the summary entries at positions 26 and 37:
# print dataset ID, sensitive attribute and label on one line each.
for i in (26, 37):
    fields = [data_info[key][i] for key in ('ID', "sensitive attribute", "label")]
    print(*fields)

923 isns binaryClass
1236 size class
