In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import h5py
import os
from  preprocessor import Preprocessor

In [None]:
def save_data_set(x, y, data_type, path, s=''):
    """Write a feature array and its labels to two HDF5 files under ``path``.

    Creates ``x_{data_type}{s}.h5`` holding a dataset named ``x_{data_type}``
    and ``y_{data_type}{s}.h5`` holding a dataset named ``y_{data_type}``.
    Both files are opened in mode ``'w'``, so existing files with the same
    names are overwritten.

    Args:
        x: Data stored in the ``x_{data_type}`` dataset; passed straight to
            ``create_dataset(data=...)``.
        y: Data stored in the ``y_{data_type}`` dataset; passed straight to
            ``create_dataset(data=...)``.
        data_type: Tag used in both the file names and the dataset names.
        path: Output directory; created (including parents) when missing.
        s: Optional suffix appended to the file names only, not to the
            dataset names.
    """
    # exist_ok avoids the race between a separate existence check and creation.
    os.makedirs(path, exist_ok=True)

    # Context managers guarantee each file is closed even if create_dataset raises.
    with h5py.File(os.path.join(path, f'x_{data_type}{s}.h5'), 'w') as xf:
        xf.create_dataset(f'x_{data_type}', data=x)

    with h5py.File(os.path.join(path, f'y_{data_type}{s}.h5'), 'w') as yf:
        yf.create_dataset(f'y_{data_type}', data=y)

# Column names for ./datasets/adult.data, which is read without a header row.
# The final entry, 'label', is the prediction target.
column_names = [
    'age', 'workclass', 'fnlwgt', 'education', 'education-num',
    'marital-status', 'occupation', 'relationship', 'race', 'sex',
    'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
    'label',
]

# header=None + names=...: every line of the file is treated as data.
df = pd.read_csv(
    "./datasets/adult.data",
    names=column_names,
    header=None,
    index_col=False,
    engine='python',
)

df.head()

# Features: every column except the target.
X = df.drop(columns=['label'])
# Target: strip surrounding whitespace from the raw label text, then encode
# '<=50K' as 0 and '>50K' as 1.
y = (
    df['label']
    .str.strip()
    .map({'<=50K': 0, '>50K': 1})
    .astype('int')
)
X.head()

# First split: hold out 20% of the rows, stratified on the label.
# Here x_train / x_test are pandas DataFrames and y_train / y_test are
# pandas Series.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=5,
    stratify=y,
)

# Fit the preprocessor on the training features only, then apply the fitted
# transform to the held-out features.
# NOTE(review): both calls are expected to return NumPy ndarrays; Preprocessor
# is defined outside this notebook, so confirm against its implementation.
prep = Preprocessor()
x_train = prep.fit_transform(x_train)
x_test = prep.transform(x_test)

# Second split: move 4096 of the held-out rows into a validation set; the rest
# stays as the test set. x_test / x_val follow the type produced by the
# preprocessor, while y_test / y_val remain pandas Series.
x_test, x_val, y_test, y_val = train_test_split(
    x_test,
    y_test,
    test_size=4096,
    random_state=5,
    stratify=y_test,
)

# Export the first 15 test rows as stand-alone samples: a text dump of the raw
# record (sampleN.raw) plus HDF5 files with its preprocessed features and label.
# .iloc makes the positional slice explicit. The loop below relies on the
# resulting index values being row labels of df.
test_idx = y_test.iloc[0:15].index
path = "samples//"
# The .raw file is written before save_data_set runs, so the directory must
# exist here; exist_ok avoids a separate check-then-create race.
os.makedirs(path, exist_ok=True)

for i, sample_label in enumerate(test_idx):
    # Fetch the raw row once per sample rather than once per column.
    raw_row = df.loc[sample_label]
    out = f"Sample_Index: {sample_label}\n" + "".join(
        f"{col} : {raw_row[col]!s}\n" for col in df.columns
    )
    fname = os.path.join(path, f"sample{i}.raw")
    with open(fname, 'w') as f:
        f.write(out)

    # x_test is indexed by position, matching the position of the label in y_test.
    features = x_test[i]
    save_data_set(
        # Store the single feature vector as a 1 x n_features array.
        np.reshape(features, [1, features.shape[0]]),
        y_test.to_numpy()[i],
        data_type=f'sample{i}',
        path=path,
    )
