In [13]:
import os
import json
import numpy as np
import pandas as pd

# Column-type tags. Each column's metadata record stores one of these under
# its 'type' key, and the validation/encoding helpers branch on it.
CATEGORICAL = "categorical"
CONTINUOUS = "continuous"
ORDINAL = "ordinal"

In [18]:
!wget https://archive.ics.uci.edu/static/public/2/adult.zip
!unzip adult.zip 

--2023-06-28 13:23:48--  https://archive.ics.uci.edu/static/public/2/adult.zip
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified
Saving to: ‘adult.zip.1’

adult.zip.1             [   <=>              ] 605.70K   800KB/s    in 0.8s    

2023-06-28 13:23:50 (800 KB/s) - ‘adult.zip.1’ saved [620237]

Archive:  adult.zip
  inflating: Index                   
  inflating: adult.data              
  inflating: adult.names             
  inflating: adult.test              
  inflating: old.adult.names         


In [19]:

def verify_table(table, meta):
    """Assert that every column of ``table`` respects the bounds declared in ``meta``.

    Args:
        table: 2-D array, indexed as ``table[:, column]``.
        meta: Mapping whose ``'columns'`` list holds one record per column.
            A continuous record supplies ``'min'`` and ``'max'``; any other
            record supplies ``'size'``.

    Raises:
        AssertionError: If a continuous value lies outside ``[min, max]``, or
            a non-continuous value (after an int32 cast) lies outside
            ``[0, size)``.
    """
    for col_idx, spec in enumerate(meta['columns']):
        is_continuous = spec['type'] == CONTINUOUS
        column = table[:, col_idx]

        if is_continuous:
            assert np.all(spec['min'] <= column)
            assert np.all(column <= spec['max'])
            continue

        # Everything that is not continuous is stored as an integer code.
        codes = column.astype('int32')
        assert np.all(codes >= 0)
        assert np.all(codes < spec['size'])

def verify(datafile, metafile):
    """Validate a metadata JSON file and the ``.npz`` dataset it describes.

    The metadata must contain a ``'columns'`` list. Every record needs a
    ``'name'`` (string or ``None``) and a ``'type'`` that is one of the three
    column-type constants. Continuous records need ``'min'`` and ``'max'``;
    all other records need ``'size'`` and ``'i2s'``, where ``'i2s'`` is a list
    of ``size`` distinct strings. The ``'train'`` and ``'test'`` arrays of the
    dataset are then checked against the metadata with ``verify_table``.

    Args:
        datafile: Path to an ``.npz`` archive holding ``'train'`` and
            ``'test'`` arrays.
        metafile: Path to the JSON metadata file.

    Raises:
        AssertionError: If the metadata is malformed or the data violates it.
    """
    with open(metafile) as f:
        meta = json.load(f)

    for item in meta['columns']:
        assert 'name' in item
        assert item['name'] is None or isinstance(item['name'], str)

        assert 'type' in item
        assert item['type'] in [CATEGORICAL, CONTINUOUS, ORDINAL]

        if item['type'] == CONTINUOUS:
            assert 'min' in item and 'max' in item
        else:
            assert 'size' in item and 'i2s' in item
            labels = item['i2s']
            assert item['size'] == len(labels)
            # Check every label's type before hashing them, so a non-string
            # entry is reported as an AssertionError rather than a TypeError
            # from set().
            assert all(isinstance(label, str) for label in labels)
            # Uniqueness is a property of the whole list: test it once, not
            # once per label.
            assert len(set(labels)) == item['size']

    # np.load on an .npz keeps the archive open until it is closed; the
    # context manager releases it after both splits have been read.
    with np.load(datafile) as data:
        verify_table(data['train'], meta)
        verify_table(data['test'], meta)

In [20]:

def project_table(data, meta):
    """Encode a DataFrame of raw values as a float32 matrix.

    Continuous columns are cast to float32. Every other column is replaced by
    the position of each value in that column's ``'i2s'`` list.

    Args:
        data: DataFrame accessed positionally (``data.iloc[:, i]``); column
            ``i`` is described by ``meta[i]``.
        meta: Sequence of column records. Each has a ``'type'``; records that
            are not continuous also have an ``'i2s'`` list of labels.

    Returns:
        A float32 array with the same shape as ``data``.

    Raises:
        KeyError: If a non-continuous column contains a value that is not in
            its ``'i2s'`` list.
    """
    values = np.zeros(shape=data.shape, dtype='float32')

    for col_idx, info in enumerate(meta):
        column = data.iloc[:, col_idx]
        if info['type'] == CONTINUOUS:
            values[:, col_idx] = column.values.astype('float32')
        else:
            code_of = {label: code for code, label in enumerate(info['i2s'])}
            # Plain dict indexing on purpose: an unknown label must raise
            # KeyError rather than silently turn into NaN.
            values[:, col_idx] = [code_of[label] for label in column]
    return values

In [22]:

# Both files are read without a header row and with every field kept as a
# string, so nothing is type-coerced before the cleaning step.
csv_options = {"dtype": 'str', "delimiter": ',', "header": None}
train = pd.read_csv("adult.data", **csv_options)
test = pd.read_csv("adult.test", **csv_options)

# Stack the two files row-wise into a single frame.
data = pd.concat([train, test], axis=0)

In [23]:
# Preview the first rows of the combined frame.
data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [24]:
# (name, type) for each of the 15 positional columns, in file order.
# A dict literal keeps insertion order, so .items() yields the pairs in the
# order they are written here.
col_type = list({
    'Age': CONTINUOUS,
    'workclass': CATEGORICAL,
    'fnlwgt': CONTINUOUS,
    'education': CATEGORICAL,
    'education-num': CONTINUOUS,
    'marital-status': CATEGORICAL,
    'occupation': CATEGORICAL,
    'relationship': CATEGORICAL,
    'race': CATEGORICAL,
    'sex': CATEGORICAL,
    'capital-gain': CONTINUOUS,
    'capital-loss': CONTINUOUS,
    'hours-per-week': CONTINUOUS,
    'native-country': CATEGORICAL,
    'label': CATEGORICAL,
}.items())

In [25]:
# Drop every row that carries the ' ?' marker in any column, then rewrite the
# ' >50K.' / ' <=50K.' spellings to their dot-less form wherever they occur.
has_marker = (data == ' ?').any(axis=1).values
data = (
    data[~has_marker]
    .replace(' >50K.', ' >50K')
    .replace(' <=50K.', ' <=50K')
)

In [26]:

# Build one metadata record per column: the observed value range for
# continuous columns, the label vocabulary for everything else.
meta = []
for position, spec in enumerate(col_type):
    col_name, col_kind = spec[0], spec[1]
    series = data.iloc[:, position]

    if col_kind == CONTINUOUS:
        numeric = series.values.astype('float')
        record = {
            "name": col_name,
            "type": col_kind,
            "min": np.min(numeric),
            "max": np.max(numeric),
        }
    else:
        if col_kind == CATEGORICAL:
            # Most frequent label first; the stable sort keeps tied labels in
            # the order value_counts() returned them.
            counts = series.value_counts()
            ranked = sorted(counts.items(), key=lambda pair: pair[1], reverse=True)
            labels = [label for label, _count in ranked]
        else:
            # Any other type must supply its label list as a third element.
            labels = spec[2]

        record = {
            "name": col_name,
            "type": col_kind,
            "size": len(labels),
            "i2s": labels,
        }

    meta.append(record)


In [27]:
# Inspect the metadata record built for the first column.
meta[0]

{'name': 'Age', 'type': 'continuous', 'min': 17.0, 'max': 90.0}

In [28]:
# Encode the cleaned frame as a float32 matrix: continuous values are cast,
# all other values become their index in the column's 'i2s' list.
tdata = project_table(data, meta)

config = {
    'columns': meta,
    'problem_type': 'binary_classification'
}

# Fixed seed so the in-place row shuffle, and therefore the split, is
# reproducible.
np.random.seed(0)
np.random.shuffle(tdata)

# Hold out the last TEST_FRACTION of the shuffled rows as the test split.
# The split point is computed as an explicit index: slicing with [:-n] and
# [-n:] silently inverts the split when n == 0 (empty train, everything in
# test).
TEST_FRACTION = 0.2
n_test = int(tdata.shape[0] * TEST_FRACTION)
n_train = tdata.shape[0] - n_test
t_train = tdata[:n_train]
t_test = tdata[n_train:]

os.makedirs("data", exist_ok=True)

with open("data/adult.json", 'w') as f:
    json.dump(config, f, sort_keys=True, indent=4, separators=(',', ': '))
np.savez("data/adult.npz", train=t_train, test=t_test)

# Re-read both files from disk and check them against each other.
verify("data/adult.npz", "data/adult.json")
