-
Notifications
You must be signed in to change notification settings - Fork 6
/
prepare_dataset.py
112 lines (93 loc) · 3.23 KB
/
prepare_dataset.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
# Generate adult dataset
import os
import json
import numpy as np
import pandas as pd
from prepare_dataset_utils import CATEGORICAL, CONTINUOUS, ORDINAL, verify
# Directory that main() creates and writes the generated .json/.npz files into.
output_dir = 'data'
# Dataset name; main() uses it as the stem of the output file names.
name = "adult"
def project_table(data, meta):
    """Encode a DataFrame as a float32 matrix according to column metadata.

    Columns are addressed by position, so ``meta[i]`` describes column ``i``
    of ``data``.

    Args:
        data: pandas DataFrame holding the raw values.
        meta: sequence of dicts, one per column. Each dict has a ``'type'``
            key; every entry whose type is not ``CONTINUOUS`` must also have
            an ``'i2s'`` list, and a value is encoded as its index in that
            list.

    Returns:
        A ``float32`` numpy array with the same shape as ``data``.

    Raises:
        KeyError: if a non-continuous column contains a value that is not
            listed in its ``'i2s'``.
    """
    values = np.zeros(shape=data.shape, dtype='float32')
    for col_idx, info in enumerate(meta):
        column = data.iloc[:, col_idx]
        if info['type'] == CONTINUOUS:
            values[:, col_idx] = column.values.astype('float32')
        else:
            # Invert index -> value into value -> index for O(1) lookups.
            codes = {item: code for code, item in enumerate(info['i2s'])}
            # Plain indexing (not .get) so unknown values fail loudly.
            values[:, col_idx] = column.apply(lambda x: codes[x]).values
    return values
def main():
    """Build the adult dataset files from the raw CSV.

    Steps:
      1. Read ``utils/raw/binary/<name>.csv`` with no header row and every
         cell as a string.
      2. Drop every row that has the marker ``' ?'`` in any described column
         and strip the trailing dot from the two label spellings.
      3. Derive per-column metadata: min/max for continuous columns, and the
         value list ordered by descending frequency for categorical ones.
      4. Encode the table with ``project_table``, shuffle it with a fixed
         seed, and keep the last 20% of the rows as the test split.
      5. Write ``<output_dir>/<name>.json`` (metadata) and
         ``<output_dir>/<name>.npz`` (``train`` / ``test`` arrays), then pass
         both paths to ``verify``.
    """
    # exist_ok tolerates a pre-existing directory while still surfacing real
    # failures (e.g. permission errors) instead of hiding them.
    os.makedirs(output_dir, exist_ok=True)

    df = pd.read_csv("utils/raw/binary/{}.csv".format(name),
                     dtype='str', delimiter=',', header=None)
    print(df.shape)

    col_type = [
        ('Age', CONTINUOUS),
        ('workclass', CATEGORICAL),
        ('fnlwgt', CONTINUOUS),
        ('education', CATEGORICAL),
        ('education-num', CONTINUOUS),
        ('marital-status', CATEGORICAL),
        ('occupation', CATEGORICAL),
        ('relationship', CATEGORICAL),
        ('race', CATEGORICAL),
        ('sex', CATEGORICAL),
        ('capital-gain', CONTINUOUS),
        ('capital-loss', CONTINUOUS),
        ('hours-per-week', CONTINUOUS),
        ('native-country', CATEGORICAL),
        ('label', CATEGORICAL),
    ]
    print(len(col_type))

    # Keep only rows where none of the described columns holds ' ?'.
    keep = (df.iloc[:, :len(col_type)].values != ' ?').all(axis=1)
    df = df[keep]

    # Some rows spell the label with a trailing dot; fold both spellings.
    df = df.replace(' >50K.', ' >50K')
    df = df.replace(' <=50K.', ' <=50K')
    print(df.shape)

    meta = []
    for col_idx, info in enumerate(col_type):
        col_name, col_kind = info[0], info[1]
        column = df.iloc[:, col_idx]
        if col_kind == CONTINUOUS:
            numeric = column.values.astype('float')
            meta.append({
                "name": col_name,
                "type": col_kind,
                # Plain floats keep the metadata JSON-serialisable.
                "min": float(np.min(numeric)),
                "max": float(np.max(numeric)),
            })
            continue

        if col_kind == CATEGORICAL:
            # value_counts() already orders values by descending frequency.
            mapper = column.value_counts().index.tolist()
        else:
            # Any other type must carry its value list as a third element.
            mapper = info[2]
        meta.append({
            "name": col_name,
            "type": col_kind,
            "size": len(mapper),
            "i2s": mapper,
        })

    tdata = project_table(df, meta)

    config = {
        'columns': meta,
        'problem_type': 'binary_classification',
    }

    # Fixed seed so the train/test split is reproducible across runs.
    np.random.seed(0)
    np.random.shuffle(tdata)

    # Use an explicit split index: slicing with -0 would leave the training
    # set empty and put every row in the test set for very small tables.
    n_test = int(tdata.shape[0] * 0.2)
    split = tdata.shape[0] - n_test
    t_train = tdata[:split]
    t_test = tdata[split:]

    json_path = "{}/{}.json".format(output_dir, name)
    npz_path = "{}/{}.npz".format(output_dir, name)

    with open(json_path, 'w') as f:
        json.dump(config, f, sort_keys=True, indent=4, separators=(',', ': '))
    np.savez(npz_path, train=t_train, test=t_test)

    verify(npz_path, json_path)
# Runs the whole pipeline at module load; there is no __main__ guard, so
# importing this module also triggers it.
main()