In [1]:
# h5py 안 될 때 (M1 Mac)
#!brew reinstall hdf5
#!export CPATH="/opt/homebrew/include/"
#!export HDF5_DIR=/opt/homebrew/
#!python3 -m pip install h5py

In [1]:
import os
import csv
import pickle

import numpy as np
import h5py

In [2]:
path_root = '../'
path_container = './Container/'

In [3]:
path_train = os.path.join(path_root, "train.csv")
path_valid_class_que = os.path.join(path_root, "validation_classification_question.csv")
path_valid_class_ans = os.path.join(path_root, "validation_classification_answer.csv")
path_valid_compl_que = os.path.join(path_root, "validation_completion_question.csv")
path_valid_compl_ans = os.path.join(path_root, "validation_completion_answer.csv")
path_test_class_que = os.path.join(path_root, "test_classification_question.csv")
path_test_compl_que = os.path.join(path_root, "test_completion_question.csv")
path_ingredient_name = os.path.join(path_root, "node_ingredient.txt")

### Data 읽어서 list로 일단 저장

In [4]:
def read_train_data(path):
    """ Read train.csv and Return lists of data[int] / label[str]. """
    data = []
    labels = []
    with open(path, 'r') as f:
        for line in csv.reader(f):
            recipe = sorted(map(int, line[:-1]))  # a sorted list of recipe (integer)
            cuisine = line[-1]                    # which country? (string)
            data.append(recipe)
            labels.append(cuisine)
    return data, labels


def read_classification_data(question_path, answer_path=None):
    """ Read valid/test data for classification. Then return lists of data[int] / label[str]. """
    data = []
    labels = [] if answer_path is not None else None
    with open(question_path, 'r') as f:
        for line in csv.reader(f):
            recipe = sorted(map(int, line))  # a sorted list of recipe (integer)
            data.append(recipe)
    if answer_path is not None:
        with open(answer_path, 'r') as f:
            for line in csv.reader(f):
                cuisine = line[0]            # which country? (string)
                labels.append(cuisine)
    return data, labels


def read_completion_data(question_path, answer_path=None):
    """ Read valid/test data for completion. Then return lists of data[int] / label[str]. """
    data = []
    labels = [] if answer_path is not None else None
    with open(question_path, 'r') as f:
        for line in csv.reader(f):
            recipe = sorted(map(int, line))                # recipe without an ingredient
            data.append(recipe)
    if answer_path is not None:
        with open(answer_path, 'r') as f:
            for line in csv.reader(f):
                recipe = set(map(int, line))               # original recipe
                missing = recipe - set(data[len(labels)])  # missing ingredient in data
                labels.append(list(missing)[0])
    return data, labels


def read_ingredient_names(ingredient_path):
    ingredients_names = []
    with open(ingredient_path, 'r') as f:
        for line in csv.reader(f):
            ingredients_names.append(line[0])
    return ingredients_names

In [42]:
data_train, labels_train = read_train_data(path_train)
data_valid_class, labels_valid_class = read_classification_data(path_valid_class_que, path_valid_class_ans)
data_valid_compl, labels_valid_compl = read_completion_data(path_valid_compl_que, path_valid_compl_ans)
data_test_class, _ = read_classification_data(path_test_class_que, None)
data_test_compl, _ = read_completion_data(path_test_compl_que, None)
ingredient_names = read_ingredient_names(path_ingredient_name)  # 재료 이름 (string)
cuisine_names = sorted(set(labels_train+labels_valid_class))    # Cuisine 이름 (string), 알파벳 순

### List (```ingredient_names```, ```cuisine_names```) 를 Dictionary로 변환

In [43]:
# Dict: id (int, 0~19) -> cuisine name (str, 알파벳 순)
id_cuisine_dict = dict(zip(range(len(cuisine_names)), cuisine_names))
print(id_cuisine_dict)

{0: 'brazilian', 1: 'british', 2: 'cajun_creole', 3: 'chinese', 4: 'filipino', 5: 'french', 6: 'greek', 7: 'indian', 8: 'irish', 9: 'italian', 10: 'jamaican', 11: 'japanese', 12: 'korean', 13: 'mexican', 14: 'moroccan', 15: 'russian', 16: 'southern_us', 17: 'spanish', 18: 'thai', 19: 'vietnamese'}


In [44]:
# Dict: cuisine name (str) -> id (int)
cuisine_id_dict = {b: a for a, b in id_cuisine_dict.items()}
print(cuisine_id_dict)

{'brazilian': 0, 'british': 1, 'cajun_creole': 2, 'chinese': 3, 'filipino': 4, 'french': 5, 'greek': 6, 'indian': 7, 'irish': 8, 'italian': 9, 'jamaican': 10, 'japanese': 11, 'korean': 12, 'mexican': 13, 'moroccan': 14, 'russian': 15, 'southern_us': 16, 'spanish': 17, 'thai': 18, 'vietnamese': 19}


In [45]:
# Dict: id (int, 0~6713) -> ingredient name (str, node_ingredient.txt 기준)
id_ingredient_dict = dict(zip(range(len(ingredient_names)), ingredient_names))
print(len(id_ingredient_dict.items()))
print(dict(list(id_ingredient_dict.items())[:20]), '...')  # 20개만 출력

6714
{0: 'coca-cola', 1: 'vegan butter', 2: 'sourdough rolls', 3: 'reduced sodium refried beans', 4: 'ramen noodles', 5: 'crumbled corn bread', 6: 'japanese breadcrumbs', 7: 'toasted shredded coconut', 8: 'chinese spinach', 9: "Hellmann's® Real Mayonnaise", 10: 'ducklings', 11: 'basil olive oil', 12: 'white baking bar', 13: 'rye whiskey', 14: 'mushroom broth', 15: 'meat loaf mix', 16: 'cocktail sauce', 17: 'asparagus spears', 18: 'nonfat greek yogurt', 19: 'cabernet sauvignon'} ...


In [46]:
# Dict: ingredient name (str) -> id (int, 0~6713)
ingredient_id_dict = dict()
duplicate_ingredient_names = set()
for i, (a, b) in enumerate(id_ingredient_dict.items()):
    if b in ingredient_id_dict:
        duplicate_ingredient_names.add(b)
        if type(ingredient_id_dict[b]) == int:
            ingredient_id_dict[b] = [ingredient_id_dict[b], a]
            #ingredient_id_dict[b] = [ingredient_id_dict[b], a]
        else:
            ingredient_id_dict[b] += [a]
            #ingredient_id_dict[b] = tuple(list(ingredient_id_dict[b]) + [a])
    else:
        ingredient_id_dict[b] = a
print(f'@@@ Duplicate ingredients: ({len(duplicate_ingredient_names)}개) @@@')
for name in sorted(duplicate_ingredient_names):
    print(f"{str(ingredient_id_dict[name]):>17} {name}")
print()
print('number of non-duplicate ingredient names:', len(ingredient_id_dict))
print(dict(list(ingredient_id_dict.items())[:20]), '...')  # 20개만 출력

@@@ Duplicate ingredients: (19개) @@@
      [917, 3335] (10 oz.) frozen chopped spinach
      [585, 4634] bacon
[698, 1039, 4315] bread
     [3297, 5457] clams
     [5368, 5932] cream cheese
      [984, 4593] egg noodles
     [3403, 4936] frozen chopped spinach
     [3700, 5810] frozen lemonade concentrate
       [83, 3076] frozen orange juice concentrate
     [4999, 5908] green bell pepper
     [4047, 4091] lasagna noodles
     [2353, 4243] linguine
     [2882, 3311] mussels
      [343, 2798] pork chops
     [4945, 5095] red bell pepper
     [2938, 3099] spaghetti
     [1063, 4921] tortellini
     [5656, 6442] tuna
     [1660, 6510] water chestnuts

number of non-duplicate ingredient names: 6694
{'coca-cola': 0, 'vegan butter': 1, 'sourdough rolls': 2, 'reduced sodium refried beans': 3, 'ramen noodles': 4, 'crumbled corn bread': 5, 'japanese breadcrumbs': 6, 'toasted shredded coconut': 7, 'chinese spinach': 8, "Hellmann's® Real Mayonnaise": 9, 'ducklings': 10, 'basil olive oil': 11, 'w

### Data, Label lists를 binary np.array로 변환

In [73]:
def data_to_binary_array(data, dim):
    """ convert data(list of lists) into a 2D binary array. (for dataset, row = recipe) """
    """ dim (int) : dimension of each row (of 'enc') that must be. """
    enc = np.zeros((len(data), dim), dtype=int) 
    for i in range(len(data)):
        recipe = data[i]
        enc[i][recipe] = 1
    return enc

num_ingredients = len(ingredient_names)
num_cuisines = len(cuisine_names)

In [48]:
# Data
bin_data_train = data_to_binary_array(data_train, num_ingredients)
bin_data_valid_class = data_to_binary_array(data_valid_class, num_ingredients)
bin_data_valid_compl = data_to_binary_array(data_valid_compl, num_ingredients)
bin_data_test_class = data_to_binary_array(data_test_class, num_ingredients)
bin_data_test_compl = data_to_binary_array(data_test_compl, num_ingredients)

for x in [bin_data_train, bin_data_valid_class, bin_data_valid_compl, bin_data_test_class, bin_data_test_compl]:
    print(x.shape)

(23547, 6714)
(7848, 6714)
(7848, 6714)
(3924, 6714)
(3924, 6714)


In [49]:
# String이던 Label을 id로 바꾸기
int_labels_train = np.array([cuisine_id_dict[label] for label in labels_train])
int_labels_valid_class = np.array([cuisine_id_dict[label] for label in labels_valid_class])
int_labels_valid_compl = np.array(labels_valid_compl)  # 원래 int형.

for x in [int_labels_train, int_labels_valid_class, int_labels_valid_compl]:
    print(x.shape)

(23547,)
(7848,)
(7848,)


In [50]:
# int형 label을 one-hot으로 바꾸기
bin_labels_train = data_to_binary_array(int_labels_train, num_cuisines)
bin_labels_valid_class = data_to_binary_array(int_labels_valid_class, num_cuisines)
bin_labels_valid_compl = data_to_binary_array(labels_valid_compl, num_ingredients)

for x in [bin_labels_train, bin_labels_valid_class, bin_labels_valid_compl]:
    print(x.shape)

(23547, 20)
(7848, 20)
(7848, 6714)


### Dictionary를 pickle로 저장

In [15]:
# %mkdir {path_container}

In [16]:
for d, name in zip([id_cuisine_dict, cuisine_id_dict, id_ingredient_dict, ingredient_id_dict],
                   ['id_cuisine_dict', 'cuisine_id_dict', 'id_ingredient_dict', 'ingredient_id_dict']):
    print(len(d))
    with open(path_container + name +'.pickle', 'wb') as fw:
        pickle.dump(d, fw)

20
20
6714
6694


### np.ndarray를 h5py로 저장

In [17]:
_train = [bin_data_train, int_labels_train, bin_labels_train]
_valid_class = [bin_data_valid_class, int_labels_valid_class, bin_labels_valid_class]
_valid_compl = [bin_data_valid_compl, int_labels_valid_compl, bin_labels_valid_compl]
_test_class = [bin_data_test_class, None, None]
_test_compl = [bin_data_test_compl, None, None]

In [19]:
for (bin_data, int_labels, bin_labels), name in zip([_train, _valid_class, _valid_compl, _test_class, _test_compl],
                                                    ['train', 'valid_class', 'valid_compl', 'test_class', 'test_compl']):
    with h5py.File(path_container + name, 'w') as h5f:
        h5f.create_dataset('bin_data', data=bin_data, compression="gzip")
        if 'test_' not in name:
            h5f.create_dataset('int_labels', data=int_labels, compression="gzip")
            h5f.create_dataset('bin_labels', data=bin_labels, compression="gzip")

## 각 재료는 얼마나 등장할까?

In [36]:
# 매 epoch 마다 한 recipe는 한 번씩 뽑힌다. 그리고 completion에서는 그 중 하나의 재료씩 뽑힌다.
plain_appearance = np.zeros(6714, dtype=int)  # dataset에 등장한 횟수 세기
label_appearance = np.zeros(6714)             # completion label로 뽑힐 횟수 (in average, relative) 세기
total_cnt = 0
for recipe in data_train:
    for ingredient in recipe:
        plain_appearance[ingredient] += 1
        if len(recipe) >= 2:
            label_appearance[ingredient] += 1/len(recipe)
        total_cnt += 1

print('total_cnt', total_cnt)
print('non-appear ingred?', (plain_appearance==0).sum())

# 상위 20개만 보여주기
print('   plain    | label_appearance')
_i = 1
for (i, num1), (j, num2) in zip(sorted(list(enumerate(plain_appearance)), key=lambda x: -x[1]),
                   sorted(list(enumerate(label_appearance)), key=lambda x: -x[1])):
    print(f"{i:4d}: {num1:5d} | {j:4d}: {num2}")
    _i += 1
    if _i > 20:
        break


total_cnt 253459
non-appear ingred? 855
   plain    | label_appearance
 937: 10683 |  937: 1029.238668427132
5377:  4764 | 5377: 451.9161677791333
5536:  4685 | 2945: 428.95297773804896
2945:  4347 | 1308: 406.3646566533548
6187:  4329 | 5536: 395.38299815444026
1308:  3759 | 6187: 371.3897284542667
2122:  3713 | 2122: 325.11233138262673
2518:  2859 | 5648: 296.83767796850793
5648:  2809 | 4799: 273.021986953124
4799:  2739 | 2518: 252.05825911713433
2813:  2631 |  167: 241.8706030228799
 167:  2594 | 2813: 239.94656738638224
1476:  2007 | 1476: 204.08035320744452
3978:  1928 | 2809: 176.3161334401818
3653:  1861 | 5882: 174.30191173884472
  59:  1834 | 3978: 173.16011293968384
5884:  1808 | 3653: 172.29724214725792
2809:  1755 | 1679: 167.37086058738828
5882:  1658 | 5884: 161.3637390092757
5136:  1637 |   59: 161.3332578924371


In [37]:
label_weight = np.zeros(6714)

#inverse appearance, ignoring not appeared ingreds
label_weight[label_appearance > 0] = 1 / label_appearance[label_appearance > 0]
label_weight /= label_weight.sum()  # normalize: sum to be 1

assert label_weight.sum() == 1

print("1/6714 =", 1/6714)
print()
# 상위 20개만 보여주기
_i = 1
for i, num in sorted(list(enumerate(label_weight)), key=lambda x: -x[1]):
    print(f"{i:4d}: {num:.8f}")
    _i += 1
    if _i>20:
        break

1/6714 = 0.00014894250819183795

2826: 0.00183632
5514: 0.00183632
 967: 0.00173038
1227: 0.00173038
4280: 0.00123598
5522: 0.00123598
1512: 0.00113004
4041: 0.00113004
4189: 0.00102410
4910: 0.00102410
 653: 0.00098879
3522: 0.00098879
5096: 0.00095347
5826: 0.00095347
5401: 0.00091816
  11: 0.00088284
1195: 0.00088284
1581: 0.00088284
1800: 0.00088284
2494: 0.00088284


In [39]:
with open(path_container + 'label_weight_compl.pickle', 'wb') as fw:
        pickle.dump(label_weight, fw)

## 각 Cuisine은 얼마나 등장할까?

In [63]:
label_count_class = [np.count_nonzero(int_labels_train==i) for i in range(20)]
for i, _cnt in sorted(list(enumerate(label_count_class)), key=lambda x: -x[1]):
    print(f"{i:2d}: {_cnt}")

 9: 4678
13: 3836
16: 2515
 7: 1748
 3: 1599
 5: 1543
 2: 920
18: 903
11: 840
 6: 714
17: 590
14: 496
19: 487
 1: 485
12: 474
 4: 452
 8: 404
15: 300
 0: 283
10: 280


## Train set에서 Completion task용 dataset 만들기

In [75]:
data_train_compl = []
labels_train_compl = []
labels_train_compl_class = []
for recipe, label in zip(data_train, labels_train):
    for i in range(len(recipe)):
        data_train_compl.append(recipe[:i]+recipe[i+1:])
        labels_train_compl.append(recipe[i])
        labels_train_compl_class.append(cuisine_id_dict[label])

In [76]:

for i, (x, y, z) in enumerate(zip(data_train_compl, labels_train_compl, labels_train_compl_class)):
    print(x, y, z)
    if i>=20:
        break

[3146, 3229, 3885, 4379, 4390, 5250, 5456, 6187] 2813 6
[2813, 3229, 3885, 4379, 4390, 5250, 5456, 6187] 3146 6
[2813, 3146, 3885, 4379, 4390, 5250, 5456, 6187] 3229 6
[2813, 3146, 3229, 4379, 4390, 5250, 5456, 6187] 3885 6
[2813, 3146, 3229, 3885, 4390, 5250, 5456, 6187] 4379 6
[2813, 3146, 3229, 3885, 4379, 5250, 5456, 6187] 4390 6
[2813, 3146, 3229, 3885, 4379, 4390, 5456, 6187] 5250 6
[2813, 3146, 3229, 3885, 4379, 4390, 5250, 6187] 5456 6
[2813, 3146, 3229, 3885, 4379, 4390, 5250, 5456] 6187 6
[937, 1476, 2172, 2351, 2813, 3350, 3554, 3857, 3978, 5249, 5648] 392 4
[392, 1476, 2172, 2351, 2813, 3350, 3554, 3857, 3978, 5249, 5648] 937 4
[392, 937, 2172, 2351, 2813, 3350, 3554, 3857, 3978, 5249, 5648] 1476 4
[392, 937, 1476, 2351, 2813, 3350, 3554, 3857, 3978, 5249, 5648] 2172 4
[392, 937, 1476, 2172, 2813, 3350, 3554, 3857, 3978, 5249, 5648] 2351 4
[392, 937, 1476, 2172, 2351, 3350, 3554, 3857, 3978, 5249, 5648] 2813 4
[392, 937, 1476, 2172, 2351, 2813, 3554, 3857, 3978, 5249, 5648]

In [77]:
bin_data_train_compl = data_to_binary_array(data_train_compl, num_ingredients)
int_labels_train_compl = np.array(labels_train_compl)
int_labels_train_compl_class = np.array(labels_train_compl_class)

In [78]:
with h5py.File(path_container + 'train_compl', 'w') as h5f:
    h5f.create_dataset('bin_data', data=bin_data_train_compl, compression="gzip")
    h5f.create_dataset('int_labels', data=int_labels_train_compl, compression="gzip")
    h5f.create_dataset('bin_labels', data=int_labels_train_compl_class, compression="gzip")