In [1]:
# h5py 안 될 때 (M1 Mac)
#!brew reinstall hdf5
#!export CPATH="/opt/homebrew/include/"
#!export HDF5_DIR=/opt/homebrew/
#!python3 -m pip install h5py

In [2]:
import os
import csv
import pickle

import numpy as np
import h5py

In [3]:
path_root = '../'
path_container = './Container/'

In [4]:
path_train = os.path.join(path_root, "train.csv")
path_valid_class_que = os.path.join(path_root, "validation_classification_question.csv")
path_valid_class_ans = os.path.join(path_root, "validation_classification_answer.csv")
path_valid_compl_que = os.path.join(path_root, "validation_completion_question.csv")
path_valid_compl_ans = os.path.join(path_root, "validation_completion_answer.csv")
path_test_class_que = os.path.join(path_root, "test_classification_question.csv")
path_test_compl_que = os.path.join(path_root, "test_completion_question.csv")
path_ingredient_name = os.path.join(path_root, "node_ingredient.txt")

### Data 읽어서 list로 일단 저장

In [5]:
def read_train_data(path):
    """ Read train.csv and Return lists of data[int] / label[str]. """
    data = []
    labels = []
    with open(path, 'r') as f:
        for line in csv.reader(f):
            recipe = sorted(set(map(int, line[:-1])))  # a sorted list of recipe (integer) # 0417: 중복되는 재료 삭제 (3개 recipe)
            cuisine = line[-1]                    # which country? (string)
            data.append(recipe)
            labels.append(cuisine)
    return data, labels


def read_classification_data(question_path, answer_path=None):
    """ Read valid/test data for classification. Then return lists of data[int] / label[str]. """
    data = []
    labels = [] if answer_path is not None else None
    with open(question_path, 'r') as f:
        for line in csv.reader(f):
            recipe = sorted(set(list(map(int, line))))  # a sorted list of recipe (integer)  # 0417: 중복되는 재료 삭제
            data.append(recipe)
    if answer_path is not None:
        with open(answer_path, 'r') as f:
            for line in csv.reader(f):
                cuisine = line[0]            # which country? (string)
                labels.append(cuisine)
    return data, labels


def read_completion_data(question_path, answer_path=None):
    """ Read valid/test data for completion. Then return lists of data[int] / label[str]. """
    data = []
    labels = [] if answer_path is not None else None
    with open(question_path, 'r') as f:
        for line in csv.reader(f):
            recipe = sorted(set(map(int, line)))  # recipe without an ingredient
            data.append(recipe)
    if answer_path is not None:
        with open(answer_path, 'r') as f:
            for line in csv.reader(f):
                recipe = set(map(int, line))   # original recipe set
                missing = list(recipe - set(data[len(labels)]))[0]
                labels.append(missing)
    return data, labels


def read_ingredient_names(ingredient_path):
    ingredients_names = []
    with open(ingredient_path, 'r') as f:
        for line in csv.reader(f):
            ingredients_names.append(line[0])
    return ingredients_names

In [6]:
data_train_class, labels_train_class = read_train_data(path_train)  # classification-only dataset
data_valid_class, labels_valid_class = read_classification_data(path_valid_class_que, path_valid_class_ans)
data_valid_compl, labels_valid_compl = read_completion_data(path_valid_compl_que, path_valid_compl_ans)
data_test_class, _ = read_classification_data(path_test_class_que, None)
data_test_compl, _ = read_completion_data(path_test_compl_que, None)
ingredient_names = read_ingredient_names(path_ingredient_name)  # 재료 이름 (string)
cuisine_names = sorted(set(labels_train_class+labels_valid_class))    # Cuisine 이름 (string), 알파벳 순

In [7]:
# completion task용 dataset
data_train_compl = []
labels_train_compl = []
labels_train_compl_class = []
cnt = 0
for recipe, label in zip(data_train_class, labels_train_class):
    cnt += 1
    if len(recipe) > 1:
        for i in range(len(recipe)):
            data_train_compl.append(recipe[:i]+recipe[i+1:])
            labels_train_compl.append(recipe[i])
            labels_train_compl_class.append(label)
            #if cnt <=3:
                #print(recipe[:i]+recipe[i+1:], recipe[i], label)

### List (```ingredient_names```, ```cuisine_names```) 를 Dictionary로 변환

In [8]:
# Dict: id (int, 0~19) -> cuisine name (str, 알파벳 순)
id_cuisine_dict = dict(zip(range(len(cuisine_names)), cuisine_names))
print(id_cuisine_dict)

{0: 'brazilian', 1: 'british', 2: 'cajun_creole', 3: 'chinese', 4: 'filipino', 5: 'french', 6: 'greek', 7: 'indian', 8: 'irish', 9: 'italian', 10: 'jamaican', 11: 'japanese', 12: 'korean', 13: 'mexican', 14: 'moroccan', 15: 'russian', 16: 'southern_us', 17: 'spanish', 18: 'thai', 19: 'vietnamese'}


In [9]:
# Dict: cuisine name (str) -> id (int)
cuisine_id_dict = {b: a for a, b in id_cuisine_dict.items()}
print(cuisine_id_dict)

{'brazilian': 0, 'british': 1, 'cajun_creole': 2, 'chinese': 3, 'filipino': 4, 'french': 5, 'greek': 6, 'indian': 7, 'irish': 8, 'italian': 9, 'jamaican': 10, 'japanese': 11, 'korean': 12, 'mexican': 13, 'moroccan': 14, 'russian': 15, 'southern_us': 16, 'spanish': 17, 'thai': 18, 'vietnamese': 19}


In [10]:
# Dict: id (int, 0~6713) -> ingredient name (str, node_ingredient.txt 기준)
id_ingredient_dict = dict(zip(range(len(ingredient_names)), ingredient_names))
print(len(id_ingredient_dict.items()))
print(dict(list(id_ingredient_dict.items())[:20]), '...')  # 20개만 출력

6714
{0: 'coca-cola', 1: 'vegan butter', 2: 'sourdough rolls', 3: 'reduced sodium refried beans', 4: 'ramen noodles', 5: 'crumbled corn bread', 6: 'japanese breadcrumbs', 7: 'toasted shredded coconut', 8: 'chinese spinach', 9: "Hellmann's® Real Mayonnaise", 10: 'ducklings', 11: 'basil olive oil', 12: 'white baking bar', 13: 'rye whiskey', 14: 'mushroom broth', 15: 'meat loaf mix', 16: 'cocktail sauce', 17: 'asparagus spears', 18: 'nonfat greek yogurt', 19: 'cabernet sauvignon'} ...


In [11]:
# Dict: ingredient name (str) -> id (int, 0~6713)
ingredient_id_dict = dict()
duplicate_ingredient_names = set()
for i, (a, b) in enumerate(id_ingredient_dict.items()):
    if b in ingredient_id_dict:
        duplicate_ingredient_names.add(b)
        if type(ingredient_id_dict[b]) == int:
            ingredient_id_dict[b] = [ingredient_id_dict[b], a]
            #ingredient_id_dict[b] = [ingredient_id_dict[b], a]
        else:
            ingredient_id_dict[b] += [a]
            #ingredient_id_dict[b] = tuple(list(ingredient_id_dict[b]) + [a])
    else:
        ingredient_id_dict[b] = a
print(f'@@@ Duplicate ingredients: ({len(duplicate_ingredient_names)}개) @@@')
for name in sorted(duplicate_ingredient_names):
    print(f"{str(ingredient_id_dict[name]):>17} {name}")
print()
print('number of non-duplicate ingredient names:', len(ingredient_id_dict))
print(dict(list(ingredient_id_dict.items())[:20]), '...')  # 20개만 출력

@@@ Duplicate ingredients: (19개) @@@
      [917, 3335] (10 oz.) frozen chopped spinach
      [585, 4634] bacon
[698, 1039, 4315] bread
     [3297, 5457] clams
     [5368, 5932] cream cheese
      [984, 4593] egg noodles
     [3403, 4936] frozen chopped spinach
     [3700, 5810] frozen lemonade concentrate
       [83, 3076] frozen orange juice concentrate
     [4999, 5908] green bell pepper
     [4047, 4091] lasagna noodles
     [2353, 4243] linguine
     [2882, 3311] mussels
      [343, 2798] pork chops
     [4945, 5095] red bell pepper
     [2938, 3099] spaghetti
     [1063, 4921] tortellini
     [5656, 6442] tuna
     [1660, 6510] water chestnuts

number of non-duplicate ingredient names: 6694
{'coca-cola': 0, 'vegan butter': 1, 'sourdough rolls': 2, 'reduced sodium refried beans': 3, 'ramen noodles': 4, 'crumbled corn bread': 5, 'japanese breadcrumbs': 6, 'toasted shredded coconut': 7, 'chinese spinach': 8, "Hellmann's® Real Mayonnaise": 9, 'ducklings': 10, 'basil olive oil': 11, 'w

### Data, Label lists를 np.array로 변환

In [12]:
def data_to_binary_array(data, dim):
    """ convert data(list of lists) into a 2D binary array. (for dataset, row = recipe) """
    """ dim (int) : dimension of each row (of 'enc') that must be. """
    enc = np.zeros((len(data), dim), dtype=int) 
    for i in range(len(data)):
        recipe = data[i]
        enc[i][recipe] = 1
    return enc

num_ingredients = len(ingredient_names)
num_cuisines = len(cuisine_names)

In [13]:
# Data + labels_valid_compl
bin_data_train_class = data_to_binary_array(data_train_class, num_ingredients)
bin_data_train_compl = data_to_binary_array(data_train_compl, num_ingredients)
bin_data_valid_class = data_to_binary_array(data_valid_class, num_ingredients)
bin_data_valid_compl = data_to_binary_array(data_valid_compl, num_ingredients)
bin_data_test_class = data_to_binary_array(data_test_class, num_ingredients)
bin_data_test_compl = data_to_binary_array(data_test_compl, num_ingredients)

for x in [bin_data_train_class, bin_data_train_compl, bin_data_valid_class, bin_data_valid_compl, bin_data_test_class, bin_data_test_compl]:
    print(x.shape)

(23547, 6714)
(253419, 6714)
(7848, 6714)
(7848, 6714)
(3924, 6714)
(3924, 6714)


In [14]:
# String이던 Label을 id로 바꾸기 & int list를 int array로 바꾸기
int_labels_train_class = np.array([cuisine_id_dict[label] for label in labels_train_class])
int_labels_train_compl = np.array(labels_train_compl)
int_labels_train_compl_class = np.array([cuisine_id_dict[label] for label in labels_train_compl_class])
int_labels_valid_class = np.array([cuisine_id_dict[label] for label in labels_valid_class])
int_labels_valid_compl = np.array(labels_valid_compl)

for x in [int_labels_train_class, int_labels_train_compl, int_labels_train_compl_class, int_labels_valid_class, int_labels_valid_compl]:
    print(x.shape)

(23547,)
(253419,)
(253419,)
(7848,)
(7848,)


### Dictionary를 pickle로 저장

In [15]:
# %mkdir {path_container}

In [16]:
for d, name in zip([id_cuisine_dict, cuisine_id_dict, id_ingredient_dict, ingredient_id_dict],
                   ['id_cuisine_dict', 'cuisine_id_dict', 'id_ingredient_dict', 'ingredient_id_dict']):
    print(len(d))
    with open(path_container + name +'.pickle', 'wb') as fw:
        pickle.dump(d, fw)

20
20
6714
6694


### np.ndarray를 h5py로 저장

In [17]:
# train_class
with h5py.File(path_container + 'train_class', 'w') as h5f:
    h5f.create_dataset('bin_data', data=bin_data_train_class, compression="gzip")
    h5f.create_dataset('label_class', data=int_labels_train_class, compression="gzip")

In [18]:
# train_compl
with h5py.File(path_container + 'train_compl', 'w') as h5f:
    h5f.create_dataset('bin_data', data=bin_data_train_compl, compression="gzip")
    h5f.create_dataset('label_class', data=int_labels_train_compl_class, compression="gzip")
    h5f.create_dataset('label_compl', data=int_labels_train_compl, compression="gzip")

In [19]:
# valid_class
with h5py.File(path_container + 'valid_class', 'w') as h5f:
    h5f.create_dataset('bin_data', data=bin_data_valid_class, compression="gzip")
    h5f.create_dataset('label_class', data=int_labels_valid_class, compression="gzip")

In [20]:
# valid_compl
with h5py.File(path_container + 'valid_compl', 'w') as h5f:
    h5f.create_dataset('bin_data', data=bin_data_valid_compl, compression="gzip")
    h5f.create_dataset('label_compl', data=int_labels_valid_compl, compression="gzip")

In [21]:
# test_class
with h5py.File(path_container + 'test_class', 'w') as h5f:
    h5f.create_dataset('bin_data', data=bin_data_test_class, compression="gzip")

In [22]:
# test_compl
with h5py.File(path_container + 'test_class', 'w') as h5f:
    h5f.create_dataset('bin_data', data=bin_data_test_compl, compression="gzip")

## 각 재료는 얼마나 등장할까?

In [23]:
# 매 epoch 마다 한 recipe는 한 번씩 뽑힌다. 그리고 completion에서는 그 중 하나의 재료씩 뽑힌다.
plain_appearance = np.zeros(6714, dtype=int)  # dataset에 등장한 횟수 세기
label_appearance = np.zeros(6714)             # completion label로 뽑힐 횟수 (in average, relative) 세기
total_cnt = 0
for recipe in data_train_class:
    for ingredient in recipe:
        plain_appearance[ingredient] += 1
        if len(recipe) >= 2:
            label_appearance[ingredient] += 1/len(recipe)
        total_cnt += 1

print('total_cnt', total_cnt)

# 상위 20개만 보여주기
print('   plain    | label_appearance(relative)')
_i = 1
for (i, num1), (j, num2) in zip(sorted(list(enumerate(plain_appearance)), key=lambda x: -x[1]),
                   sorted(list(enumerate(label_appearance)), key=lambda x: -x[1])):
    print(f"{i:4d}: {num1:5d} | {j:4d}: {num2}")
    _i += 1
    if _i > 20:
        break


total_cnt 253438
   plain    | label_appearance(relative)
 937: 10682 |  937: 1029.238668427132
5377:  4763 | 5377: 451.9161677791333
5536:  4685 | 2945: 428.95297773804896
2945:  4347 | 1308: 406.3646566533548
6187:  4329 | 5536: 395.38299815444026
1308:  3759 | 6187: 371.3897284542667
2122:  3712 | 2122: 325.11233138262673
2518:  2858 | 5648: 296.81267796850796
5648:  2808 | 4799: 273.021986953124
4799:  2739 | 2518: 252.05825911713433
2813:  2631 |  167: 241.8706030228799
 167:  2594 | 2813: 239.94656738638224
1476:  2007 | 1476: 204.08035320744452
3978:  1928 | 2809: 176.3161334401818
3653:  1861 | 5882: 174.30191173884472
  59:  1834 | 3978: 173.16011293968384
5884:  1808 | 3653: 172.29724214725792
2809:  1755 | 1679: 167.37086058738828
5882:  1658 | 5884: 161.3637390092757
5136:  1637 |   59: 161.3332578924371


In [24]:
print('non-appear ingred?', (plain_appearance==0).sum())
print(np.arange(6714, dtype=int)[plain_appearance==0])

non-appear ingred? 855
[   2    5   15   23   42   54   72   73   79   89  108  113  122  124
  128  131  133  140  146  151  158  159  160  189  190  192  196  206
  219  232  235  267  285  288  312  313  315  336  339  353  370  382
  403  406  408  419  430  433  435  437  441  443  445  447  448  454
  459  463  465  466  485  487  493  500  506  507  511  523  556  560
  564  568  571  573  575  595  598  601  606  608  612  626  639  644
  655  668  678  679  683  697  699  707  710  713  723  730  735  747
  752  755  772  796  797  800  809  812  813  824  834  840  841  858
  887  893  894  896  901  911  915  930  942  948  957  960  969  973
  978  983  989  995  996 1008 1010 1034 1038 1042 1061 1067 1070 1071
 1093 1096 1113 1126 1128 1132 1145 1154 1157 1164 1178 1183 1199 1220
 1223 1226 1255 1261 1284 1295 1303 1304 1306 1313 1317 1351 1355 1358
 1378 1384 1385 1387 1391 1397 1399 1402 1406 1408 1417 1428 1441 1444
 1451 1455 1469 1477 1491 1501 1508 1526 1537 1540 154

In [25]:
# train data에서 안 보이던 재료가 validation이나 test set에서는 나타날까?
plain_appearance_valid = np.zeros(6714, dtype=int)
for recipe in data_valid_class:
    for ingredient in recipe:
        plain_appearance_valid[ingredient] += 1
        
d1 = {}
for i in np.arange(6714, dtype=int)[plain_appearance==0]:
    d1[str(i)] = plain_appearance_valid[i]
print(f'최대 {max(d1.values())}번.')
#print(d1)

최대 5번.


In [26]:
# train data에서 안 보이던 재료가 test set에서는 나타날까?
plain_appearance_test = np.zeros(6714, dtype=int)
for recipe in data_test_class:
    for ingredient in recipe:
        plain_appearance_test[ingredient] += 1
        
d2 = {}
for i in np.arange(6714, dtype=int)[plain_appearance==0]:
    d2[str(i)] = plain_appearance_test[i]
print(f'최대 {max(d2.values())}번.')
#print(d2)

최대 3번.


In [27]:
# 아무 데이터셋에도 나타나지 않는 재료가 있을까?
nowhere_ingred_idx = np.logical_and(np.logical_and(plain_appearance==0, plain_appearance_valid==0), plain_appearance_test==0)
nowhere_ingred = np.arange(6714)[nowhere_ingred_idx]
print('개수:', len(nowhere_ingred))
#print(nowhere_ingred)

개수: 192


In [28]:
# test set에만 있는 재료도 있을까?
only_test_ingred_idx = np.logical_and(np.logical_and(plain_appearance==0, plain_appearance_valid==0), plain_appearance_test!=0)
only_test_ingred = np.arange(6714)[only_test_ingred_idx]
print('개수:', len(only_test_ingred))
#print(only_test_ingred)

개수: 218


In [29]:
# valid set에만 있는 재료도 있을까?
only_valid_ingred_idx = np.logical_and(np.logical_and(plain_appearance==0, plain_appearance_test==0), plain_appearance_valid!=0)
only_valid_ingred = np.arange(6714)[only_valid_ingred_idx]
print('개수:', len(only_valid_ingred))
#print(only_valid_ingred)

개수: 401


In [30]:
# train set에만 있는 재료도 있을까?
only_train_ingred_idx = np.logical_and(np.logical_and(plain_appearance_test==0, plain_appearance_valid==0), plain_appearance!=0)
only_train_ingred = np.arange(6714)[only_train_ingred_idx]
print('개수:', len(only_train_ingred))
#print(only_train_ingred)

개수: 1832


In [31]:
# 모두 나타나는 재료는 얼마나 될까.
everywhere_ingred_idx = np.logical_and(np.logical_and(plain_appearance_test!=0, plain_appearance_valid!=0), plain_appearance!=0)
everywhere_ingred = np.arange(6714)[everywhere_ingred_idx]
print('개수:', len(everywhere_ingred))
#print(everywhere_ingred)

개수: 2568


In [32]:
# 요약
where_ingred = {}
ox = lambda x: 'O' if x else '-'
print("train | valid |  test | how many")
for in_train in [True, False]:
    for in_val in [True, False]:
        for in_test in [True, False]:
            idx = np.logical_and(np.logical_and((plain_appearance!=0) if in_train else (plain_appearance==0),
                                                (plain_appearance_valid!=0) if in_val else (plain_appearance_valid==0)),
                                                (plain_appearance_test!=0) if in_test else (plain_appearance_test==0))
            arr = np.arange(6714)[idx]
            where_ingred[(in_train, in_val, in_test)] = arr
            print(f"  {ox(in_train)}   |   {ox(in_val)}   |   {ox(in_test)}   |   {len(arr):4d}")

train | valid |  test | how many
  O   |   O   |   O   |   2568
  O   |   O   |   -   |   1073
  O   |   -   |   O   |    386
  O   |   -   |   -   |   1832
  -   |   O   |   O   |     44
  -   |   O   |   -   |    401
  -   |   -   |   O   |    218
  -   |   -   |   -   |    192


In [33]:
#with open(path_container + 'label_weight_compl.pickle', 'wb') as fw:
#    pickle.dump(label_weight, fw)

## 각 Cuisine은 얼마나 등장할까?

In [34]:
label_count_class = [np.count_nonzero(int_labels_train_class==i) for i in range(20)]
for i, _cnt in sorted(list(enumerate(label_count_class)), key=lambda x: -x[1]):
    print(f"{i:2d}: {_cnt}")

 9: 4678
13: 3836
16: 2515
 7: 1748
 3: 1599
 5: 1543
 2: 920
18: 903
11: 840
 6: 714
17: 590
14: 496
19: 487
 1: 485
12: 474
 4: 452
 8: 404
15: 300
 0: 283
10: 280
