In [1]:
import pandas as pd
import numpy as np
import os
import pickle
from tqdm import tqdm

In [2]:
# SECURITY NOTE: both files below are deserialized with pickle, which can
# execute arbitrary code. Only load files from a trusted source.
with open('data.pkl', 'rb') as f:
    review = pickle.load(f)
# `.item()` extracts the single Python object stored in the array (presumably a
# dict saved with np.save — verify). NumPy >= 1.16.3 refuses to unpickle object
# arrays unless allow_pickle=True is passed explicitly.
item_meta = np.load('item.npy', allow_pickle=True).item()

In [3]:
def IntegratedData(item_data, review_data, threshold = 15):
    """Attach item metadata to every review and keep keys with enough reviews.

    Args:
        item_data: mapping from an ``asin`` value to a 4-element sequence that
            is unpacked as ``(img, price, category, <ignored>)``.
        review_data: mapping from a key to an iterable of review dicts; each
            review is looked up in ``item_data`` through its ``'asin'`` entry.
        threshold: minimum number of successfully merged reviews a key needs
            in order to appear in the result.

    Returns:
        dict mapping each retained key to the list of its merged reviews.
        The review dicts are updated in place with ``'img'``, ``'price'`` and
        ``'category'`` entries, so ``review_data`` sees the same additions.

    Reviews whose ``asin`` is missing, unknown, or whose metadata cannot be
    unpacked into four values are skipped.
    """
    data = dict()
    for key, reviews in review_data.items():
        item_list = []
        for item in list(reviews):
            try:
                img, price, category, _ = item_data[item['asin']]
                item['img'], item['price'], item['category'] = img, price, category
            except (LookupError, TypeError, ValueError):
                # Only "no usable metadata" is skipped. A bare except would also
                # swallow KeyboardInterrupt and genuine programming errors.
                continue
            item_list.append(item)
        if len(item_list) >= threshold:
            data[key] = item_list
    return data

In [4]:
# Merge item metadata into the reviews, keeping keys with at least 10 merged reviews.
data = IntegratedData(item_data=item_meta, review_data=review, threshold=10)

In [None]:
# Persist the merged data so later cells can reload it from disk.
with open('merge_data.pkl', 'wb') as out_file:
    pickle.dump(data, out_file)

# Count users by number of merged reviews

In [None]:
# Histogram: list length -> number of keys in `data` whose list has that length.
count = dict()
for reviews in data.values():
    n_reviews = len(reviews)
    count[n_reviews] = count.get(n_reviews, 0) + 1

In [None]:
# Show how many keys of `data` have each list length.
count

In [None]:
# Total number of keys whose list holds at least 15 entries.
num = sum(n_keys for length, n_keys in count.items() if length >= 15)

In [None]:
# Number of keys in `data` whose list has at least 15 entries.
num

# split train / val / test

In [3]:
# Reload the merged data written to merge_data.pkl.
# NOTE: pickle can execute arbitrary code; only load trusted files.
with open('merge_data.pkl', 'rb') as in_file:
    data = pickle.load(in_file)

### category -> hierarchical

In [3]:
# Collect the distinct labels seen at each of the first four levels of every
# item's first category path (item['category'][0]).
cate_level_0 = set()
cate_level_1 = set()
cate_level_2 = set()
cate_level_3 = set()
cate_levels = (cate_level_0, cate_level_1, cate_level_2, cate_level_3)
for user_id in data.keys():
    for item in data[user_id]:
        try:
            path = item['category'][0]
            # Levels are added in order; a path shorter than four levels
            # contributes the levels it has and stops at the first missing one.
            for depth, level_set in enumerate(cate_levels):
                level_set.add(path[depth])
        except (LookupError, TypeError):
            # Missing/short/malformed category: skip the rest of this item.
            # Narrower than a bare except, so Ctrl-C and real bugs still surface.
            continue

In [4]:
# Number of distinct labels at category levels 0 through 3.
tuple(len(level) for level in (cate_level_0, cate_level_1, cate_level_2, cate_level_3))

(2, 11, 80, 399)

In [5]:
# Number of keys (users) in the merged data.
len(data)

11934

In [6]:
def one_hot_dict(data_, category):
    """Build a one-hot vector for every label in the first three category levels.

    Args:
        data_: mapping from a key to an iterable of item dicts.
        category: name of the item field to read; for each item the labels are
            taken from ``item[category][0][:3]``.

    Returns:
        dict mapping each distinct label to a ``float32`` NumPy vector whose
        length is the number of distinct labels, with a single 1 at that
        label's index.

    Indices follow the sorted order of the labels. Iterating the set directly
    would make the index assignment depend on string hashing, which differs
    between interpreter runs, so vectors built in separate sessions would not
    line up.
    """
    series = set()
    for items in data_.values():
        for item_ in items:
            series.update(item_[category][0][:3])

    # key=str keeps natural ordering for string labels and still works if the
    # labels happen to mix types.
    ordered = sorted(series, key=str)
    result_dict = dict()
    for i, cell in enumerate(ordered):
        one_hot = np.zeros(len(ordered), dtype = np.float32)
        one_hot[i] = 1
        result_dict[cell] = one_hot

    return result_dict

In [7]:
# Only the category one-hot table is built; the asin/brand variants are disabled:
# asin_dict = one_hot_dict(data, 'asin')
# brand_dict = one_hot_dict(data, 'brand')
category_dict = one_hot_dict(data_=data, category='category')

brand, cate, img, price, review

In [8]:
def split_data(data_, category_dict, mode,
               n = 4, k = 2, val_num = 2, test_num = 2):
    """Cut every key's item list into sliding windows and return one split.

    Each window covers ``n`` consecutive items: the first ``n - k`` provide the
    inputs (image, review text, meta vector) and the last ``k`` provide the
    target ``asin`` values. A list of ``length`` items yields
    ``length - n + 1`` windows; the last ``test_num`` windows are the test
    split, the ``val_num`` windows before them the validation split, and the
    remaining ones the training split. Lists shorter than
    ``n + val_num + test_num + 1`` items are skipped entirely.

    Example: n = 5, val_num = 3, test_num = 4 and length = 17 gives 13 windows,
    split as train (0, 6), val (6, 9), test (9, 13).

    Args:
        data_: mapping from a key to a list of item dicts with the entries
            ``'img'``, ``'reviewText'``, ``'category'``, ``'price'``, ``'asin'``.
        category_dict: mapping from category label to its one-hot vector.
        mode: ``'train'``, ``'val'`` or ``'test'``.
        n: window length.
        k: number of trailing items in a window used as targets.
        val_num: number of validation windows per key.
        test_num: number of test windows per key.

    Returns:
        ``(imgs, reviews, meta, asins)``: four lists with one NumPy array per
        window. The meta row of an item is the sum of the one-hot vectors of
        its first three category labels followed by ``price / 1000``.
        Returns ``-1`` (after printing a message) when ``n <= k``; this legacy
        contract is kept for existing callers.

    Raises:
        ValueError: if ``mode`` is not one of the three supported names.
            Previously any unknown value silently produced the test split.
    """
    if n <= k :
        print("invalid argument")
        return -1
    if mode not in ('train', 'val', 'test'):
        raise ValueError(
            "mode must be 'train', 'val' or 'test', got {!r}".format(mode))

    result_imgs, result_reviews, result_meta, result_asins = [], [], [], []
    category_len = len(category_dict)
    held_out = val_num + test_num
    # Temporary threshold (still to be agreed on): guarantees every split is
    # non-degenerate for the keys that are kept.
    min_length = n + held_out + 1

    for key_ in tqdm(list(data_.keys())):
        items = data_[key_]
        length = len(items)
        if length < min_length:
            continue

        num_instance = length - n + 1
        if mode == 'train':
            start, end = 0, num_instance - held_out
        elif mode == 'val':
            start, end = num_instance - held_out, num_instance - test_num
        else:
            start, end = num_instance - test_num, num_instance

        for num in range(start, end):
            tmp_imgs, tmp_reviews, tmp_meta, tmp_asin = [], [], [], []
            # Input part of the window: the first n - k items.
            for count in range(0, n - k):
                tmp_item = items[num + count]
                tmp_imgs.append(tmp_item['img'])
                tmp_reviews.append(tmp_item['reviewText'])
                # Multi-hot vector over the first three category levels.
                tmp_categories = np.zeros(category_len)
                for tmp_category in tmp_item['category'][0][:3]:
                    tmp_categories += category_dict[tmp_category]
                tmp_price = np.array([tmp_item['price']/1000])
                tmp_meta.append(np.concatenate([tmp_categories, tmp_price]))
            # Target part of the window: the last k items, identified by asin.
            for count in range(n - k, n):
                tmp_asin.append(items[num + count]['asin'])
            result_imgs.append(np.array(tmp_imgs))
            result_reviews.append(np.array(tmp_reviews))
            result_meta.append(np.array(tmp_meta))
            result_asins.append(np.array(tmp_asin))
    return (result_imgs, result_reviews, result_meta, result_asins)

In [None]:
# Build the training windows: 10 items per window, the last one is the target.
train_dataset = split_data(
    data_=data,
    category_dict=category_dict,
    mode='train',
    n=10,
    k=1,
    val_num=5,
    test_num=5,
)
# NOTE(review): the disabled calls below use n = 5 while the training call uses
# n = 10 — confirm which window length is intended before re-enabling them.
# val_dataset = split_data(data_ = data , category_dict = category_dict, mode = 'val', n = 5, k = 1, val_num = 5, test_num = 5)
# test_dataset = split_data(data_ = data , category_dict = category_dict, mode = 'test', n = 5, k = 1, val_num = 5, test_num = 5)

 18%|█▊        | 2139/11934 [00:07<00:27, 357.98it/s]

In [7]:
# Number of windows in the first element (images) of the split tuple.
# NOTE(review): in the cells shown, `val_dataset` is only assigned by a
# commented-out call, so this raises NameError on a fresh kernel — confirm
# which split was meant to be inspected.
len(val_dataset[0])

8340

In [8]:
# Save the training split. The original cell pickled `val_dataset` into
# train.pkl, which stored the wrong split under the training file name (and
# `val_dataset` is not assigned by any active cell).
with open('train.pkl', 'wb') as f:
    pickle.dump(train_dataset, f)

In [13]:
# n = 11, k = 1, val = 5, test = 5 => about 4GB / 1.4 GB / 1.4GB