In [1]:
import pandas as pd
import numpy as np
import os
import pickle
from tqdm import tqdm
import random

In [18]:
# NOTE: pickle can execute arbitrary code while loading; only open files that
# come from a trusted source.
with open('data.pkl', 'rb') as f:
    review = pickle.load(f)
# The loaded array is unwrapped with .item() into a single Python object, which
# means the .npy file holds a pickled object. NumPy >= 1.16.3 refuses to load
# such files unless allow_pickle=True is passed explicitly.
item_meta = np.load('item.npy', allow_pickle=True).item()

In [30]:
def IntegratedData(item_data, review_data, threshold = 15):
    """Attach item metadata to every review and keep users with enough usable reviews.

    Args:
        item_data: mapping from item id ('asin') to a 4-element record. The
            first three elements are stored on the review as 'img', 'price'
            and 'category'; the fourth is ignored.
        review_data: mapping from a user key to an iterable of review dicts,
            each expected to carry an 'asin' entry.
        threshold: minimum number of successfully merged reviews a user must
            have to appear in the result.

    Returns:
        dict mapping user key -> list of review dicts, in their original
        order. The review dicts are the same objects found in review_data,
        updated in place with the three metadata fields.

    Reviews that cannot be merged (no 'asin', unknown item id, or a metadata
    record that does not unpack into four values) are skipped. Any other
    exception propagates instead of being silently swallowed.
    """
    data = dict()
    for key, reviews in review_data.items():
        item_list = []
        for item in reviews:
            try:
                id_ = item['asin']
                img, price, category, _ = item_data[id_]
                item['img'], item['price'], item['category'] = img, price, category
            except (LookupError, TypeError, ValueError):
                # Best-effort merge: drop reviews whose metadata is missing or malformed.
                continue
            item_list.append(item)
        if len(item_list) >= threshold:
            data[key] = item_list
    return data

In [31]:
# Merge item metadata into the reviews, keeping only users with at least 20
# successfully merged reviews.
data = IntegratedData(item_meta, review, threshold=20)

In [5]:
# Persist the merged per-user review lists so later steps can reload them.
with open('merge_data.pkl', 'wb') as merged_file:
    pickle.dump(data, merged_file)

In [33]:
# Number of users that survived the merge threshold.
len(data)

2140

# Count users by number of reviews

In [6]:
# Histogram: number of merged reviews -> how many users have exactly that many.
count = dict()
for reviews in data.values():
    n_reviews = len(reviews)
    count[n_reviews] = count.get(n_reviews, 0) + 1

In [7]:
# Display the histogram (reviews per user -> number of users).
count

{20: 253,
 21: 219,
 22: 195,
 23: 134,
 24: 143,
 25: 135,
 26: 102,
 27: 92,
 28: 72,
 29: 71,
 30: 51,
 31: 51,
 32: 47,
 33: 48,
 34: 45,
 35: 31,
 36: 32,
 37: 25,
 38: 37,
 39: 31,
 40: 17,
 41: 19,
 42: 18,
 43: 15,
 44: 17,
 45: 11,
 46: 24,
 47: 11,
 48: 10,
 49: 7,
 50: 13,
 51: 10,
 52: 12,
 53: 9,
 54: 7,
 55: 7,
 56: 7,
 57: 4,
 58: 10,
 59: 6,
 60: 4,
 61: 3,
 62: 3,
 63: 2,
 64: 5,
 65: 4,
 66: 5,
 67: 3,
 68: 2,
 70: 1,
 71: 3,
 72: 3,
 73: 2,
 75: 3,
 76: 1,
 77: 2,
 78: 2,
 80: 2,
 81: 2,
 82: 1,
 84: 2,
 85: 1,
 87: 4,
 89: 1,
 92: 1,
 93: 1,
 95: 3,
 96: 3,
 99: 3,
 100: 1,
 102: 2,
 104: 1,
 105: 1,
 109: 1,
 112: 1,
 117: 1,
 123: 1,
 124: 1,
 127: 1,
 129: 1,
 130: 1,
 131: 2,
 140: 1,
 145: 1,
 168: 1,
 197: 1,
 221: 1}

In [8]:
# Total number of users across every histogram bucket with at least 15 reviews.
num = sum(n_users for n_reviews, n_users in count.items() if n_reviews >= 15)

In [9]:
# Display the total computed from the histogram.
num

2140

# split train / val / test

In [10]:
# Reload the merged data written earlier. pickle can execute arbitrary code on
# load, so only read files produced by this notebook or another trusted source.
with open('merge_data.pkl', 'rb') as merged_file:
    data = pickle.load(merged_file)

### category -> hierarchical

In [11]:
# Collect the distinct category names found at each of the first four depths
# of every item's first category path (item['category'][0]).
cate_level_0 = set()
cate_level_1 = set()
cate_level_2 = set()
cate_level_3 = set()
cate_levels = (cate_level_0, cate_level_1, cate_level_2, cate_level_3)
for user_items in data.values():
    for item in user_items:
        try:
            path = item['category'][0]
            for depth, level_set in enumerate(cate_levels):
                level_set.add(path[depth])
        except (LookupError, TypeError):
            # A path with fewer than four levels (or a missing/malformed
            # category) ends the walk; shallower levels already recorded for
            # this item are kept. Other exceptions are no longer swallowed.
            continue

In [12]:
# Vocabulary size at each category depth, shallowest first.
tuple(len(level) for level in (cate_level_0, cate_level_1, cate_level_2, cate_level_3))

(1, 10, 74, 374)

In [34]:
def one_hot_dict(data_, category):
    """Build a one-hot vector for every distinct value of one item field.

    Args:
        data_: mapping from a user key to an iterable of item dicts.
        category: name of the item field to encode. For the special name
            'category', the first three entries of item['category'][0] are
            each added to the vocabulary; for any other name the value of
            item[category] itself is added.

    Returns:
        dict mapping each distinct value to a float32 vector whose length is
        the vocabulary size, with a single 1 at that value's index.

    Indices are assigned from the sorted vocabulary, so the encoding is the
    same in every run. Iterating the raw set would make the index of each
    value depend on hash ordering, which varies between interpreter sessions
    for strings. Memory grows quadratically with the vocabulary size because
    every value gets its own dense vector.
    """
    series = set()
    for items in data_.values():
        for item_ in items:
            if category == 'category':
                series.update(item_[category][0][:3])
            else:
                series.add(item_[category])

    try:
        ordered = sorted(series)
    except TypeError:
        # Values of mixed, non-comparable types: fall back to a stable textual order.
        ordered = sorted(series, key=repr)

    size = len(ordered)
    result_dict = dict()
    for i, cell in enumerate(ordered):
        one_hot = np.zeros(size, dtype = np.float32)
        one_hot[i] = 1
        result_dict[cell] = one_hot

    return result_dict

In [35]:
# Build the one-hot lookup tables for item ids and for category names.
# NOTE(review): one_hot_dict stores one dense float32 vector per distinct
# value, each as long as the vocabulary. With the 23273 asins reported in the
# recorded output, asin_dict comes to roughly 2 GB, and split_data does not
# read it (targets are kept as raw asin strings) — confirm it is still needed.
asin_dict = one_hot_dict(data, 'asin')
# brand_dict = one_hot_dict(data, 'brand')
category_dict = one_hot_dict(data, 'category')

In [36]:
# Peek at the vector stored under the first key of asin_dict.
first_asin = list(asin_dict)[0]
asin_dict[first_asin]

array([ 1.,  0.,  0., ...,  0.,  0.,  0.], dtype=float32)

In [37]:
# Length of one asin vector, i.e. the size of the asin vocabulary.
first_asin = list(asin_dict)[0]
len(asin_dict[first_asin])

23273

In [38]:
# Length of one category vector, i.e. the size of the category vocabulary.
first_category = list(category_dict)[0]
len(category_dict[first_category])

84

In [39]:
# Drop the names of the raw inputs; only the merged `data` is used from here on.
del review, item_meta

brand, cate, img, price, review

In [40]:
def split_data(data_, category_dict, mode, n = 4, k = 2, val_num = 2, test_num = 2,
               chunk_size = 1000, seed = None):
    """Yield sliding-window instances for one split, in batches.

    Each user's item list is cut into windows of n consecutive items. The
    first n - k items of a window are the inputs and the last k items are the
    targets. A user with `length` items has length - n + 1 windows: the last
    test_num windows belong to 'test', the val_num windows before them to
    'val', and everything earlier to 'train'. Users with fewer than
    n + val_num + test_num + 1 items are skipped entirely.

    Args:
        data_: mapping from a user key to an indexable sequence of item dicts
            carrying 'img', 'reviewText', 'category', 'price' and 'asin'.
        category_dict: mapping from category name to its one-hot vector.
        mode: 'train' or 'val'; any other value selects the test windows.
        n: window length.
        k: number of trailing target items per window; must be smaller than n.
        val_num: validation windows per user.
        test_num: test windows per user.
        chunk_size: number of instances per yielded batch.
        seed: optional seed for the user shuffle. When None the global
            `random` state is used.

    Yields:
        Tuples (imgs, reviews, meta, asins) of equal-length lists with one
        numpy array per instance. Each row of a meta array is the sum of the
        one-hot vectors of the item's first three categories followed by
        price / 1000. asins holds the raw target item ids.

    A final partial batch is yielded only when it contains instances, so no
    empty batch is produced when the total is a multiple of chunk_size.
    If n <= k a message is printed and nothing is yielded.
    """
    if n <= k :
        print("invalid argument")
        # Inside a generator this simply ends iteration; the value is kept for
        # parity with the previous behaviour.
        return -1
    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer")

    result_imgs, result_reviews, result_meta, result_asins = [], [], [], []
    category_len = len(category_dict)
    num_saved = 0

    key_list = list(data_.keys())
    if seed is None:
        random.shuffle(key_list)
    else:
        random.Random(seed).shuffle(key_list)

    for key_ in key_list:
        items = data_[key_]
        length = len(items)
        # Need at least two training windows on top of the val/test windows.
        if length < n + val_num + test_num + 1:
            continue

        num_instance = length - n + 1
        train_end = num_instance - (val_num + test_num)
        val_end = num_instance - test_num
        if mode == 'train':
            start, end = 0, train_end
        elif mode == 'val':
            start, end = train_end, val_end
        else:
            start, end = val_end, num_instance

        for first in range(start, end):
            tmp_imgs, tmp_reviews, tmp_meta, tmp_asin = [], [], [], []
            # Input part of the window.
            for offset in range(0, n - k):
                tmp_item = items[first + offset]
                tmp_imgs.append(tmp_item['img'])
                tmp_reviews.append(tmp_item['reviewText'])
                tmp_categories = np.zeros(category_len)
                for tmp_category in tmp_item['category'][0][:3]:
                    tmp_categories += category_dict[tmp_category]
                tmp_price = np.array([tmp_item['price']/1000])
                tmp_meta.append(np.concatenate([tmp_categories, tmp_price]))
            # Target part of the window.
            for offset in range(n - k, n):
                tmp_asin.append(items[first + offset]['asin'])

            result_imgs.append(np.array(tmp_imgs))
            result_reviews.append(np.array(tmp_reviews))
            result_meta.append(np.array(tmp_meta))
            result_asins.append(np.array(tmp_asin))
            num_saved += 1

            if num_saved % chunk_size == 0:
                print(num_saved,"instances saved!")
                yield (result_imgs, result_reviews, result_meta, result_asins)
                result_imgs, result_reviews, result_meta, result_asins = [], [], [], []

    print(num_saved, "last instances saved!")
    if result_imgs:
        yield (result_imgs, result_reviews, result_meta, result_asins)

In [41]:
# The three splits share one windowing configuration; only `mode` differs.
split_kwargs = dict(data_ = data, category_dict = category_dict, n = 11, k = 1, val_num = 5, test_num = 5)
train_dataset = split_data(mode = 'train', **split_kwargs)
val_dataset = split_data(mode = 'val', **split_kwargs)
test_dataset = split_data(mode = 'test', **split_kwargs)

In [42]:
# Write only the first batch yielded by train_dataset to data/train_<index>.pkl
# and stop. The generator is left partially consumed, so any later loop over
# train_dataset resumes after this batch instead of starting over.
for index, train in enumerate(train_dataset):
    with open('data/train_'+str(index)+".pkl", 'wb') as f:
        pickle.dump(train, f)
    break

1000 instances saved!


In [30]:
# Write the next batch from train_dataset to a hard-coded file name.
# enumerate() restarts at 0 on every call, so `index` is not used for the
# path here; the number in the file name has to be kept in sync by hand.
for index, train in enumerate(train_dataset):
    with open('data/train_6.pkl', 'wb') as f:
        pickle.dump(train, f)
    break

6000 instances saved!


In [22]:
# Drain each generator in turn, writing every remaining batch to
# <split>_<index>.pkl in the current working directory (no 'data/' prefix,
# unlike the single-batch cells). `index` restarts at 0 for each loop, so if
# train_dataset was already partially consumed, train_0.pkl holds the first
# batch that was still pending rather than the first batch overall.
# NOTE(review): each `del <batch>` raises NameError if its loop body never ran
# (empty generator) — confirm the generators are non-empty before running.
for index, train in enumerate(train_dataset):
    with open('train_'+str(index)+".pkl", 'wb') as f:
        pickle.dump(train, f)
# Drop the last batch and the exhausted generator before the next split.
del train
del train_dataset
for index, val in enumerate(val_dataset):
    with open('val_'+str(index)+".pkl", 'wb') as f:
        pickle.dump(val, f)
del val
del val_dataset
for index, test in enumerate(test_dataset):
    with open('test_'+str(index)+".pkl", 'wb') as f:
        pickle.dump(test, f)
del test
del test_dataset

2000 instances saved!
3000 instances saved!


OSError: [Errno 28] No space left on device

In [None]:
# Load one saved batch and show the shape of the first instance in each of its
# four parts (imgs, reviews, meta, asins).
with open('train_0.pkl', 'rb') as batch_file:
    train = pickle.load(batch_file)
tuple(train[part][0].shape for part in range(4))