In [1]:
import pickle
import h5py
import re
import tqdm
import numpy as np
import pandas as pd
import gzip

from collections import Counter
from keras.utils.np_utils import to_categorical

# Character class of the punctuation/symbols that get blanked out of product names.
# The hyphen is escaped so it is matched literally: left unescaped, `)-=` was read
# as a character range that also swallowed every digit 0-9 (plus `:`, `;`, `<`),
# which tore model numbers such as "GLG4314P" apart. `:;<` are listed explicitly
# so the only behavioural change is that digits are now kept.
# A raw string is used so the regex escapes are not treated as (invalid) string escapes.
re_sc = re.compile(r'[\!@#$%\^&\*\(\)\-=\[\]\{\}\.,/\?~\+\'"|:;<]')

Using TensorFlow backend.


### Load the raw file

In [3]:
# Read the training chunk into a DataFrame.
train_csv_path = './datasets/train.chunk.csv'
df = pd.read_csv(train_csv_path, encoding='utf-8')

# Preview the first five rows (head() defaults to n=5).
df.head()

Unnamed: 0,pid,brand,model,maker,product,price,bcateid,mcateid,scateid,dcateid
0,O4486751463,퍼즐라이프,퍼즐라이프 직소퍼즐 바다거북의 여행,상품상세설명 참조,직소퍼즐 - 1000조각 바다거북의 여행 (PL1275),16520,1,1,2,-1
1,P3307178849,바보사랑,아이폰6S/6S+ tree farm101 - 다이어리케이스|아이폰6S/6S+,MORY|해당없음,[모리케이스]아이폰6S/6S+ tree farm101 - 다이어리케이스[바보사랑][...,20370,3,3,4,-1
2,R4424255515,크리비아,크리비아 기모 3부 속바지 GLG4314P,,크리비아 기모 3부 속바지 GLG4314P,-1,5,5,6,-1
3,F3334315393,잭앤질,[잭앤질] 남성 솔리드 절개라인 포인트 포켓 팬츠 31133PT002_NA,㈜크리스패션,[하프클럽/잭앤질]남성 솔리드 절개라인 포인트 포켓 팬츠 31133PT002_NA,16280,7,7,8,-1
4,N731678492,,SD코드프리혈당시험지[50매],기타,코드프리혈당시험지50매/코드프리시험지/최장유효기간,-1,10,9,11,-1


### Build label dictionary

In [5]:
y_vocab = {}                                          # Maps 'b>m>s>d' class name -> integer label id

sz = df['pid'].shape[0]                               # Number of rows (800,000 in this chunk)
# Walk the four category columns in lockstep instead of indexing each Series by
# row label on every iteration; the same values are visited in the same order.
category_rows = zip(df['bcateid'], df['mcateid'], df['scateid'], df['dcateid'])
for b, m, s, d in tqdm.tqdm(category_rows, total=sz, mininterval=1):   # tqdm reports progress / ETA
    class_name = '{}>{}>{}>{}'.format(b, m, s, d)     # Join the four category ids into one class key
    if class_name not in y_vocab:                     # First time this combination is seen:
        y_vocab[class_name] = len(y_vocab)            # assign it the next free label id

y_dict = {y: idx for idx, y in enumerate(y_vocab)}    # class name -> position in y_vocab's iteration order
# Use a context manager so the file is flushed and closed deterministically;
# the handle from a bare open() call was never closed explicitly.
with open('./datasets/y_vocab.pickle', 'wb') as f:
    pickle.dump(y_dict, f, 2)                         # pickle protocol 2


  0%|                                                                                           | 0/800000 [00:00<?, ?it/s][A
  2%|█▉                                                                          | 19958/800000 [00:01<00:39, 19942.97it/s][A
  5%|████                                                                        | 42155/800000 [00:02<00:36, 20567.28it/s][A
  8%|██████▏                                                                     | 64767/800000 [00:03<00:34, 21135.98it/s][A
 69%|███████████████████████████████████████████████████▌                       | 549386/800000 [00:40<00:11, 21471.83it/s][A
 13%|██████████                                                                 | 107352/800000 [00:05<00:32, 21237.61it/s][A
 16%|████████████                                                               | 129079/800000 [00:06<00:31, 21381.07it/s][A
 19%|██████████████                                                             | 149558/800000 [00:07<00:30, 

### Parse the input data

In [6]:
sz = df['pid'].shape[0]
products, w_products, labels = [], [], []
# Iterate the needed columns in lockstep rather than indexing each Series by
# row label on every pass; this visits the same rows in the same order.
rows = zip(df['bcateid'], df['mcateid'], df['scateid'], df['dcateid'], df['product'])
for b, m, s, d, feature in tqdm.tqdm(rows, total=sz, mininterval=1):
    class_name = '{}>{}>{}>{}'.format(b, m, s, d)

    # label: integer id registered for this category combination
    Y = y_dict.get(class_name)

    # product
    # Only string product names are used; non-string values (e.g. missing cells)
    # and the literal text 'nan' cause the whole row to be skipped.
    if isinstance(feature, str) and feature != 'nan':
        # Blank out the symbols matched by re_sc, then split on whitespace.
        # split() with no argument already discards leading/trailing blanks,
        # so no extra strip() is needed on the text or on the tokens.
        tokens = re_sc.sub(' ', feature).split()
        # Keep only words that are 2 to 30 characters long.
        words = [w for w in tokens if 2 <= len(w) < 31]

        # Map every word to a bucket id in 1..100000, so 0 only appears as padding.
        # NOTE(review): on Python 3 the built-in hash() of a str is salted per
        # interpreter process unless PYTHONHASHSEED is fixed, so these ids are only
        # reproducible within one kernel session -- confirm that every dataset (and
        # any later inference input) is encoded in the same session, or move all
        # encoders to a stable hash.
        x = [hash(w) % 100000 + 1 for w in words]
        xv = Counter(x).most_common(32)       # up to 32 most frequent (bucket id, count) pairs

        x = np.zeros(32, dtype=np.int32)      # bucket ids, zero-padded to length 32
        v = np.zeros(32, dtype=np.int8)       # matching occurrence counts, zero-padded

        for j, (bucket_id, count) in enumerate(xv):
            x[j] = bucket_id
            v[j] = count

        products.append(x)
        w_products.append(v)
        labels.append(Y)


  0%|                                                                                           | 0/800000 [00:00<?, ?it/s][A
  1%|█                                                                           | 10995/800000 [00:01<01:11, 10991.94it/s][A
  3%|██                                                                          | 21631/800000 [00:02<01:11, 10881.05it/s][A
  4%|███                                                                         | 32240/800000 [00:03<01:11, 10795.29it/s][A
  5%|████▏                                                                       | 43773/800000 [00:04<01:08, 11005.79it/s][A
  7%|█████▏                                                                      | 54902/800000 [00:05<01:07, 11041.58it/s][A
  8%|██████▎                                                                     | 65836/800000 [00:06<01:06, 11006.69it/s][A
 10%|███████▎                                                                    | 77245/800000 [00:07<01:04, 

 91%|███████████████████████████████████████████████████████████████████▉       | 725189/800000 [01:04<00:06, 11154.83it/s][A
 92%|█████████████████████████████████████████████████████████████████████      | 736601/800000 [01:05<00:05, 11228.34it/s][A
 94%|██████████████████████████████████████████████████████████████████████▏    | 748177/800000 [01:06<00:04, 11328.43it/s][A
 95%|███████████████████████████████████████████████████████████████████████▏   | 759512/800000 [01:07<00:03, 11251.07it/s][A
 96%|████████████████████████████████████████████████████████████████████████▎  | 770907/800000 [01:08<00:02, 11291.61it/s][A
 98%|█████████████████████████████████████████████████████████████████████████▎ | 782203/800000 [01:09<00:01, 11181.08it/s][A
100%|███████████████████████████████████████████████████████████████████████████| 800000/800000 [01:11<00:00, 11195.82it/s][A


### Save the input vectors

In [7]:
# Stack the per-row vectors into arrays and bundle them into a single dict.
train_dset = {
    'product': np.asarray(products),
    'w_product': np.asarray(w_products),
    'label': np.asarray(labels),
}

# Persist the bundle as a gzip-compressed pickle.
with gzip.open('./datasets/train.chunk.pickle', 'wb') as fout:
    pickle.dump(train_dset, fout)

### Parse the validation dataset

In [8]:
# Read the validation chunk with the same explicit encoding as the training chunk.
df = pd.read_csv('./datasets/valid.chunk.csv', encoding='utf-8')
sz = df['pid'].shape[0]
products, w_products, labels = [], [], []
# Iterate the needed columns in lockstep rather than indexing each Series by
# row label on every pass; this visits the same rows in the same order.
rows = zip(df['bcateid'], df['mcateid'], df['scateid'], df['dcateid'], df['product'])
for b, m, s, d, feature in tqdm.tqdm(rows, total=sz, mininterval=1):
    class_name = '{}>{}>{}>{}'.format(b, m, s, d)

    # label
    # y_dict.get returns None for a category combination that is not a key of
    # y_dict; a None in `labels` turns np.asarray(labels) into an object array.
    # NOTE(review): confirm whether such rows should be dropped or mapped explicitly.
    Y = y_dict.get(class_name)

    # product
    # Only string product names are used; non-string values (e.g. missing cells)
    # and the literal text 'nan' cause the whole row to be skipped.
    if isinstance(feature, str) and feature != 'nan':
        # Blank out the symbols matched by re_sc, then split on whitespace.
        # split() with no argument already discards leading/trailing blanks.
        tokens = re_sc.sub(' ', feature).split()
        # Keep only words that are 2 to 30 characters long.
        words = [w for w in tokens if 2 <= len(w) < 31]

        # Map every word to a bucket id in 1..100000, so 0 only appears as padding.
        # NOTE(review): on Python 3 the built-in hash() of a str is salted per
        # interpreter process unless PYTHONHASHSEED is fixed, so these ids only
        # line up with other datasets encoded in the same kernel session -- confirm.
        x = [hash(w) % 100000 + 1 for w in words]
        xv = Counter(x).most_common(32)       # up to 32 most frequent (bucket id, count) pairs

        x = np.zeros(32, dtype=np.int32)      # bucket ids, zero-padded to length 32
        v = np.zeros(32, dtype=np.int8)       # matching occurrence counts, zero-padded

        for j, (bucket_id, count) in enumerate(xv):
            x[j] = bucket_id
            v[j] = count

        products.append(x)
        w_products.append(v)
        labels.append(Y)


  0%|                                                                                           | 0/178830 [00:00<?, ?it/s][A
  6%|████▌                                                                       | 10652/178830 [00:01<00:15, 10642.10it/s][A
 12%|█████████▏                                                                  | 21640/178830 [00:02<00:14, 10740.55it/s][A
 18%|██████████████                                                              | 32948/178830 [00:03<00:13, 10902.69it/s][A
 25%|██████████████████▋                                                         | 43872/178830 [00:04<00:12, 10906.26it/s][A
 31%|███████████████████████▍                                                    | 55168/178830 [00:05<00:11, 11019.60it/s][A
 37%|████████████████████████████▎                                               | 66646/178830 [00:06<00:10, 11152.21it/s][A
 44%|█████████████████████████████████▏                                          | 78152/178830 [00:07<00:08, 

### Save the validation vectors

In [9]:
# Stack the per-row vectors into arrays and bundle them into a single dict.
valid_dset = {
    'product': np.asarray(products),
    'w_product': np.asarray(w_products),
    'label': np.asarray(labels),
}

# Persist the bundle as a gzip-compressed pickle.
with gzip.open('./datasets/valid.chunk.pickle', 'wb') as fout:
    pickle.dump(valid_dset, fout)

### Parse the test dataset

In [10]:
# Read the test chunk with the same explicit encoding as the training chunk.
df = pd.read_csv('./datasets/test.chunk.csv', encoding='utf-8')
sz = df['pid'].shape[0]
products, w_products, answer, pids = [], [], [], []   # `answer` is not filled in this cell
# Iterate the needed columns in lockstep rather than indexing each Series by
# row label on every pass; this visits the same rows in the same order.
rows = zip(df['pid'], df['product'])
for pid, feature in tqdm.tqdm(rows, total=sz, mininterval=1):

    # product
    # Only string product names are used; non-string values (e.g. missing cells)
    # and the literal text 'nan' cause the whole row (and its pid) to be skipped.
    if isinstance(feature, str) and feature != 'nan':
        # Blank out the symbols matched by re_sc, then split on whitespace.
        # split() with no argument already discards leading/trailing blanks.
        tokens = re_sc.sub(' ', feature).split()
        # Keep only words that are 2 to 30 characters long.
        words = [w for w in tokens if 2 <= len(w) < 31]

        # Map every word to a bucket id in 1..100000, so 0 only appears as padding.
        # NOTE(review): on Python 3 the built-in hash() of a str is salted per
        # interpreter process unless PYTHONHASHSEED is fixed, so these ids only
        # line up with other datasets encoded in the same kernel session -- confirm.
        x = [hash(w) % 100000 + 1 for w in words]
        xv = Counter(x).most_common(32)       # up to 32 most frequent (bucket id, count) pairs

        x = np.zeros(32, dtype=np.int32)      # bucket ids, zero-padded to length 32
        v = np.zeros(32, dtype=np.int8)       # matching occurrence counts, zero-padded

        for j, (bucket_id, count) in enumerate(xv):
            x[j] = bucket_id
            v[j] = count

        products.append(x)
        w_products.append(v)
        pids.append(pid)                      # keep the product id aligned with its vectors


  0%|                                                                                           | 0/178830 [00:00<?, ?it/s][A
 11%|████████▏                                                                   | 19249/178830 [00:01<00:08, 19245.31it/s][A
 22%|████████████████▊                                                           | 39590/178830 [00:02<00:07, 19555.83it/s][A
 33%|█████████████████████████                                                   | 59046/178830 [00:03<00:06, 19523.30it/s][A
 43%|████████████████████████████████▋                                           | 77042/178830 [00:04<00:05, 19033.03it/s][A
 54%|█████████████████████████████████████████▎                                  | 97092/178830 [00:05<00:04, 19326.86it/s][A
 66%|█████████████████████████████████████████████████▏                         | 117326/178830 [00:06<00:03, 19585.08it/s][A
 77%|█████████████████████████████████████████████████████████▌                 | 137344/178830 [00:07<00:02, 

### Save the test vectors

In [11]:
# Stack the per-row vectors and product ids into arrays and bundle them into a single dict.
test_dset = {
    'product': np.asarray(products),
    'w_product': np.asarray(w_products),
    'pids': np.asarray(pids),
}

# Persist the bundle as a gzip-compressed pickle.
with gzip.open('./datasets/test.chunk.pickle', 'wb') as fout:
    pickle.dump(test_dset, fout)

In [12]:
# Sanity check: display the first product id collected in `pids`.
pids[0]

'T4364497649'