In [0]:
import pickle
import h5py
import re
import tqdm
import numpy as np
import pandas as pd
import gzip

from collections import Counter
from keras.utils.np_utils import to_categorical

# Punctuation that is replaced by a space before product names are split into
# words. The pattern is a raw string so the backslashes reach the regex engine
# unchanged, and the hyphen is escaped: left unescaped, `)-=` is read as a
# character range that also removes the digits 0-9 and `:;<`, which wipes out
# model numbers such as "GLG4314P".
re_sc = re.compile(r"""[!@#$%^&*()\-=\[\]{}.,/?~+'"|]""")

Using TensorFlow backend.


In [0]:
# Mount Google Drive inside the Colab VM at /content/drive. The call prompts
# for an authorization code; the cells below read and write their files
# through the relative path './drive/My Drive/...'.
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/drive


### Load the raw file

In [0]:
# Read the training chunk from the mounted Drive folder into `df`.
df = pd.read_csv(
    './drive/My Drive/Colab Notebooks/CoE202_KakaoArena/train.chunk.csv',
    encoding='utf-8',
)

# Preview the first five rows (five is the default for head()).
df.head()

Unnamed: 0,pid,brand,model,maker,product,price,bcateid,mcateid,scateid,dcateid
0,O4486751463,퍼즐라이프,퍼즐라이프 직소퍼즐 바다거북의 여행,상품상세설명 참조,직소퍼즐 - 1000조각 바다거북의 여행 (PL1275),16520,1,1,2,-1
1,P3307178849,바보사랑,아이폰6S/6S+ tree farm101 - 다이어리케이스|아이폰6S/6S+,MORY|해당없음,[모리케이스]아이폰6S/6S+ tree farm101 - 다이어리케이스[바보사랑][...,20370,3,3,4,-1
2,R4424255515,크리비아,크리비아 기모 3부 속바지 GLG4314P,,크리비아 기모 3부 속바지 GLG4314P,-1,5,5,6,-1
3,F3334315393,잭앤질,[잭앤질] 남성 솔리드 절개라인 포인트 포켓 팬츠 31133PT002_NA,㈜크리스패션,[하프클럽/잭앤질]남성 솔리드 절개라인 포인트 포켓 팬츠 31133PT002_NA,16280,7,7,8,-1
4,N731678492,,SD코드프리혈당시험지[50매],기타,코드프리혈당시험지50매/코드프리시험지/최장유효기간,-1,10,9,11,-1


### Build label dictionary

In [0]:
# Build the label vocabulary: every distinct 'b>m>s>d' combination of the four
# category ids gets the next free integer index, in order of first appearance.
y_vocab = {}

sz = df['pid'].shape[0]                               # Number of rows (800,000 for the train chunk)

# Walk the four category columns in lock-step instead of doing four
# label-indexed Series lookups per row.
cate_rows = zip(df['bcateid'], df['mcateid'], df['scateid'], df['dcateid'])
for b, m, s, d in tqdm.tqdm(cate_rows, total=sz, mininterval=1):  # tqdm shows progress and the estimated remaining time
    class_name = '{}>{}>{}>{}'.format(b, m, s, d)     # Combine the four categories into one key
    if class_name not in y_vocab:                     # First time this combination is seen: register it
        y_vocab[class_name] = len(y_vocab)

# Class name -> index mapping used by the parsing cells below.
y_dict = {y: idx for idx, y in enumerate(y_vocab)}

# Persist the mapping with pickle protocol 2. The `with` block guarantees the
# file is flushed and closed, which the bare open() call did not.
with open('./drive/My Drive/Colab Notebooks/CoE202_KakaoArena/y_vocab.pickle', 'wb') as f:
    pickle.dump(y_dict, f, 2)

100%|██████████| 800000/800000 [00:35<00:00, 22522.77it/s]


### Parse the input data

In [0]:
# Convert every training row that has a usable product name into two
# fixed-width (32-slot) vectors -- hashed word ids and their counts -- plus
# the integer class label looked up in y_dict.
sz = df['pid'].shape[0]
products, w_products, labels = [], [], []
for i in tqdm.tqdm(range(sz), mininterval=1):
    # label: same 'b>m>s>d' key format that was used to build y_dict
    class_name = '{}>{}>{}>{}'.format(
        df['bcateid'][i], df['mcateid'][i], df['scateid'][i], df['dcateid'][i])
    Y = y_dict.get(class_name)
    # Y = to_categorical(Y, len(y_dict))

    # product
    feature = df['product'][i]                        # Product name of this row
    if not isinstance(feature, str) or feature == 'nan':
        continue                                      # Missing / non-string names are skipped entirely

    # Replace the symbols matched by re_sc with spaces, then split on whitespace.
    tokens = re_sc.sub(' ', feature).strip().split()

    # Keep only words that are 2 to 30 characters long.
    words = []
    for tok in tokens:
        tok = tok.strip()
        if 2 <= len(tok) < 31:
            words.append(tok)

    # Map each word to an integer in 1..100000 and count the occurrences.
    # NOTE(review): under Python 3, str hashes are salted per process unless
    # PYTHONHASHSEED is fixed, so these ids are only comparable between
    # datasets parsed in the same kernel session -- confirm this is intended.
    bucket_counts = Counter(hash(w) % 100000 + 1 for w in words)
    top = bucket_counts.most_common(32)               # At most the 32 most frequent ids

    ids = np.zeros(32, dtype=np.int32)                # Hashed word ids, zero-padded
    counts = np.zeros(32, dtype=np.int8)              # Matching occurrence counts, zero-padded
    for slot, (bucket, n_hits) in enumerate(top):
        ids[slot] = bucket
        counts[slot] = n_hits

    products.append(ids)
    w_products.append(counts)
    labels.append(Y)

100%|██████████| 800000/800000 [01:10<00:00, 11288.17it/s]


### Save the input vectors

In [0]:
# Stack the per-row vectors collected above into arrays and bundle them.
train_dset = dict(
    product=np.asarray(products),
    w_product=np.asarray(w_products),
    label=np.asarray(labels),
)

# Write the bundle as a gzip-compressed pickle on Drive.
with gzip.open('./drive/My Drive/Colab Notebooks/CoE202_KakaoArena/train.chunk.pickle', 'wb') as out_file:
    pickle.dump(train_dset, out_file)

### Parse the validation dataset

In [0]:
# Load the validation chunk (this rebinds `df`) and vectorize it in the same
# way as the training chunk: 32 hashed word ids, 32 counts, and a label per row.
df = pd.read_csv('./drive/My Drive/Colab Notebooks/CoE202_KakaoArena/valid.chunk.csv')
sz = df['pid'].shape[0]
products, w_products, labels = [], [], []
for i in tqdm.tqdm(range(sz), mininterval=1):
    # label: y_dict.get() yields None for a category combination that is not
    # in the label dictionary
    class_name = '{}>{}>{}>{}'.format(
        df['bcateid'][i], df['mcateid'][i], df['scateid'][i], df['dcateid'][i])
    Y = y_dict.get(class_name)
    # Y = to_categorical(Y, len(y_dict))

    # product
    feature = df['product'][i]
    if not isinstance(feature, str) or feature == 'nan':
        continue                                      # Missing / non-string names are skipped entirely

    # Replace the symbols matched by re_sc with spaces, then split on whitespace.
    tokens = re_sc.sub(' ', feature).strip().split()

    # Keep only words that are 2 to 30 characters long.
    words = []
    for tok in tokens:
        tok = tok.strip()
        if 2 <= len(tok) < 31:
            words.append(tok)

    # Map each word to an integer in 1..100000 and count the occurrences.
    # NOTE(review): str hashes are salted per process under Python 3 unless
    # PYTHONHASHSEED is fixed; run this in the same session as the training
    # parse so the ids line up -- confirm.
    bucket_counts = Counter(hash(w) % 100000 + 1 for w in words)
    top = bucket_counts.most_common(32)               # At most the 32 most frequent ids

    ids = np.zeros(32, dtype=np.int32)                # Hashed word ids, zero-padded
    counts = np.zeros(32, dtype=np.int8)              # Matching occurrence counts, zero-padded
    for slot, (bucket, n_hits) in enumerate(top):
        ids[slot] = bucket
        counts[slot] = n_hits

    products.append(ids)
    w_products.append(counts)
    labels.append(Y)

100%|██████████| 178830/178830 [00:16<00:00, 11140.39it/s]


### Save the validation vectors

In [0]:
# Stack the per-row validation vectors into arrays and bundle them.
valid_dset = dict(
    product=np.asarray(products),
    w_product=np.asarray(w_products),
    label=np.asarray(labels),
)

# Write the bundle as a gzip-compressed pickle on Drive.
with gzip.open('./drive/My Drive/Colab Notebooks/CoE202_KakaoArena/valid.chunk.pickle', 'wb') as out_file:
    pickle.dump(valid_dset, out_file)

### Parse the test dataset

In [0]:
# Load the test chunk (this rebinds `df`) and vectorize the product names.
# No labels are built here; the product id of every kept row is recorded instead.
df = pd.read_csv('./drive/My Drive/Colab Notebooks/CoE202_KakaoArena/test.chunk.csv')
sz = df['pid'].shape[0]
products, w_products, answer, pids = [], [], [], []   # `answer` is initialised but never filled in this cell
for i in tqdm.tqdm(range(sz), mininterval=1):

    # product
    feature = df['product'][i]
    pid = df['pid'][i]
    if not isinstance(feature, str) or feature == 'nan':
        continue                                      # Rows without a usable name get no entry in `pids`

    # Replace the symbols matched by re_sc with spaces, then split on whitespace.
    tokens = re_sc.sub(' ', feature).strip().split()

    # Keep only words that are 2 to 30 characters long.
    words = []
    for tok in tokens:
        tok = tok.strip()
        if 2 <= len(tok) < 31:
            words.append(tok)

    # Map each word to an integer in 1..100000 and count the occurrences.
    # NOTE(review): str hashes are salted per process under Python 3 unless
    # PYTHONHASHSEED is fixed; run this in the same session as the training
    # parse so the ids line up -- confirm.
    bucket_counts = Counter(hash(w) % 100000 + 1 for w in words)
    top = bucket_counts.most_common(32)               # At most the 32 most frequent ids

    ids = np.zeros(32, dtype=np.int32)                # Hashed word ids, zero-padded
    counts = np.zeros(32, dtype=np.int8)              # Matching occurrence counts, zero-padded
    for slot, (bucket, n_hits) in enumerate(top):
        ids[slot] = bucket
        counts[slot] = n_hits

    products.append(ids)
    w_products.append(counts)
    pids.append(pid)

100%|██████████| 178830/178830 [00:09<00:00, 19276.99it/s]


### Save the test vectors

In [0]:
# Stack the per-row test vectors into arrays and bundle them with the product ids.
test_dset = dict(
    product=np.asarray(products),
    w_product=np.asarray(w_products),
    pids=np.asarray(pids),
)

# Write the bundle as a gzip-compressed pickle on Drive.
with gzip.open('./drive/My Drive/Colab Notebooks/CoE202_KakaoArena/test.chunk.pickle', 'wb') as out_file:
    pickle.dump(test_dset, out_file)

In [0]:
# Display the first product id collected while parsing the test chunk.
pids[0]

'T4364497649'