In [1]:
import gzip
import pickle
import re
import zlib
from collections import Counter

import h5py
import numpy as np
import pandas as pd
import tqdm
from keras.utils.np_utils import to_categorical

# Punctuation that is replaced before tokenising product/brand/model/maker text.
# The hyphen is escaped on purpose: written as `\)-=` it formed a character
# *range* from ')' to '=', which silently matched every digit 0-9 as well as
# ':', ';' and '<', so tokens such as "PL1275" were reduced to "PL".
# With the hyphen escaped, only the characters literally listed are matched
# (':', ';' and '<' are therefore no longer stripped as a side effect).
re_sc = re.compile(r'''[!@#$%^&*()\-=\[\]{}.,/?~+'"|]''')

Using TensorFlow backend.


In [2]:
# Colab-only step: mount Google Drive at /content/drive so the dataset paths
# used by the cells below resolve. Mounting prompts for an authorization code.
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


### Load the raw file

In [3]:
# Read the training chunk from the mounted Drive folder, then preview it.
df = pd.read_csv(
    './drive/My Drive/CoE202TermProject/datasets/train.chunk.csv',
    encoding='utf-8',
)

df.head()

Unnamed: 0,pid,brand,model,maker,product,price,bcateid,mcateid,scateid,dcateid
0,O4486751463,퍼즐라이프,퍼즐라이프 직소퍼즐 바다거북의 여행,상품상세설명 참조,직소퍼즐 - 1000조각 바다거북의 여행 (PL1275),16520,1,1,2,-1
1,P3307178849,바보사랑,아이폰6S/6S+ tree farm101 - 다이어리케이스|아이폰6S/6S+,MORY|해당없음,[모리케이스]아이폰6S/6S+ tree farm101 - 다이어리케이스[바보사랑][...,20370,3,3,4,-1
2,R4424255515,크리비아,크리비아 기모 3부 속바지 GLG4314P,,크리비아 기모 3부 속바지 GLG4314P,-1,5,5,6,-1
3,F3334315393,잭앤질,[잭앤질] 남성 솔리드 절개라인 포인트 포켓 팬츠 31133PT002_NA,㈜크리스패션,[하프클럽/잭앤질]남성 솔리드 절개라인 포인트 포켓 팬츠 31133PT002_NA,16280,7,7,8,-1
4,N731678492,,SD코드프리혈당시험지[50매],기타,코드프리혈당시험지50매/코드프리시험지/최장유효기간,-1,10,9,11,-1


### Build label dictionary

In [4]:
# Build the label vocabulary: every distinct "b>m>s>d" category path found in
# the training frame gets a dense integer id, assigned in order of first
# appearance.
y_vocab = {}

sz = df['pid'].shape[0]                               # Number of rows (800,000 for the train chunk)

# Walk the four category columns in lock-step over their raw arrays; this is
# positional, so it does not depend on the frame's index labels and avoids a
# Series lookup per cell.
category_rows = zip(df['bcateid'].values, df['mcateid'].values,
                    df['scateid'].values, df['dcateid'].values)
for b, m, s, d in tqdm.tqdm(category_rows, total=sz, mininterval=1):
    class_name = '{}>{}>{}>{}'.format(b, m, s, d)     # Combine the four category levels into one key
    if class_name not in y_vocab:                     # First time this combination is seen
        y_vocab[class_name] = len(y_vocab)

# y_dict is the mapping used by preprocess(): class name -> label index.
y_dict = {y: idx for idx, y in enumerate(y_vocab)}

# Use a context manager so the handle is flushed and closed deterministically
# (the previous bare open() was never closed).
with open('./drive/My Drive/CoE202TermProject/datasets/y_vocab.pickle', 'wb') as f:
    pickle.dump(y_dict, f, 2)

100%|██████████| 800000/800000 [00:41<00:00, 19340.44it/s]


### Parse the input data

In [0]:
def _stable_hash(token, n_buckets=100000):
    """Map a string to a bucket id in [1, n_buckets] that is stable across runs.

    The built-in hash() is salted per interpreter process for str, so ids
    produced in one kernel session would not match those of another. CRC32 of
    the UTF-8 bytes is deterministic. 0 is left free for "missing".
    """
    return zlib.crc32(token.encode('utf-8')) % n_buckets + 1


def _encode_tokens(text, max_tokens=32):
    """Turn free text into (token ids, token counts), each of length max_tokens.

    Non-string input (e.g. NaN for a missing cell) yields two all-zero arrays.
    """
    ids = np.zeros(max_tokens, dtype=np.int32)
    counts = np.zeros(max_tokens, dtype=np.int8)
    if not isinstance(text, str):
        return ids, counts

    # Replace punctuation by spaces, split on whitespace, and keep only
    # tokens of 2..30 characters.
    tokens = [w for w in re_sc.sub(' ', text).split() if 2 <= len(w) < 31]
    hashed = [_stable_hash(w) for w in tokens]

    # Keep the most frequent buckets; clamp counts so they fit in int8.
    count_cap = np.iinfo(np.int8).max
    for j, (bucket, n) in enumerate(Counter(hashed).most_common(max_tokens)):
        ids[j] = bucket
        counts[j] = min(n, count_cap)
    return ids, counts


def _encode_name(text):
    """Hash a short name field (brand / maker) to one bucket id, 0 if unusable.

    Missing or empty values, and values containing "상세" or "참조", map to 0.
    """
    if not isinstance(text, str) or not text:
        return 0
    if "상세" in text or "참조" in text:
        return 0
    # These fields are nearly always a single word, so hash the whole string
    # after dropping punctuation.
    return _stable_hash(re_sc.sub('', text))


def preprocess(df, save_label = True):
    """Convert a product DataFrame into feature rows plus labels (or pids).

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns 'pid', 'product', 'brand', 'model', 'maker' and
        'price'; when save_label is true also 'bcateid', 'mcateid',
        'scateid' and 'dcateid'.
    save_label : bool
        True  -> second return value holds label indices looked up in the
                 module-level y_dict (None where the category combination is
                 not in y_dict).
        False -> second return value holds the 'pid' of each row instead.

    Returns
    -------
    products : numpy.ndarray, shape (len(df), 7), dtype object
        Per row: brand id, model token ids, model token counts, maker id,
        product token ids, product token counts, price (0 if missing).
    labels : numpy.ndarray
        Label indices or pids, depending on save_label.
    """
    sz = df['pid'].shape[0]

    # Extract each column once as a plain array: access is positional (so a
    # non-default index cannot break the lookups) and much cheaper than
    # df[col][i] inside the loop.
    pid_col = df['pid'].values
    product_col = df['product'].values
    brand_col = df['brand'].values
    model_col = df['model'].values
    maker_col = df['maker'].values
    price_col = df['price'].values
    if save_label:
        cate_cols = [df[name].values
                     for name in ('bcateid', 'mcateid', 'scateid', 'dcateid')]

    # The rows mix scalars and length-32 arrays, so the container must be an
    # object array filled cell by cell; letting NumPy infer the shape from
    # such ragged nested input is an error on recent NumPy releases.
    products = np.empty((sz, 7), dtype=object)
    labels = []

    for i in tqdm.tqdm(range(sz), mininterval=1):
        if save_label:
            class_name = '{}>{}>{}>{}'.format(*(col[i] for col in cate_cols))
            labels.append(y_dict.get(class_name))
        else:
            labels.append(pid_col[i])

        words_i, word_counts_i = _encode_tokens(product_col[i])
        model_i, model_counts_i = _encode_tokens(model_col[i])
        brand_i = _encode_name(brand_col[i])
        maker_i = _encode_name(maker_col[i])

        price = price_col[i]
        price_i = 0 if pd.isnull(price) else price

        row = (brand_i, model_i, model_counts_i, maker_i,
               words_i, word_counts_i, price_i)
        for k, value in enumerate(row):
            products[i, k] = value

    return products, np.asarray(labels)


### Save the input vectors

In [6]:
# Vectorise the training frame, then store features and labels together as
# one gzip-compressed pickle.
products, labels = preprocess(df)

train_dset = dict(
    product=np.asarray(products),
    label=np.asarray(labels),
)

with gzip.open(
    './drive/My Drive/CoE202TermProject/datasets/train.chunk.pickle', 'wb'
) as f:
    pickle.dump(train_dset, f)

100%|██████████| 800000/800000 [02:51<00:00, 4668.93it/s]


### Parse the validation dataset

In [7]:
# Load the validation chunk and vectorise it with the same pipeline as the
# training data. The path and encoding now match the train load (the previous
# path contained a doubled '/' separator).
df = pd.read_csv('./drive/My Drive/CoE202TermProject/datasets/valid.chunk.csv', encoding='utf-8')
products, labels = preprocess(df)

100%|██████████| 178830/178830 [00:40<00:00, 4462.55it/s]


### Save the validation vectors

In [0]:
# Bundle the validation features and labels and write them as a
# gzip-compressed pickle.
valid_dset = dict(
    product=np.asarray(products),
    label=np.asarray(labels),
)

with gzip.open(
    './drive/My Drive/CoE202TermProject/datasets/valid.chunk.pickle', 'wb'
) as f:
    pickle.dump(valid_dset, f)

### Parse the test dataset

In [9]:
# Load the test chunk. With save_label=False, preprocess() returns each row's
# pid in place of a label index. Encoding is given explicitly to match the
# train load.
df = pd.read_csv('./drive/My Drive/CoE202TermProject/datasets/test.chunk.csv', encoding='utf-8')
products, pids = preprocess(df, save_label=False)

100%|██████████| 178830/178830 [00:30<00:00, 5855.05it/s]


### Save the test vectors

In [0]:
# Bundle the test features with their product ids and write them as a
# gzip-compressed pickle.
test_dset = dict(
    product=np.asarray(products),
    pids=np.asarray(pids),
)

with gzip.open(
    './drive/My Drive/CoE202TermProject/datasets/test.chunk.pickle', 'wb'
) as f:
    pickle.dump(test_dset, f)

In [11]:
# Spot-check the first entry returned for the test set: in save_label=False
# mode preprocess() returns the row's 'pid' value here, not a label index.
pids[0]

'T4364497649'