## Initialize

### Importing Libraries

In [0]:
import gzip
import numpy as np
import pandas as pd
import pickle
import re

from collections import Counter
from tqdm import tqdm

### Defining some constants

In [0]:
# Matches every character that is NOT a Hangul jamo (ㄱ-ㅎ, ㅏ-ㅣ), a Hangul
# syllable (가-힣), an ASCII letter, an ASCII digit, or a space. Used with
# `.sub(...)` to strip or blank out everything else in the text columns.
re_sc = re.compile(r"[^ㄱ-ㅎㅏ-ㅣ가-힣a-zA-Z0-9 ]")
# Root directory that contains the `datasets/` folder; defaults to the current
# working directory.
base_path = '.'

### Mounting File System

In [3]:
# Mount Google Drive when running on Colab and point `base_path` at the
# project folder there. Outside Colab the `google.colab` package does not
# exist, so the import is guarded and the previously configured `base_path`
# is kept, which lets the notebook run against a local `datasets/` folder.
try:
    from google.colab import drive
except ImportError:
    pass
else:
    drive.mount('/content/drive')

    # NOTE: this relative path only resolves to the mount point when the
    # working directory is /content (the Colab default).
    base_path = './drive/My Drive/CoE202TermProject'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Building Label Dictionary

### Load Dataframe

In [4]:
# Read the training chunk into a DataFrame.
df = pd.read_csv(base_path + '/datasets/train.chunk.csv', encoding='utf-8')

# Show the first five rows as a quick sanity check.
df.head()

Unnamed: 0,pid,brand,model,maker,product,price,bcateid,mcateid,scateid,dcateid
0,O4486751463,퍼즐라이프,퍼즐라이프 직소퍼즐 바다거북의 여행,상품상세설명 참조,직소퍼즐 - 1000조각 바다거북의 여행 (PL1275),16520,1,1,2,-1
1,P3307178849,바보사랑,아이폰6S/6S+ tree farm101 - 다이어리케이스|아이폰6S/6S+,MORY|해당없음,[모리케이스]아이폰6S/6S+ tree farm101 - 다이어리케이스[바보사랑][...,20370,3,3,4,-1
2,R4424255515,크리비아,크리비아 기모 3부 속바지 GLG4314P,,크리비아 기모 3부 속바지 GLG4314P,-1,5,5,6,-1
3,F3334315393,잭앤질,[잭앤질] 남성 솔리드 절개라인 포인트 포켓 팬츠 31133PT002_NA,㈜크리스패션,[하프클럽/잭앤질]남성 솔리드 절개라인 포인트 포켓 팬츠 31133PT002_NA,16280,7,7,8,-1
4,N731678492,,SD코드프리혈당시험지[50매],기타,코드프리혈당시험지50매/코드프리시험지/최장유효기간,-1,10,9,11,-1


### Build

In [5]:
# Maps a class-name string to an integer index.
y_vocab = {}

def item_to_class_name(df, i):
    """Return the class name of row `i` as 'bcateid>mcateid>scateid>dcateid'.

    Args:
        df: DataFrame holding the four category-id columns.
        i: Key passed to each column's `[]` lookup to select the row.

    Returns:
        The four category ids of the row joined with '>'.
    """
    category_columns = ('bcateid', 'mcateid', 'scateid', 'dcateid')
    category_ids = [df[column][i] for column in category_columns]
    return '{}>{}>{}>{}'.format(*category_ids)

# Give every distinct category path an index, in order of first appearance.
size = df['pid'].shape[0]
for i in tqdm(range(size)):
    class_name = item_to_class_name(df, i)
    if class_name not in y_vocab:
        y_vocab[class_name] = len(y_vocab)

# Dicts iterate in insertion order, so this yields the same
# class-name -> index mapping that y_vocab already holds.
y_dict = {y: idx for idx, y in enumerate(y_vocab)}

# Use a context manager so the file is flushed and closed deterministically
# (the bare `open(...)` passed to pickle.dump was never closed).
with open(base_path + '/datasets/y_vocab.pickle', 'wb') as f:
    pickle.dump(y_dict, f, 2)

100%|██████████| 800000/800000 [00:36<00:00, 22061.35it/s]


## Preprocessing data

### Defining Helper Functions

In [0]:
# Maps a word to a two-element list: [word_id, occurrence_count].
x_vocab = {}

def add_to_vocab(word):
    """Register one occurrence of `word` and return its vocabulary id.

    Ids start at 1. Id 0 is deliberately never handed out because the
    feature encoder uses 0 for "missing field" and for zero-padding; handing
    0 to the first word seen would make that word indistinguishable from
    padding.

    Args:
        word: The token to look up or insert.

    Returns:
        The integer id of `word` (>= 1). A new id is allocated on first
        sight; on every call the word's occurrence count is incremented.
    """
    entry = x_vocab.get(word)
    if entry is None:
        # New word: next free id, shifted by one to keep 0 reserved.
        entry = [len(x_vocab) + 1, 0]
        x_vocab[word] = entry

    entry[1] += 1
    return entry[0]

def parse_word(word):
    """Strip every character matched by `re_sc` from `word` and return the
    vocabulary id of what remains (registering it via `add_to_vocab`)."""
    cleaned = re_sc.sub('', word)
    word_id = add_to_vocab(cleaned)
    return word_id

def parse_long_text(text, max_size=32):
    """Tokenize free text and return the vocabulary ids of its words.

    Characters matched by `re_sc` are replaced with spaces, the text is split
    on whitespace, and tokens shorter than two characters are dropped.

    Args:
        text: The raw string to tokenize.
        max_size: Maximum number of ids to return.

    Returns:
        A list of at most `max_size` word ids in order of appearance, or an
        empty list when no token survives the filtering.
    """
    cleaned = re_sc.sub(' ', text).strip()
    tokens = [token for token in cleaned.split() if len(token) >= 2]

    if not tokens:
        return []

    # Every token is registered in the vocabulary (and counted), including
    # those that fall beyond `max_size`; only the returned ids are truncated.
    token_ids = [add_to_vocab(token) for token in tokens]
    return token_ids[:max_size]


### Defining Preprocessing Function

In [0]:
def _encode_word_field(value):
    """Encode a single-token column (brand / maker) as one vocabulary id.

    Returns 0 when the cell is not a non-empty string (e.g. NaN) or when it
    contains "상세" or "참조"; otherwise returns `parse_word(value)`.
    """
    if not isinstance(value, str) or not value:
        return 0
    if "상세" in value or "참조" in value:
        return 0
    return parse_word(value)


def _encode_text_field(value, size):
    """Encode a free-text column as a zero-padded int32 vector of `size` ids."""
    ids = np.zeros(size, dtype=np.int32)
    if isinstance(value, str) and value:
        word_ids = parse_long_text(value, size)
        if word_ids:
            ids[:len(word_ids)] = word_ids
    return ids


def preprocess(df, save_label = True, model_len = 16, product_len = 32):
    """Turn every row of `df` into a feature record plus a label (or pid).

    Each feature record is a 1-D object array of five entries, in order:
    brand id (int), model ids (int32 array of `model_len`), maker id (int),
    product ids (int32 array of `product_len`) and price.

    Args:
        df: DataFrame with the columns 'pid', 'brand', 'model', 'maker',
            'product', 'price' and, when `save_label` is true, the four
            category-id columns read by `item_to_class_name`.
        save_label: When true, the second return value holds class indices
            looked up in `y_dict`; when false it holds the rows' 'pid' values.
        model_len: Number of word ids kept for the 'model' column.
        product_len: Number of word ids kept for the 'product' column.

    Returns:
        A `(features, labels)` tuple of equally long lists.

    Note:
        Class names absent from `y_dict` yield a label of None because the
        lookup uses `y_dict.get`.
    """
    sz = df['pid'].shape[0]
    features, labels = [], []

    # Resolve each column once instead of once per row.
    pid_col = df['pid']
    brand_col = df['brand']
    model_col = df['model']
    maker_col = df['maker']
    product_col = df['product']
    price_col = df['price']

    for i in tqdm(range(sz)):
        if save_label:
            class_name = item_to_class_name(df, i)
            labels.append(y_dict.get(class_name))
        else:
            labels.append(pid_col[i])

        # The order of these calls matters: it decides the order in which
        # new words receive their vocabulary ids.
        brand_i = _encode_word_field(brand_col[i])
        model_i = _encode_text_field(model_col[i], model_len)
        maker_i = _encode_word_field(maker_col[i])
        product_i = _encode_text_field(product_col[i], product_len)

        # A missing price is encoded as 0.
        price = price_col[i]
        price_i = 0 if pd.isnull(price) else price

        # The record mixes scalars with arrays of different lengths. Build the
        # object array explicitly: np.asarray() on such a ragged list raises
        # ValueError on NumPy >= 1.24 unless dtype=object is requested.
        record = np.empty(5, dtype=object)
        for slot, value in enumerate((brand_i, model_i, maker_i, product_i, price_i)):
            record[slot] = value
        features.append(record)

    return features, labels


### Preprocessing Train Set

In [8]:
# Encode the training frame and bundle features with their class labels.
features, labels = preprocess(df)
train_dset = dict(
    features=np.asarray(features),
    labels=np.asarray(labels),
)

# Persist the encoded training set as a gzip-compressed pickle.
with gzip.open(base_path + '/datasets/train.chunk.pickle', 'wb') as f:
    pickle.dump(train_dset, f)

100%|██████████| 800000/800000 [01:59<00:00, 6674.55it/s]


### Preprocessing Validation Set

In [9]:
# Load the validation chunk (rebinds `df`).
df = pd.read_csv(base_path + '/datasets/valid.chunk.csv', encoding='utf-8')

# Encode it and bundle the features with their class labels.
features, labels = preprocess(df)
valid_dset = dict(
    features=np.asarray(features),
    labels=np.asarray(labels),
)

# Persist the encoded validation set as a gzip-compressed pickle.
with gzip.open(base_path + '/datasets/valid.chunk.pickle', 'wb') as f:
    pickle.dump(valid_dset, f)

100%|██████████| 178830/178830 [00:27<00:00, 6615.69it/s]


### Preprocessing Test Set

In [10]:
# Load the test chunk (rebinds `df`).
df = pd.read_csv(
    base_path + '/datasets/test.chunk.csv',
    encoding='utf-8'
)

# The test set has no labels to look up, so keep each row's pid instead.
features, pids = preprocess(df, save_label = False)

# Stored under its own name: reusing `valid_dset` here would silently replace
# the validation dataset held in the kernel with test data.
test_dset = {
    'features': np.asarray(features),
    'pids': np.asarray(pids)
}

# Persist the encoded test set as a gzip-compressed pickle.
with gzip.open(base_path + '/datasets/test.chunk.pickle', 'wb') as f:
    pickle.dump(test_dset, f)

100%|██████████| 178830/178830 [00:19<00:00, 9138.51it/s]
