## Embedding Layler

In [20]:
import numpy as np
from gensim.models import Word2Vec, KeyedVectors
import os
import params

embed_lookup = KeyedVectors.load_word2vec_format(params.w2v_path, 
                                                 binary=True)

print('변경 전')
print(embed_lookup.vectors.shape)
# 
# # <pad> vector, index 추가
# 
pad_vectors = np.zeros_like(embed_lookup.vectors[0])
embed_lookup.vectors = np.insert(embed_lookup.vectors, 0, pad_vectors,axis=0)
embed_lookup.index2word.insert(0,'<pad>')
print('변경 후')
print(embed_lookup.vectors.shape)

변경 전
(41721, 200)
변경 후
(41722, 200)


In [21]:
# store pretrained vocab
pretrained_words = []
for word in embed_lookup.vocab:
    pretrained_words.append(word)

pretrained_words.insert(0,'<pad>')

In [22]:
row_idx = 1

# get word/embedding in that row
word = pretrained_words[row_idx] # get words by index
embedding = embed_lookup[word] # embeddings by word

# vocab and embedding info
print("Size of Vocab: {}\n".format(len(pretrained_words)))
print('Word in vocab: {}\n'.format(word))
print('Length of embedding: {}\n'.format(len(embedding)))
#print('Associated embedding: \n', embedding)

Size of Vocab: 41722

Word in vocab: g

Length of embedding: 200



In [23]:
# print a few common words
for i in range(5):
    print(pretrained_words[i])

<pad>
g
x
개
가공식품


## Data preprocessing

In [24]:
import pandas as pd
import numpy as np
from sklearn.utils import shuffle

train_df = pd.read_csv('data/tarin_prodNm.csv', encoding='euc-kr')
train_df = shuffle(train_df, random_state=33).reset_index(drop=True)
train_df.head()

Unnamed: 0,prodNm,cleaned_prodNm,label
0,"피아토스 감자칩 바베큐맛 85g 피아토스,감자칩,수입감자칩,수입과자",피아토스 감자칩 바베큐맛 g 피아토스 감자칩 수입감자칩 수입과자,0
1,청우 왕사탕 500g/사탕/왕사탕/캔디/왕캔디/간식,청우 왕사탕 g 사탕 왕사탕 캔디 왕캔디 간식,0
2,ACER Swift3 SF314-52G-59WM용 저반사필름,acer swift sf - g- wm용 저반사필름,1
3,"앤디스 크림 데 민트 띤 132g 고급초콜렛,수입초콜렛,수입사탕",앤디스 크림 데 민트 띤 g 고급초콜렛 수입초콜렛 수입사탕,0
4,맥심 오리지널 20T 24입 커피/차/꿀 무료배송,맥심 오리지널 t 입 커피 차 꿀 무료배송,0


In [25]:
cleaned_prodNm = train_df['cleaned_prodNm'].values.tolist()
encoded_labels = train_df['label'].values

In [26]:
# convert prodNm to tokens

def tokenize_all_prodNm(embed_lookup, cleaned_prodNm):
    
    # split each prodNm into a list of words
    prodNm_words = [prodNm.split() for prodNm in cleaned_prodNm]

    tokenized_prodNms = []
    for prodNm in prodNm_words:
        ints = []
        for word in prodNm:
            try:
                idx = embed_lookup.vocab[word].index +1
            except: 
                idx = 0
            ints.append(idx)
            
        tokenized_prodNms.append(ints)
    
    return tokenized_prodNms

tokenize_all_cleaned_prodNms = tokenize_all_prodNm(embed_lookup, cleaned_prodNm)
tokenize_all_cleaned_prodNms[0]

[4029, 480, 977, 1, 4029, 480, 6626, 183]

In [27]:
# testing code and printing a tokenized review
print(tokenize_all_cleaned_prodNms[0])

[4029, 480, 977, 1, 4029, 480, 6626, 183]


In [28]:
# fill padding into toekized_prodNms

def pad_features(tokenize_all_cleaned_prodNms, seq_length):
    ''' Return features of tokenized_prodNms, where each review is padded with 0's 
        or truncated to the input seq_length.
    '''
    
    # getting the correct rows x cols shape
    features = np.zeros((len(tokenize_all_cleaned_prodNms), seq_length), dtype=int)

    # for each review, I grab that review and 
    for i, row in enumerate(tokenize_all_cleaned_prodNms):
        features[i, -len(row):] = np.array(row)[:seq_length]
    
    return features

features = pad_features(tokenize_all_cleaned_prodNms,15)
features[0]

array([   0,    0,    0,    0,    0,    0,    0, 4029,  480,  977,    1,
       4029,  480, 6626,  183])

## Data loader

In [42]:
split_frac = 0.8

## split data into training, validation, and test data (features and labels, x and y)

split_idx = int(len(features)*split_frac)
train_x, remaining_x = features[:split_idx], features[split_idx:]
train_y, remaining_y = encoded_labels[:split_idx], encoded_labels[split_idx:]

test_idx = int(len(remaining_x)*0.5)
val_x, test_x = remaining_x[:test_idx], remaining_x[test_idx:]
val_y, test_y = remaining_y[:test_idx], remaining_y[test_idx:]

## print out the shapes of your resultant feature data
print("\t\t\tFeature Shapes:")
print("Train set: \t\t{}".format(train_x.shape), 
      "\nValidation set: \t{}".format(val_x.shape),
      "\nTest set: \t\t{}".format(test_x.shape))

			Feature Shapes:
Train set: 		(11411, 15) 
Validation set: 	(1426, 15) 
Test set: 		(1427, 15)


In [30]:
import torch
from torch.utils.data import TensorDataset, DataLoader

# create Tensor datasets
train_data = TensorDataset(torch.from_numpy(train_x), torch.from_numpy(train_y))
valid_data = TensorDataset(torch.from_numpy(val_x), torch.from_numpy(val_y))
test_data = TensorDataset(torch.from_numpy(test_x), torch.from_numpy(test_y))

# dataloaders
batch_size = 12

# shuffling and batching data
train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
valid_loader = DataLoader(valid_data, shuffle=True, batch_size=batch_size)
test_loader = DataLoader(test_data, shuffle=True, batch_size=batch_size)

In [31]:
# First checking if GPU is available
train_on_gpu=torch.cuda.is_available()

if(train_on_gpu):
    print('Training on GPU.')
else:
    print('No GPU available, training on CPU.')

Training on GPU.


## Define model

In [39]:
import torch
import torch.nn as nn
import torch.nn.functional as F

# channel, height, width
class ProdnmCNN(nn.Module):
    """
    The embedding layer + CNN model that will be used to perform sentiment analysis.
    """

    def __init__(self, embed_model, vocab_size, output_size, embedding_dim,
                 num_filters=100, kernel_sizes=[3, 4, 5], freeze_embeddings=True, drop_prob=0.5):
        """
        Initialize the model by setting up the layers.
        """
        super(ProdnmCNN, self).__init__()

        # set class vars
        self.num_filters = num_filters
        self.embedding_dim = embedding_dim

        # 1. embedding layer
        self.embedding = nn.Embedding(vocab_size, embedding_dim)

        # set weights to pre-trained
        self.embedding.weight = nn.Parameter(torch.from_numpy(embed_model.vectors))  # all vectors

        #         # (optional) freeze embedding weights
        #         if freeze_embeddings:
        #             self.embedding.requires_grad = False

        # 2. convolutional layers
        self.convs_1d = nn.ModuleList([
            nn.Conv2d(1, num_filters, (k, embedding_dim), padding=(k - 2, 0))
            for k in kernel_sizes])

        # 3. final, fully-connected layer for classification
        self.fc = nn.Linear(len(kernel_sizes) * num_filters, output_size)

        # 4. dropout and sigmoid layers
        self.dropout = nn.Dropout(drop_prob)
        self.sig = nn.Sigmoid()

    def conv_and_pool(self, x, conv):
        """
        Convolutional + max pooling layer
        """
        # squeeze last dim to get size: (batch_size, num_filters, conv_seq_length)
        # conv_seq_length will be ~ 200
        x = F.relu(conv(x)).squeeze(3)

        # 1D pool over conv_seq_length
        # squeeze to get size: (batch_size, num_filters)
        x_max = F.max_pool1d(x, x.size(2)).squeeze(2)
        return x_max

    def forward(self, x):
        """
        Defines how a batch of inputs, x, passes through the model layers.
        Returns a single, sigmoid-activated class score as output.
        """
        # embedded vectors
        embeds = self.embedding(x)  # (batch_size, seq_length, embedding_dim)
        # embeds.unsqueeze(1) creates a channel dimension that conv layers expect
        embeds = embeds.unsqueeze(1)

        # get output of each conv-pool layer
        conv_results = [self.conv_and_pool(embeds, conv) for conv in self.convs_1d]

        # concatenate results and add dropout
        x = torch.cat(conv_results, 1)
        x = self.dropout(x)

        # final logit
        logit = self.fc(x)

        # sigmoid-activated --> a class score
        return self.sig(logit)

## Training

In [41]:
# Instantiate the model w/ hyperparams

vocab_size = len(pretrained_words)
output_size = 1 # binary class (1 or 0)
embedding_dim = len(embed_lookup[pretrained_words[1]]) # 300-dim vectors
num_filters = 100
kernel_sizes = [3, 4, 5]

net = ProdnmCNN(embed_lookup, vocab_size, output_size, embedding_dim,
                   num_filters, kernel_sizes)

print(net)

# loss and optimization functions
lr=0.001

criterion = nn.BCELoss()
optimizer = torch.optim.Adam(net.parameters(), lr=lr)

ProdnmCNN(
  (embedding): Embedding(41722, 200)
  (convs_1d): ModuleList(
    (0): Conv2d(1, 100, kernel_size=(3, 200), stride=(1, 1), padding=(1, 0))
    (1): Conv2d(1, 100, kernel_size=(4, 200), stride=(1, 1), padding=(2, 0))
    (2): Conv2d(1, 100, kernel_size=(5, 200), stride=(1, 1), padding=(3, 0))
  )
  (fc): Linear(in_features=300, out_features=1, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
  (sig): Sigmoid()
)


In [33]:
# training loop
def train(net, train_loader, epochs, print_every=100):

    # move model to GPU, if available
    if(train_on_gpu):
        net.cuda()

    counter = 0 # for printing
    
    # train for some number of epochs
    net.train()
    for e in range(epochs):

        # batch loop
        for inputs, labels in train_loader:
            counter += 1
            
            if(train_on_gpu):
                inputs, labels = inputs.cuda(), labels.cuda()

            # zero accumulated gradients
            net.zero_grad()

            # get the output from the model
            output = net(inputs)

            # calculate the loss and perform backprop
            loss = criterion(output.squeeze(), labels.float())
            loss.backward()
            optimizer.step()

            # loss stats
            if counter % print_every == 0:
                # Get validation loss
                val_losses = []
                num_correct = 0
                
                net.eval()
                
                for inputs, labels in valid_loader:

                    if(train_on_gpu):
                        inputs, labels = inputs.cuda(), labels.cuda()

                    output = net(inputs)
                    val_loss = criterion(output.squeeze(), labels.float())
                    val_losses.append(val_loss.item())
                    
                    ## val acc
                    # convert output probabilities to predicted class (0 or 1)
                    pred = torch.round(output.squeeze())  # rounds to the nearest integer

                    # compare predictions to true label
                    correct_tensor = pred.eq(labels.float().view_as(pred))
                    correct = np.squeeze(correct_tensor.numpy()) if not train_on_gpu else np.squeeze(correct_tensor.cpu().numpy())
                    num_correct += np.sum(correct)
                    
                net.train()
                
                print("Epoch: {}/{}...".format(e+1, epochs),
                      "Step: {}...".format(counter),
                      "Loss: {:.6f}...".format(loss.item()),
                      "Val Loss: {:.6f}".format(np.mean(val_losses)),
                      "Val acc: {:.3f}".format(np.mean(num_correct/len(valid_loader.dataset))))

In [34]:
import time
# training params

start = time.time()

epochs = 20 # this is approx where I noticed the validation loss stop decreasing
print_every = 100

train(net, train_loader, epochs, print_every=print_every)

end = time.time()

print('turnaround time : {0}'.format(end - start))

Epoch: 1/20... Step: 100... Loss: 0.195344... Val Loss: 0.269833 Val acc: 0.889
Epoch: 1/20... Step: 200... Loss: 0.330087... Val Loss: 0.232230 Val acc: 0.904
Epoch: 1/20... Step: 300... Loss: 0.121067... Val Loss: 0.224658 Val acc: 0.894
Epoch: 1/20... Step: 400... Loss: 0.218031... Val Loss: 0.240206 Val acc: 0.901
Epoch: 1/20... Step: 500... Loss: 0.176410... Val Loss: 0.240101 Val acc: 0.906
Epoch: 1/20... Step: 600... Loss: 0.139371... Val Loss: 0.175175 Val acc: 0.924
Epoch: 1/20... Step: 700... Loss: 0.052601... Val Loss: 0.183689 Val acc: 0.914
Epoch: 1/20... Step: 800... Loss: 0.068933... Val Loss: 0.167358 Val acc: 0.935
Epoch: 1/20... Step: 900... Loss: 0.057072... Val Loss: 0.163043 Val acc: 0.938
Epoch: 2/20... Step: 1000... Loss: 0.066954... Val Loss: 0.145152 Val acc: 0.941
Epoch: 2/20... Step: 1100... Loss: 0.122104... Val Loss: 0.166225 Val acc: 0.931
Epoch: 2/20... Step: 1200... Loss: 0.141533... Val Loss: 0.144449 Val acc: 0.939
Epoch: 2/20... Step: 1300... Loss: 0.

In [35]:
# Get test data loss and accuracy

test_losses = [] # track loss
num_correct = 0


net.eval()
# iterate over test data
for inputs, labels in test_loader:

    if(train_on_gpu):
        inputs, labels = inputs.cuda(), labels.cuda()
    
    # get predicted outputs
    output = net(inputs)
    
    # calculate loss
    test_loss = criterion(output.squeeze(), labels.float())
    test_losses.append(test_loss.item())
    
    # convert output probabilities to predicted class (0 or 1)
    pred = torch.round(output.squeeze())  # rounds to the nearest integer
    
    # compare predictions to true label
    correct_tensor = pred.eq(labels.float().view_as(pred))
    correct = np.squeeze(correct_tensor.numpy()) if not train_on_gpu else np.squeeze(correct_tensor.cpu().numpy())
    num_correct += np.sum(correct)


# -- stats! -- ##
# avg test loss
print("Test loss: {:.3f}".format(np.mean(test_losses)))

# accuracy over all test data
test_acc = num_correct/len(test_loader.dataset)
print("Test accuracy: {:.3f}".format(test_acc))

Test loss: 0.651
Test accuracy: 0.954


## Test inference

In [36]:
title, label = next(iter(test_loader))

In [37]:
# text 복원
def find_title(cleaend_title, df):
    
    cleaned_title = " ".join([i for i in list(map(cleaend_title, word_list)) if '<pad>' not in i])
    
    return cleaned_title

def return_word(idx):
    word = embed_lookup.index2word[idx]
    return word


In [38]:
for i in range(0,133):
    inputs, labels = next(iter(test_loader))

    if(train_on_gpu):
        inputs, labels = inputs.cuda(), labels.cuda()

    # get predicted outputs
    output = net(inputs)
    preds = torch.round(output.squeeze())


    word_idx_list = inputs.cpu().numpy().tolist()
    labels = labels.cpu().numpy().tolist()
    preds = preds.cpu().detach().numpy().tolist()

    for num, word_idx in enumerate(word_idx_list):
        cleaned_title_word = list(map(return_word, word_idx))
        cleaned_title = " ".join([i for i in cleaned_title_word if '<pad>' not in i ])

        title = train_df[train_df['cleaned_prodNm'] == cleaned_title]['prodNm'].iloc[0]
        label,pred = labels[num], preds[num]

        if label == 0:
            err = '오류'
        else:
            err= '정상'

        if label == pred:
            correct = '성공'
        else:
            correct = '실패'

        print('제목 : {0}'.format(title))
        print('예측 : {0}, 정답 : {1}, \n오류 유무 : {2}, 예측 결과 : {3}'.format(pred, label, err,correct))
        print('='*100)

제목 : 제주 크런치 3종(감귤,백년초,한라봉)세트 x 5 (사은품증정)
예측 : 0.0, 정답 : 0, 
오류 유무 : 오류, 예측 결과 : 성공
제목 : (GIGA) Aorus X5 V4용고광택필름
예측 : 1.0, 정답 : 1, 
오류 유무 : 정상, 예측 결과 : 성공
제목 : 팔도)불짬뽕 왕컵 115g x 16개
예측 : 1.0, 정답 : 1, 
오류 유무 : 정상, 예측 결과 : 성공
제목 : 부침가루(이츠웰 1K) 식자재 부침가루 기타분말가루
예측 : 1.0, 정답 : 1, 
오류 유무 : 정상, 예측 결과 : 성공
제목 : 하나 진 우골농축액 505 사골농축액 840g 12ea 1box
예측 : 1.0, 정답 : 1, 
오류 유무 : 정상, 예측 결과 : 성공
제목 : 리터스포트 요거트 초콜렛 100g 고급초콜렛/수입초콜렛/수입사탕
예측 : 0.0, 정답 : 0, 
오류 유무 : 오류, 예측 결과 : 성공
제목 : cj햇반 컵반 레드스파이시커리덮밥 1개
예측 : 1.0, 정답 : 1, 
오류 유무 : 정상, 예측 결과 : 성공
제목 : 업소용 위생식탁보(특상)(90x120)280매
예측 : 0.0, 정답 : 1, 
오류 유무 : 정상, 예측 결과 : 실패
제목 : 오리온 왕꿈틀이67g 껌/카라멜/젤리
예측 : 0.0, 정답 : 0, 
오류 유무 : 오류, 예측 결과 : 성공
제목 : 가공식품 선명한필름 S550-P.AF65KN용고광택필름 필름 고광
예측 : 1.0, 정답 : 1, 
오류 유무 : 정상, 예측 결과 : 성공
제목 : CJ 제일제면소 중면 900g 15입
예측 : 0.0, 정답 : 1, 
오류 유무 : 정상, 예측 결과 : 실패
제목 : [무료배송]코모리 후루츠캔디 180g X 3 과일맛캔디,일본인기사탕,수입사탕
예측 : 0.0, 정답 : 0, 
오류 유무 : 오류, 예측 결과 : 성공
제목 : 고바 15U570 필름 15U570용 쌀국수 고광택필름 전용필름 
예측 : 1.0, 정답 : 1, 
오류 유무 : 정상, 예측 결과 : 성공

IndexError: single positional indexer is out-of-bounds