# 18. Word2Vec

In [1]:
import pandas as pd

from konlpy.tag import Twitter
from gensim.models import word2vec

import torch
import torch.nn as nn
import torch.utils.data as Data

import numpy as np
import random

from sklearn.model_selection import train_test_split



## 18.1 Prepare Data

In [2]:
filename = "data/score_발열.xlsx"
sheet_name = "Sheet1"
data = pd.read_excel(filename, sheet_name = sheet_name, header = 0)

csv_data = [item.replace("#", "").strip() for item in data['Review']]
csv_label = data['Score']

In [3]:
csv_data[:5]

['발열히 심한거 같은데 여름이라 그런가?..',
 '발열이좀 심한거 같아서 걱정이에요',
 '발열이심하더라구요',
 '발열이너무심한게 제일큰 단점인것 같고 그외에 불편한점은',
 '발열이...정말...심합니다']

## 18.2 Word2Vec

In [4]:
twitter = Twitter()
size = 500

doc = []

for sentence in csv_data :
    results= []
    tokens = twitter.pos(sentence, norm=True, stem=True)
        
    for token in tokens:
        if not token[1] in ["Josa", "Eomi", "Punctuation"]:
            results.append(token[0])
    doc.append(results)

In [5]:
doc[:5]

[['발열', '히', '심하다', '같다', '여름', '그', '런가'],
 ['발열', '이', '좀', '심하다', '같다', '걱정'],
 ['발열', '심하다'],
 ['발열', '이', '너무', '심하다', '제일', '크다', '단점', '것', '같다', '그', '외', '불편하다', '점'],
 ['발열', '정말', '심하다']]

In [6]:
model = word2vec.Word2Vec(doc, size=size, window=2, hs=0, min_count=3, sg=0)

w2v = dict(zip(model.wv.index2word, model.wv.vectors))

del model

# sentences (iterable of iterables, optional) – The sentences iterable can be simply a list of lists of tokens, but for larger corpora, consider an iterable that streams the sentences directly from disk/network. See BrownCorpus, Text8Corpus or LineSentence in word2vec module for such examples. See also the tutorial on data streaming in Python. If you don’t supply sentences, the model is left uninitialized – use if you plan to initialize it in some other way.
# size (int, optional) – Dimensionality of the word vectors.
# window (int, optional) – Maximum distance between the current and predicted word within a sentence.
# min_count (int, optional) – Ignores all words with total frequency lower than this.
# workers (int, optional) – Use these many worker threads to train the model (=faster training with multicore machines).
# sg ({0, 1}, optional) – Training algorithm: 1 for skip-gram; otherwise CBOW.
# hs ({0, 1}, optional) – If 1, hierarchical softmax will be used for model training. If 0, and negative is non-zero, negative sampling will be used.
# negative (int, optional) – If > 0, negative sampling will be used, the int for negative specifies how many “noise words” should be drawn (usually between 5-20). If set to 0, no negative sampling is used.

In [7]:
w2v.keys()

dict_keys(['발열', '없다', '하다', '있다', '소음', '심하다', '좋다', '같다', '않다', '거의', '좀', '적다', '자다', '것', '잡다', '되다', '만족', '문제', '조금', '정도', '생각', '괜찮다', '못', '사용', '노트북', '이', '너무', '느끼다', '이다', '부분', '팬', '성능', '쿨러', '정말', '크다', '돌아가다', '잘', '제품', '걱정', '안', '배터리', '별로', '도', '더', '전혀', '속도', '게임', '많이', '쓰다', '매우', '쓸다', '아주', '가볍다', '소리', '어쩌다', '아직', '느껴지다', '보다', '적', '다', '그렇다', '때문', '나다', '많다', '키', '편이', '관리', '아니다', '보이다', '신경', '크게', '조용하다', '약간', '때', '오래', '및', '네', '그', '상당하다', '수', '들다', '없이', '거', '꽤', '보드', '시간', '양호', '쿨링', '무게', '아쉽다', '디자인', '제어', '점', '심해', '높다', '빠르다', '비', '또한', '모두', '요', '빼다', '돌리다', '모르다', '발생', '삼성', '다른', '감', '맘', '감다', '잘되다', '잡히다', '확실하다', '진짜', '하지만', '들', '해보다', '가격', '편', '모델', '안나', '가다', '개선', '단점', '느낌', '제', '뜨겁다', '펜', '하나', '시', '듯', '사은', '수준', '최고', '만족스럽다', '10', '2', '지다', '구매', '습', '되어다', '메탈', '약하다', '상태', '충전', '굉장하다', '한', '뜨다', '훨씬', '품', '작업', '화면', '발', '열량', '아', '그렇게', '장시간', '사', '생기다', '중', '무엇', '3', '인', '걸리다', '받침', '대',

In [8]:
# Document to 3-dim Matrix with Word2Vector & Get Max Length of Sentence

doc2vec = []
max_length = 0

for sentence in doc :
    temp = []
    length = 0
    
    for word in sentence :
        if word in w2v.keys() :
            temp.append(w2v[word])
            length += 1
            
    doc2vec.append(temp)
    
    if max_length <= length :
        max_length = length

In [9]:
# Fill zeros for fitting size

for sentence in doc2vec :
    
    length = len(sentence)
    
    while length < max_length :
        sentence.append(np.zeros(size))
        length += 1

doc2vec = np.array(doc2vec)

In [10]:
doc2vec.shape

(1211, 12, 500)

In [11]:
data = doc2vec
label = csv_label.values

## 18.3 Train-Test Split

In [12]:
train_data, test_data, train_label, test_label = train_test_split(data, label)

print(len(train_data))
print(len(test_data))

908
303


In [13]:
x = torch.from_numpy(train_data).type(torch.FloatTensor)
y = torch.from_numpy(train_label).type(torch.LongTensor)

In [14]:
y.size() , x.size()

(torch.Size([908]), torch.Size([908, 12, 500]))

In [15]:
x = x.view(-1, 1, 12, 500)

In [16]:
train_data = Data.TensorDataset(x, y)

batch_size = 10

train_loader  = Data.DataLoader(dataset=train_data,
                                          batch_size=batch_size,
                                          shuffle=True,
                                          num_workers=1,
                                          drop_last = True)

In [17]:
text, label = iter(train_loader).next()
text.shape

torch.Size([10, 1, 12, 500])

## 18.4 Define Model for Sentiment Analysis

In [18]:
class CNN(nn.Module):
    def __init__(self):
        super(CNN, self).__init__()
        
        self.layer = nn.Sequential(
            nn.Conv2d(1,16,3), #1*12*500 -> 16*10*498
            nn.ReLU(),
            nn.Conv2d(16,32,3), #16*10*498 -> 32*8*496
            nn.ReLU(),
            nn.MaxPool2d(2,2), #32*8*496 -> 32*4*248
            nn.Conv2d(32,64,3),#32*4*248 -> 64*2*246
            nn.ReLU(),
            nn.MaxPool2d(2,2) #64*2*246 -> 64*1*123
        )
        
        self.fc_layer = nn.Sequential(
            nn.Linear(64*1*123,100),
            nn.ReLU(),
            nn.Linear(100,3)
        )       
        
    def forward(self,x):
        out = self.layer(x)
        out = out.view(-1,64*1*123)
        out = self.fc_layer(out)

        return out

In [19]:
model = CNN().cuda()

## 18.5 Train Model

In [20]:
loss = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [21]:
num_epochs = 50

In [22]:
for epoch in range(num_epochs):

    total_batch = len(train_data) // batch_size
    
    for i, (batch_text, batch_labels) in enumerate(train_loader):
        
        X = batch_text.cuda()
        Y = batch_labels.cuda()
        
        pre = model(X)
        cost = loss(pre, Y)
        
        optimizer.zero_grad()
        cost.backward()
        optimizer.step()
        
        if (i+1) % 20 == 0:
            print('Epoch [%d/%d], lter [%d/%d] Loss: %.4f'
                 %(epoch+1, num_epochs, i+1, total_batch, cost.item()))
    
print("Learning Finished!")

Epoch [1/50], lter [20/90] Loss: 1.0605
Epoch [1/50], lter [40/90] Loss: 0.6750
Epoch [1/50], lter [60/90] Loss: 1.1624
Epoch [1/50], lter [80/90] Loss: 1.0143
Epoch [2/50], lter [20/90] Loss: 1.1285
Epoch [2/50], lter [40/90] Loss: 0.8420
Epoch [2/50], lter [60/90] Loss: 1.0301
Epoch [2/50], lter [80/90] Loss: 0.6638
Epoch [3/50], lter [20/90] Loss: 0.8741
Epoch [3/50], lter [40/90] Loss: 1.0172
Epoch [3/50], lter [60/90] Loss: 0.7852
Epoch [3/50], lter [80/90] Loss: 0.9635
Epoch [4/50], lter [20/90] Loss: 0.8498
Epoch [4/50], lter [40/90] Loss: 0.9091
Epoch [4/50], lter [60/90] Loss: 0.8942
Epoch [4/50], lter [80/90] Loss: 1.1714
Epoch [5/50], lter [20/90] Loss: 0.7040
Epoch [5/50], lter [40/90] Loss: 0.7909
Epoch [5/50], lter [60/90] Loss: 0.9513
Epoch [5/50], lter [80/90] Loss: 0.7758
Epoch [6/50], lter [20/90] Loss: 1.0177
Epoch [6/50], lter [40/90] Loss: 0.7344
Epoch [6/50], lter [60/90] Loss: 0.8030
Epoch [6/50], lter [80/90] Loss: 0.7832
Epoch [7/50], lter [20/90] Loss: 0.9739


## 18.6 Test Model

In [23]:
x_test = torch.from_numpy(test_data).type(torch.FloatTensor)
y_test = torch.from_numpy(test_label).type(torch.LongTensor)

test_data = Data.TensorDataset(x, y)

test_loader  = Data.DataLoader(dataset=test_data,
                                          batch_size=1,
                                          shuffle=True)

In [24]:
model.eval()

correct = 0
total = 0

for text, labels in test_loader:
    
    text = text.cuda()
    outputs = model(text)
    
    _, pre = torch.max(outputs.data, 1)
    total += 1
    correct += (pre == labels.cuda()).sum()
    
print('Accuracy of test text: %f %%' % (100 * float(correct) / total))

Accuracy of test text: 58.590308 %
