# 18. Word2Vec

In [1]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset

from sklearn.model_selection import train_test_split
from gensim.models import word2vec

from konlpy.tag import Okt

import random
import codecs
import pandas as pd
import numpy as np

## 18.1 Prepare Data

In [2]:
filename = "data/score_발열.xlsx"
sheet_name = "Sheet1"
data = pd.read_excel(filename, sheet_name = sheet_name, header = 0)

csv_data = [item.replace("#", "").strip() for item in data['Review']]
csv_label = data['Score']

In [3]:
csv_data[:5]

['발열히 심한거 같은데 여름이라 그런가?..',
 '발열이좀 심한거 같아서 걱정이에요',
 '발열이심하더라구요',
 '발열이너무심한게 제일큰 단점인것 같고 그외에 불편한점은',
 '발열이...정말...심합니다']

## 18.2 Word2Vec

In [4]:
tokenizer = Okt()
size = 500

doc = []

for sentence in csv_data :
    results= []
    tokens = tokenizer.pos(sentence, norm=True, stem=True)
        
    for token in tokens:
        if not token[1] in ["Josa", "Eomi", "Punctuation"]:
            results.append(token[0])
    doc.append(results)

-------------------------------------------------------------------------------
Deprecated: convertStrings was not specified when starting the JVM. The default
behavior in JPype will be False starting in JPype 0.8. The recommended setting
for new code is convertStrings=False.  The legacy value of True was assumed for
please file a ticket with the developer.
-------------------------------------------------------------------------------

  """)


In [5]:
doc[:5]

[['발열', '히', '심하다', '같다', '여름', '그', '런가'],
 ['발열', '이', '좀', '심하다', '같다', '걱정'],
 ['발열', '심하다'],
 ['발열', '이', '너', '무심하다', '제일', '크다', '단점', '것', '같다', '그', '외', '불편하다', '점'],
 ['발열', '정말', '심하다']]

In [6]:
model = word2vec.Word2Vec(doc, size=size, window=2, min_count=3, sg=0)

w2v = dict(zip(model.wv.index2word, model.wv.vectors))

del model

# sentences (iterable of iterables, optional) – The sentences iterable can be simply a list of lists of tokens, but for larger corpora, consider an iterable that streams the sentences directly from disk/network. See BrownCorpus, Text8Corpus or LineSentence in word2vec module for such examples. See also the tutorial on data streaming in Python. If you don’t supply sentences, the model is left uninitialized – use if you plan to initialize it in some other way.
# size (int, optional) – Dimensionality of the word vectors.
# window (int, optional) – Maximum distance between the current and predicted word within a sentence.
# min_count (int, optional) – Ignores all words with total frequency lower than this.
# workers (int, optional) – Use these many worker threads to train the model (=faster training with multicore machines).
# sg ({0, 1}, optional) – Training algorithm: 1 for skip-gram; otherwise CBOW.
# hs ({0, 1}, optional) – If 1, hierarchical softmax will be used for model training. If 0, and negative is non-zero, negative sampling will be used.
# negative (int, optional) – If > 0, negative sampling will be used, the int for negative specifies how many “noise words” should be drawn (usually between 5-20). If set to 0, no negative sampling is used.

# Loading pretrained Word2Vec
# Google's trained Word2Vec : https://code.google.com/archive/p/word2vec/
# https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM/edit

# import gensim
# import gensim.models.keyedvectors as word2vec

# model = word2vec.KeyedVectors.load_word2vec_format('data/GoogleNews-vectors-negative300.bin', binary=True)
# weights = torch.FloatTensor(model.vectors)
# embedding = nn.Embedding.from_pretrained(weights)

In [7]:
# Document to 3-dim Matrix with Word2Vector & Get Max Length of Sentence

doc2vec = []
max_length = 0

for sentence in doc :
    temp = []
    length = 0
    
    for word in sentence :
        if word in w2v.keys() :
            temp.append(w2v[word])
            length += 1
            
    doc2vec.append(temp)
    
    if max_length <= length :
        max_length = length

In [8]:
# Fill zeros for fitting size

for sentence in doc2vec :
    
    length = len(sentence)
    
    while length < max_length :
        sentence.append(np.zeros(size))
        length += 1

doc2vec = np.array(doc2vec)

In [9]:
doc2vec.shape

(1211, 12, 500)

In [10]:
data = doc2vec
label = csv_label.values

## 18.3 Train-Test Split

In [11]:
train_data, test_data, train_label, test_label = train_test_split(data, label)

print(len(train_data))
print(len(test_data))

908
303


In [12]:
x = torch.from_numpy(train_data).type(torch.FloatTensor)
y = torch.from_numpy(train_label).type(torch.LongTensor)

In [13]:
y.size() , x.size()

(torch.Size([908]), torch.Size([908, 12, 500]))

In [14]:
x = x.view(-1, 1, 12, 500)

In [15]:
train_data = TensorDataset(x, y)

batch_size = 10

train_loader  = DataLoader(dataset=train_data,
                           batch_size=batch_size,
                           shuffle=True,
                           num_workers=1,
                           drop_last = True)

In [16]:
text, label = iter(train_loader).next()
text.shape

torch.Size([10, 1, 12, 500])

## 18.4 Define Model for Sentiment Analysis

In [17]:
class CNN(nn.Module):
    def __init__(self):
        super(CNN, self).__init__()
        
        self.layer = nn.Sequential(
            nn.Conv2d(1,16,3), #1*12*500 -> 16*10*498
            nn.ReLU(),
            nn.Conv2d(16,32,3), #16*10*498 -> 32*8*496
            nn.ReLU(),
            nn.MaxPool2d(2,2), #32*8*496 -> 32*4*248
            nn.Conv2d(32,64,3),#32*4*248 -> 64*2*246
            nn.ReLU(),
            nn.MaxPool2d(2,2) #64*2*246 -> 64*1*123
        )
        
        self.fc_layer = nn.Sequential(
            nn.Linear(64*1*123,100),
            nn.ReLU(),
            nn.Linear(100,3)
        )       
        
    def forward(self,x):
        out = self.layer(x)
        out = out.view(-1,64*1*123)
        out = self.fc_layer(out)

        return out

In [18]:
model = CNN().cuda()

## 18.5 Train Model

In [19]:
loss = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [20]:
num_epochs = 50

In [21]:
for epoch in range(num_epochs):

    total_batch = len(train_data) // batch_size
    
    for i, (batch_text, batch_labels) in enumerate(train_loader):
        
        X = batch_text.cuda()
        Y = batch_labels.cuda()
        
        pre = model(X)
        cost = loss(pre, Y)
        
        optimizer.zero_grad()
        cost.backward()
        optimizer.step()
        
        if (i+1) % 20 == 0:
            print('Epoch [%d/%d], lter [%d/%d], Loss: %.4f'
                 %(epoch+1, num_epochs, i+1, total_batch, cost.item()))
    
print("Learning Finished!")

Epoch [1/50], lter [20/90], Loss: 0.7399
Epoch [1/50], lter [40/90], Loss: 0.9133
Epoch [1/50], lter [60/90], Loss: 0.9621
Epoch [1/50], lter [80/90], Loss: 1.1947
Epoch [2/50], lter [20/90], Loss: 1.1412
Epoch [2/50], lter [40/90], Loss: 1.1714
Epoch [2/50], lter [60/90], Loss: 0.9011
Epoch [2/50], lter [80/90], Loss: 0.4859
Epoch [3/50], lter [20/90], Loss: 1.0252
Epoch [3/50], lter [40/90], Loss: 0.8421
Epoch [3/50], lter [60/90], Loss: 0.8277
Epoch [3/50], lter [80/90], Loss: 0.9643
Epoch [4/50], lter [20/90], Loss: 0.9501
Epoch [4/50], lter [40/90], Loss: 1.1209
Epoch [4/50], lter [60/90], Loss: 0.9303
Epoch [4/50], lter [80/90], Loss: 0.9678
Epoch [5/50], lter [20/90], Loss: 0.7490
Epoch [5/50], lter [40/90], Loss: 0.9057
Epoch [5/50], lter [60/90], Loss: 0.6642
Epoch [5/50], lter [80/90], Loss: 1.0030
Epoch [6/50], lter [20/90], Loss: 1.2429
Epoch [6/50], lter [40/90], Loss: 1.1634
Epoch [6/50], lter [60/90], Loss: 0.8501
Epoch [6/50], lter [80/90], Loss: 0.8352
Epoch [7/50], lt

Epoch [50/50], lter [20/90], Loss: 0.9012
Epoch [50/50], lter [40/90], Loss: 1.2143
Epoch [50/50], lter [60/90], Loss: 1.1420
Epoch [50/50], lter [80/90], Loss: 0.9010
Learning Finished!


## 18.6 Test Model

In [22]:
x_test = torch.from_numpy(test_data).type(torch.FloatTensor)
x_test = x_test.view(-1, 1, 12, 500)

y_test = torch.from_numpy(test_label).type(torch.LongTensor)

test_data = TensorDataset(x, y)

test_loader  = DataLoader(dataset=test_data,
                          batch_size=1,
                          shuffle=True)

In [23]:
model.eval()

correct = 0
total = 0

for text, labels in test_loader:
    
    text = text.cuda()
    outputs = model(text)
    
    _, pre = torch.max(outputs.data, 1)
    total += 1
    correct += (pre == labels.cuda()).sum()
    
print('Accuracy of test text: %f %%' % (100 * float(correct) / total))

Accuracy of test text: 58.590308 %
