In [1]:
import pandas as pd
import numpy as np
import torch
import torchvision
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable
from sklearn.preprocessing import OneHotEncoder,LabelEncoder
from sklearn.model_selection import train_test_split
from konlpy.tag import Okt

In [2]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

In [3]:
import pickle

with open("token_data.pickle", "rb") as f:
    token_data = pickle.load(f)


In [16]:
comment = token_data["comment"]
target = token_data["target"]
reviews_idx = token_data["comment_ix"]
word2idx = token_data["word2ix"]
idx2word = token_data["ix2word"]
max_seq_length = token_data["max_seq_length"]
idx2target={1:'highlight',0:'no'}

In [5]:
def padding(token):
    token_post=token.copy()
    len_sentence=[ len(i) for i in token_post]
    max_sentence=max(len_sentence) #최대 문장길이 9591
    n=-1
    for i in token_post:
        n+=1
        k=len(token_post[n])
        while max_sentence-k>=0:
            token_post[n].append('<PAD>')
            k+=1
    return token_post

In [6]:
comment=list(map(lambda x: padding(x),  [comment]))
comment=comment[0]

In [7]:
li1=[]
comment_list=[]
for i in comment:
    for k in i:
        li1.append(word2idx[k])
    
    comment_list.append(li1)
    li1=[]

In [8]:
X_train,X_test,y_train,y_test=train_test_split(comment_list,target,test_size=0.3,random_state=7)

In [18]:
class RNN_Clf(nn.Module):
    def __init__(self, input_size, embedding_size,num_class,num_layers,hidden_size,bidirectional,num_directions):
        
        super(RNN_Clf,self).__init__()
        
        # 안에 하이퍼 파라미터의 종류들 선언
        
        self.num_layers=num_layers
        self.hidden_size=hidden_size
        self.input_size=input_size
        self.embedding_size=embedding_size
        self.num_class=num_class
        self.num_directions=num_directions
        
        self.embedding=nn.Embedding(input_size,
                                    embedding_size,padding_idx=word2idx['<PAD>'])
        
        self.rnn=nn.RNN(input_size=embedding_size,
                        hidden_size=hidden_size,
                        num_layers=num_layers,
                        batch_first=True,
                        bidirectional=bidirectional,) # 해당 sequence의 앞 뒤를 모두 고려해서 반영
        
        self.linear=nn.Linear(in_features=hidden_size*num_directions,out_features=num_class)
   

    
    def forward(self,inputs):
        
        hidden=self.init_hidden(batch_size)
        
        embed=self.embedding(inputs)
        
        out, _ =self.rnn(embed,hidden)
        
        return self.linear(out[:, -1:, :].squeeze(1))
    
    
    def predict(self,inputs):
        
        hidden = self.init_hidden(100)
        embed = self.embedding(inputs)

        # Propagate embedding through RNN
        # Input: (batch, seq_len, embedding_size)
        # hidden: (num_layers * num_directions, batch, hidden_size)
        
        out, _ = self.rnn(embed, hidden)

        return self.linear(out[:, -1:, :].squeeze(1))
    
    def init_hidden(self, batch_size):
        # Initialize hidden and cell states
        # (num_layers * num_directions, batch, hidden_size)
        return Variable(torch.zeros(self.num_layers*self.num_directions,batch_size, self.hidden_size))
    
    

In [10]:
batch_size=len(X_train)
batch_size
#bidirectional=False

31867

In [11]:
len(word2idx)

17786

In [12]:
max_seq_length

78

In [13]:
len(X_train[0])

79

In [14]:
model = RNN_Clf(input_size=len(word2idx),num_directions=1, embedding_size = 30,hidden_size=len(X_train[0]),
                num_layers=2,num_class=2,bidirectional=False).to(device)

loss_function = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(),lr=0.1,momentum=0.9)

for epoch in range(5):

    model.zero_grad()
    inputs = Variable(torch.LongTensor(X_train)).to(device)
    targets = Variable(torch.LongTensor(y_train)).to(device)
    
    preds = model(inputs)
    
    loss = loss_function(preds, targets)

    if epoch % 1 == 0:
        print(loss.item())
    
    loss.backward()
    optimizer.step()

0.6492030024528503
0.5970116853713989
0.5409678816795349
0.5326482057571411
0.5661810636520386


In [20]:
list_predicted=[]
correct = 0
for i, seq in enumerate(X_test):
    inputs = Variable(torch.LongTensor(seq).view(1,-1)).to(device)
    pred = model.predict(inputs)
    _, pred = torch.max(pred, 1)
    list_predicted.append(pred)
    true = y_test[i]
    if true == pred.item():
        correct +=1
    
    if i%10000 == 0:
        input_seq = [idx2word[ix] for ix in seq if ix != word2idx['<PAD>']]
        print("Input :", input_seq)
        print("Prediction :", idx2target[pred.item()])
        print("Truth :", idx2target[y_test[i]])
        print("\n")

print("Accuracy :", (correct/len(X_test)*100))

Input : ['킹/Noun', '리/Noun', '아/Exclamation', '나/Noun']
Prediction : no
Truth : no


Input : ['쉔무/Noun', '새/Noun']
Prediction : no
Truth : no


Accuracy : 77.55161809928246


In [35]:
from sklearn.metrics import f1_score,accuracy_score

In [26]:
f1_score(y_test,list_predicted)

  'precision', 'predicted', average, warn_for)


0.0

In [27]:
from sklearn.naive_bayes import GaussianNB

In [28]:
clf_nb=GaussianNB()

In [38]:
clf_nb.fit(X_train,y_train)
y_pred=clf_nb.predict(X_test)

print('Accuracy score: %s' %accuracy_score(y_test,y_pred))
print('F1 score: %s' %f1_score(y_test,y_pred))


Accuracy score: 0.23180553521745498
F1 score: 0.36848441073793187
