In [None]:
# Mount Google Drive (Colab only); the dataset and embedding paths used by this
# notebook all live under /content/gdrive.
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
# KoNLPy and its system dependencies (C++ compiler, JDK 8).
!apt-get update
!apt-get install g++ openjdk-8-jdk 
!pip3 install konlpy JPype1-py3
# NOTE(review): this pipes a script fetched from the konlpy master branch straight
# into bash and none of the packages are version-pinned; consider pinning both so
# the environment is reproducible.
!bash <(curl -s https://raw.githubusercontent.com/konlpy/konlpy/master/scripts/mecab.sh)

# gensim (used to load the pretrained Word2Vec vectors).
!pip install gensim

In [None]:
import numpy as np
import pandas as pd
import torch
import torch.utils.data
import konlpy
from konlpy.tag import Mecab
import re
from torchtext import data
from torchtext.vocab import Vectors
from typing import List
import itertools

### 1. 데이터로드

In [None]:
# Locations of the train and test CSV files on the mounted Drive.
data_path_train = "/content/gdrive/My Drive/dataset/BalancedNewsCorpus/BalancedNewsCorpus_train.csv"
data_path_test= "/content/gdrive/My Drive/dataset/BalancedNewsCorpus/BalancedNewsCorpus_test.csv"

In [None]:
# Load both CSV splits into pandas DataFrames.
# NOTE(review): df_train / df_test are not referenced again in the visible part of
# this notebook; the torchtext datasets re-read the CSV files from the same paths.
df_train = pd.read_csv(data_path_train, sep = ',')
df_test = pd.read_csv(data_path_test, sep = ',')

### 2. 전처리 함수 정의

In [None]:
# Stop word handling. The stop word list is the one published in
# 길호현 (2018), "텍스트마이닝을 위한 한국어 불용어 목록 연구", 우리말글 78, 1-25.
# The list itself has been removed from this public version of the code; the
# string below is a placeholder to be replaced with the real list.
str_stopwords="불용어 입력"
# NOTE(review): the separator is the two-character sequence tab + space; confirm
# that it matches the format of the real stop word string.
stopwords=str_stopwords.split('\t ')

# Morphological analyzer instance used by the tokenizer function.
mecab=Mecab()
def tokenizer_morphs(string):
    """Tokenize a text into nouns.

    The ellipsis ("…") and middle dot ("·") characters are deleted, nouns are
    extracted with the module-level ``mecab`` analyzer, and nouns that are a
    single character long or listed in the module-level ``stopwords`` are
    dropped.

    Args:
        string: Raw text to tokenize.

    Returns:
        List of noun strings, in the order returned by ``mecab.nouns``.
    """
    # str.replace returns a new string (str is immutable), so the result has to
    # be assigned back; otherwise the symbols are never actually removed.
    for symbol in ("…", "·"):
        string = string.replace(symbol, "")
    nouns = mecab.nouns(string)  # keep nouns only
    return [noun for noun in nouns if len(noun) > 1 and noun not in stopwords]

### 3. Field 정의

In [None]:
from torchtext.data import Field, LabelField, TabularDataset

In [None]:
import random

# Seed every RNG before anything stochastic runs. Without this the deterministic
# cuDNN flag below does not make runs repeatable: the random vectors drawn with
# torch.randn for out-of-vocabulary tokens and the model weight initialisation
# differ from run to run.
SEED = 1234
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)  # documented as a silent no-op when CUDA is unavailable
torch.backends.cudnn.deterministic = True

# TEXT describes how the news body is processed: tokenized with tokenizer_morphs,
# lower-cased, and batched together with the length of every sequence.
TEXT = Field(sequential=True, tokenize=tokenizer_morphs, lower=True, include_lengths=True)
# LABEL holds the topic of each article as an int64 class index.
LABEL = LabelField(dtype=torch.int64)

### 4. TabularDataset 생성 

In [None]:
# Column layout of the CSV files, in file order. Columns mapped to None are not
# used; Topic is processed by LABEL and News by TEXT.
raw_datafields = [
    ("filename", None),
    ("date", None),
    ("NewsPaper", None),
    ("Topic", LABEL),
    ("News", TEXT),
]


def _load_split(csv_path):
    """Build the TabularDataset for one CSV split using the shared column layout."""
    return TabularDataset(
        path=csv_path,
        format='csv',
        skip_header=True,
        fields=raw_datafields,
    )


train_data = _load_split(data_path_train)
test_data = _load_split(data_path_test)

### 5. Vocab 생성

In [None]:
# NOTE(review): the `vocab` alias is not referenced anywhere in the visible notebook.
import torchtext.vocab as vocab

# Both vocabularies are built from the training split only. min_freq=10 is passed
# for the text vocabulary (presumably dropping tokens seen fewer than 10 times —
# confirm against the torchtext Vocab documentation).
TEXT.build_vocab(train_data, min_freq=10)
LABEL.build_vocab(train_data)

In [None]:
# Show the topic-name -> class-index mapping built from the training split.
print(LABEL.vocab.stoi)

defaultdict(<function _default_unk_index at 0x7f63ac0b5620>, {'IT/과학': 0, '경제': 1, '문화': 2, '미용/건강': 3, '사회': 4, '생활': 5, '스포츠': 6, '연예': 7, '정치': 8})


### 6. 사전학습 임베딩 불러오기 
Reference: https://rohit-agrawal.medium.com/using-fine-tuned-gensim-word2vec-embeddings-with-torchtext-and-pytorch-17eea2883cd

In [None]:
import gensim
from gensim.models.keyedvectors import KeyedVectors
from tqdm import tqdm_notebook

# Directory on the mounted Drive that holds the pretrained embedding file.
path = '/content/gdrive/My Drive/dataset/'
# The file is read as word2vec *text* format (binary=False) despite its .model
# extension; judging by the file name the vectors are 300-dimensional.
Word2Vec_300D_token_model = KeyedVectors.load_word2vec_format(path + 'Word2Vec_300D_token.model', binary=False, encoding='utf-8')

In [None]:
# tqdm.tqdm_notebook is deprecated; its own warning recommends tqdm.notebook.tqdm.
from tqdm.notebook import tqdm

# Take the dimensionality from the loaded vectors instead of hard-coding it.
W2V_DIM = Word2Vec_300D_token_model.vector_size

# One vector per vocabulary entry, in index order. Iterating over itos (rather
# than stoi.items()) makes the index alignment explicit: word2vec_vectors[i] is
# the vector of TEXT.vocab.itos[i]. stoi is a defaultdict, so it can also pick up
# extra keys whenever an unknown token is looked up.
word2vec_vectors = []
for token in tqdm(TEXT.vocab.itos):
    # Membership is tested on the KeyedVectors object itself; the older
    # `.wv.vocab` access is deprecated and no longer exists in gensim 4.
    if token in Word2Vec_300D_token_model:
        # Token has a pretrained embedding: use it.
        word2vec_vectors.append(torch.FloatTensor(Word2Vec_300D_token_model[token]))
    else:
        # No pretrained embedding: fall back to a random vector.
        word2vec_vectors.append(torch.randn(W2V_DIM))

# Attach the vectors to the vocabulary; stoi maps each token to its row above.
TEXT.vocab.set_vectors(TEXT.vocab.stoi, word2vec_vectors, W2V_DIM)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  This is separate from the ipykernel package so we can avoid doing imports until


HBox(children=(FloatProgress(value=0.0, max=14639.0), HTML(value='')))

  after removing the cwd from sys.path.
  """





### 7. Iterator 정의

In [None]:
BATCH_SIZE = 128

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Options shared by the train and test iterators.
iterator_options = dict(
    batch_size=BATCH_SIZE,
    device=device,
    # Key used to group examples: the number of tokens in the news body.
    sort_key=lambda example: len(example.News),
    sort_within_batch=False,
)

train_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, test_data),
    **iterator_options,
)

### 8. LSTM Model 정의
Reference: https://github.com/bentrevett/pytorch-sentiment-analysis/blob/master/2%20-%20Upgraded%20Sentiment%20Analysis.ipynb

In [None]:
import torch.nn as nn
import torch.nn.functional as F

In [None]:
class LSTM(nn.Module):
    """Multi-layer, optionally bidirectional LSTM text classifier.

    Token ids are embedded, fed to ``nn.LSTM`` as a packed sequence so padded
    positions are skipped, and the final hidden state of the top layer is
    projected to ``output_dim`` logits.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Size of each token embedding.
        hidden_dim: Hidden size of the LSTM (per direction).
        output_dim: Number of output logits (classes).
        n_layers: Number of stacked LSTM layers.
        bidirectional: Whether the LSTM reads the sequence in both directions.
        dropout: Dropout probability applied to the embeddings, between LSTM
            layers, and to the final hidden state.
        pad_idx: Index of the padding token in the vocabulary.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers,
                 bidirectional, dropout, pad_idx):

        super().__init__()

        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)

        self.rnn = nn.LSTM(embedding_dim,
                           hidden_dim,
                           num_layers=n_layers,
                           bidirectional=bidirectional,
                           # nn.LSTM applies dropout only between stacked layers
                           # and warns when it is non-zero for a single layer.
                           dropout=dropout if n_layers > 1 else 0.0)

        # The classifier input is one hidden state per direction, so its width
        # has to follow the bidirectional flag instead of assuming two directions.
        num_directions = 2 if bidirectional else 1
        self.fc = nn.Linear(hidden_dim * num_directions, output_dim)

        self.dropout = nn.Dropout(dropout)

    def forward(self, text, text_lengths):
        """Compute class logits for a batch of padded token-id sequences.

        Args:
            text: LongTensor of token ids, shape [sent len, batch size].
            text_lengths: Number of real (non-padding) tokens per sequence,
                shape [batch size]. The batch does not need to be sorted.

        Returns:
            Tensor of logits, shape [batch size, output_dim].
        """
        # embedded = [sent len, batch size, emb dim]
        embedded = self.dropout(self.embedding(text))

        # Pack so the LSTM stops at each sequence's true length; the lengths
        # must live on the CPU.
        packed_embedded = nn.utils.rnn.pack_padded_sequence(
            embedded, text_lengths.cpu(), enforce_sorted=False)

        # Only the final hidden state is used, so the packed per-step output is
        # not unpacked.
        # hidden = [num layers * num directions, batch size, hid dim]
        _, (hidden, _) = self.rnn(packed_embedded)

        if self.rnn.bidirectional:
            # Top layer: final forward state (hidden[-2]) and final backward
            # state (hidden[-1]).
            final_hidden = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)
        else:
            # Unidirectional: hidden[-2] would belong to the layer below (or not
            # exist for a single layer), so use the top layer's state only.
            final_hidden = hidden[-1, :, :]

        # final_hidden = [batch size, hid dim * num directions]
        return self.fc(self.dropout(final_hidden))

In [None]:
# Hyper-parameters
INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = 300
HIDDEN_DIM = 256
# Derive the number of classes from the label vocabulary instead of hard-coding
# it, so the model stays correct if the set of topics in the data changes.
OUTPUT_DIM = len(LABEL.vocab)
N_LAYERS = 2
BIDIRECTIONAL = True
DROPOUT = 0.5
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]

# Build the model from the hyper-parameters
model = LSTM(INPUT_DIM,
             EMBEDDING_DIM,
             HIDDEN_DIM,
             OUTPUT_DIM,
             N_LAYERS,
             BIDIRECTIONAL,
             DROPOUT,
             PAD_IDX)

In [None]:
# Load the pretrained embedding values into the model.

pretrained_embeddings = TEXT.vocab.vectors
# copy_ returns the updated tensor, so as the cell's last expression it is also
# displayed as the cell output.
model.embedding.weight.data.copy_(pretrained_embeddings)

tensor([[-1.3605e+00,  1.6792e+00, -2.4476e-01,  ..., -3.4673e-01,
          1.9876e-01, -4.5431e-01],
        [-4.8776e-03,  9.8222e-01, -8.1195e-01,  ...,  3.8007e-01,
         -9.4967e-01, -4.0274e-01],
        [ 9.8296e-02,  6.8959e-02, -9.7626e-02,  ...,  1.9529e-02,
         -4.3655e-02,  5.1375e-02],
        ...,
        [-2.6780e-02, -4.1724e-02,  1.4282e-01,  ..., -1.1069e-02,
         -1.3842e-02,  5.1257e-02],
        [-5.7113e-02, -1.0315e-01,  1.0424e-01,  ..., -6.7090e-02,
          3.6692e-02,  4.1873e-02],
        [-7.6801e-02, -3.9502e-02,  7.4451e-04,  ...,  4.1259e-02,
          2.8716e-03,  5.0831e-02]])

In [None]:
# Set the initial embedding of the <pad> token to zeros (the row was overwritten
# when the vocabulary vectors were copied into the embedding layer).
model.embedding.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIM)

### 9. Training 

In [None]:
# Define the optimizer and the loss function.
import torch.optim as optim

# Move the model to its device *before* building the optimizer, as the
# torch.optim documentation recommends, so the optimizer is constructed over the
# parameters in their final location.
model = model.to(device)

optimizer = optim.Adam(model.parameters())

criterion = nn.CrossEntropyLoss()
criterion = criterion.to(device)

In [None]:
def categorical_accuracy(preds, y):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8

    Args:
        preds: Class scores, shape [batch size, n classes].
        y: Target class indices, shape [batch size].

    Returns:
        Float tensor of shape [1] holding the fraction of correct predictions,
        on the same device as ``y``.
    """
    predicted_classes = preds.argmax(dim=1)  # index of the highest score per row
    correct = predicted_classes.eq(y)
    # Build the denominator on y's own device rather than on a module-level
    # `device` global, so the function works on its own and cannot mix devices.
    batch_size = torch.tensor([y.shape[0]], dtype=torch.float, device=y.device)
    return correct.sum() / batch_size

In [None]:
def train(model, iterator, optimizer, criterion):
    """Run one training epoch and return the epoch's mean loss and accuracy.

    Args:
        model: Module called as ``model(text, text_lengths)`` that returns class
            logits of shape [batch size, n classes].
        iterator: Iterable of batches exposing ``News`` (a ``(text, lengths)``
            pair) and ``Topic`` (target class indices).
        optimizer: Optimizer over the model's parameters.
        criterion: Loss function; assumed to return the *mean* loss over the
            batch (the default reduction of ``nn.CrossEntropyLoss``).

    Returns:
        Tuple ``(loss, accuracy)`` averaged over all examples of the epoch.

    Raises:
        ZeroDivisionError: If the iterator yields no examples.
    """
    total_loss = 0.0
    total_acc = 0.0
    n_examples = 0

    model.train()

    for batch in iterator:

        optimizer.zero_grad()

        text, text_lengths = batch.News

        # Logits are already [batch size, n classes]; no squeeze is needed.
        predictions = model(text, text_lengths)

        loss = criterion(predictions, batch.Topic)
        acc = categorical_accuracy(predictions, batch.Topic)

        loss.backward()
        optimizer.step()

        # Weight each batch by its size: the last batch is usually smaller, and
        # a plain mean of per-batch means would over-weight its examples.
        batch_size = batch.Topic.shape[0]
        total_loss += loss.item() * batch_size
        total_acc += acc.item() * batch_size
        n_examples += batch_size

    return total_loss / n_examples, total_acc / n_examples

In [None]:
def evaluate(model, iterator, criterion):
    """Evaluate the model without gradient tracking and return mean loss and accuracy.

    Args:
        model: Module called as ``model(text, text_lengths)`` that returns class
            logits of shape [batch size, n classes].
        iterator: Iterable of batches exposing ``News`` (a ``(text, lengths)``
            pair) and ``Topic`` (target class indices).
        criterion: Loss function; assumed to return the *mean* loss over the
            batch (the default reduction of ``nn.CrossEntropyLoss``).

    Returns:
        Tuple ``(loss, accuracy)`` averaged over all evaluated examples.

    Raises:
        ZeroDivisionError: If the iterator yields no examples.
    """
    total_loss = 0.0
    total_acc = 0.0
    n_examples = 0

    model.eval()

    with torch.no_grad():

        for batch in iterator:

            text, text_lengths = batch.News

            # Logits are already [batch size, n classes]; no squeeze is needed.
            predictions = model(text, text_lengths)

            loss = criterion(predictions, batch.Topic)
            acc = categorical_accuracy(predictions, batch.Topic)

            # Weight each batch by its size so that a smaller final batch does
            # not skew the reported metrics.
            batch_size = batch.Topic.shape[0]
            total_loss += loss.item() * batch_size
            total_acc += acc.item() * batch_size
            n_examples += batch_size

    return total_loss / n_examples, total_acc / n_examples

In [None]:
import time

def epoch_time(start_time, end_time):
    """Split the time between two timestamps (in seconds) into minutes and seconds.

    Returns:
        Tuple ``(minutes, seconds)`` of ints, each truncated toward zero.
    """
    duration = end_time - start_time
    whole_minutes = int(duration / 60)
    leftover_seconds = int(duration - (whole_minutes * 60))
    return whole_minutes, leftover_seconds

In [None]:
N_EPOCHS = 10

# Lowest training loss seen so far; it decides when a checkpoint is written.
best_train_loss = float('inf')

for epoch in range(N_EPOCHS):

    # Time only the training pass of this epoch.
    start_time = time.time()
    
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    # NOTE(review): validation is disabled (no valid_iterator is defined in the
    # visible notebook), so no held-out metric is available during training.
    #valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
    
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    # Save the weights whenever the *training* loss improves.
    # NOTE(review): selecting the checkpoint on training loss favours the most
    # fitted epoch; a validation split would be needed to select on held-out loss.
    if train_loss < best_train_loss:
      best_train_loss = train_loss
      torch.save(model.state_dict(), 'LSTM-model.pt')
    
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    #print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

Epoch: 01 | Epoch Time: 0m 13s
	Train Loss: 1.803 | Train Acc: 28.87%
Epoch: 02 | Epoch Time: 0m 13s
	Train Loss: 1.061 | Train Acc: 61.91%
Epoch: 03 | Epoch Time: 0m 13s
	Train Loss: 0.655 | Train Acc: 78.18%
Epoch: 04 | Epoch Time: 0m 13s
	Train Loss: 0.423 | Train Acc: 87.08%
Epoch: 05 | Epoch Time: 0m 14s
	Train Loss: 0.289 | Train Acc: 91.33%
Epoch: 06 | Epoch Time: 0m 13s
	Train Loss: 0.198 | Train Acc: 94.19%
Epoch: 07 | Epoch Time: 0m 13s
	Train Loss: 0.136 | Train Acc: 95.92%
Epoch: 08 | Epoch Time: 0m 13s
	Train Loss: 0.095 | Train Acc: 97.25%
Epoch: 09 | Epoch Time: 0m 13s
	Train Loss: 0.094 | Train Acc: 97.21%
Epoch: 10 | Epoch Time: 0m 13s
	Train Loss: 0.059 | Train Acc: 98.39%


### 10. Testing

In [None]:
# Restore the saved checkpoint. map_location remaps the stored tensors onto the
# current device, so a checkpoint written on a GPU can also be loaded on a
# CPU-only runtime.
model.load_state_dict(torch.load('LSTM-model.pt', map_location=device))

# Loss and accuracy on the test split.
test_loss, test_acc = evaluate(model, test_iterator, criterion)

print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')

Test Loss: 1.082 | Test Acc: 78.49%


### 11. User Input

####  뉴스 labels
    - 'IT/과학': 0, '경제': 1, '문화': 2, '미용/건강': 3, '사회': 4, '생활': 5, '스포츠': 6, '연예': 7, '정치': 8

In [None]:
def predict_news(model, sentence, min_len=5):
    """Predict the topic class index of a single piece of text.

    Args:
        model: Trained classifier called as ``model(text, text_lengths)``.
        sentence: Raw text to classify.
        min_len: Minimum number of positions in the input tensor; shorter
            inputs are filled up with the padding token.

    Returns:
        Integer class index, to be looked up in the label vocabulary.
    """
    model.eval()

    # The TEXT field is declared with lower=True, so tokens are lower-cased here
    # as well to match the entries of the vocabulary.
    tokens = [token.lower() for token in tokenizer_morphs(sentence)]
    n_real = len(tokens)
    if n_real < min_len:
        tokens = tokens + [TEXT.pad_token] * (min_len - n_real)

    indexed = [TEXT.vocab.stoi[token] for token in tokens]
    tensor = torch.LongTensor(indexed).unsqueeze(1).to(device)  # [sent len, 1]

    # Pass the number of *real* tokens so the packed LSTM skips the padding, as
    # it does for training batches. Packing rejects a length of 0, so an input
    # with no tokens is treated as one padding step.
    length_tensor = torch.LongTensor([max(n_real, 1)])

    # Inference only: no gradients are needed.
    with torch.no_grad():
        preds = model(tensor, length_tensor)

    return preds.argmax(dim=1).item()