In [None]:
# Mount Google Drive so that the files this notebook reads under
# /content/gdrive are available in the Colab runtime.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
# konlpy: system packages (g++, JDK 8), the Python packages, then the MeCab
# backend through KoNLPy's install script (fetched with curl and run with bash).
# NOTE(review): none of these installs pin a version — consider pinning to the
# versions this notebook was last run with.
!apt-get update
!apt-get install g++ openjdk-8-jdk 
!pip3 install konlpy JPype1-py3
!bash <(curl -s https://raw.githubusercontent.com/konlpy/konlpy/master/scripts/mecab.sh)

# gensim: used further down to load the pretrained word2vec vectors
!pip install gensim

In [None]:
import itertools
import random
import re
from typing import List

import konlpy
import numpy as np
import pandas as pd
import torch
import torch.utils.data
from konlpy.tag import Mecab
from torchtext import data
from torchtext.vocab import Vectors

### 1. 데이터 로드

In [None]:
# Locations of the train and test CSV files on the mounted Drive
data_path_train = "/content/gdrive/My Drive/dataset/BalancedNewsCorpus/BalancedNewsCorpus_train.csv"
data_path_test= "/content/gdrive/My Drive/dataset/BalancedNewsCorpus/BalancedNewsCorpus_test.csv"

In [None]:
# Read both splits into DataFrames (comma-separated).
# NOTE(review): df_train / df_test are not referenced again in the visible part
# of this notebook; the model pipeline re-reads the same CSVs via TabularDataset.
df_train = pd.read_csv(data_path_train, sep = ',')
df_test = pd.read_csv(data_path_test, sep = ',')

### 2. 전처리 함수 정의

In [None]:
# Stopword handling. The list is the one published in Gil Ho-hyun (2018),
# "A Study on a Korean Stopword List for Text Mining", Urimalgeul 78, 1-25.
# The actual words were removed from this public version of the code, so the
# string below is only a placeholder to be filled in.
str_stopwords="불용어 입력"
# Entries are separated by a tab followed by a space
stopwords=str_stopwords.split('\t ')

# Shared MeCab analyzer used by the tokenizer function
mecab=Mecab()
def tokenizer_morphs(string):
    """Tokenize Korean text into a list of nouns.

    The ellipsis ("…") and middle-dot ("·") characters are removed first, then
    MeCab extracts the nouns. Single-character nouns and stopwords are dropped.

    Args:
        string: Raw text to tokenize.

    Returns:
        List of noun strings, in the order MeCab returned them.
    """
    # str.replace returns a new string, so the result must be re-assigned;
    # calling it as a bare statement leaves the text unchanged.
    string = string.replace("…", "").replace("·", "")
    nouns = mecab.nouns(string)  # nouns only
    return [noun for noun in nouns if len(noun) > 1 and noun not in stopwords]

# Bigram helper (ref: https://github.com/bentrevett/pytorch-sentiment-analysis/blob/master/3%20-%20Faster%20Sentiment%20Analysis.ipynb)
def generate_bigrams(x):
    """Append every distinct adjacent token pair to the token list.

    Each pair is added once as a single space-joined string. The list is
    modified in place and also returned; the order in which the bigrams are
    appended follows set iteration order.
    """
    unique_pairs = {(left, right) for left, right in zip(x, x[1:])}
    x.extend(' '.join(pair) for pair in unique_pairs)
    return x

### 3. Field 정의

In [None]:
from torchtext.data import Field, LabelField, TabularDataset

In [None]:
# Seed the RNGs this notebook draws from so a Restart & Run All reproduces the
# same numbers: torch drives the random vectors for out-of-vocabulary tokens
# and the layer initialisation, and Python's `random` state is what the
# torchtext iterators use for shuffling.
SEED = 1234
random.seed(SEED)
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

# TEXT: noun tokens from MeCab, then augmented with bigrams before numericalizing
TEXT = data.Field(sequential=True, use_vocab=True, tokenize=tokenizer_morphs, preprocessing = generate_bigrams)
# LABEL: one class id per example
LABEL = data.LabelField(dtype=torch.int64)

### 4. TabularDataset 생성

In [None]:
# Columns in CSV order; a None field means the column is not loaded.
raw_datafields = [
    ("filename", None),
    ("date", None),
    ("NewsPaper", None),
    ("Topic", LABEL),
    ("News", TEXT),
]

# Both splits use identical loader settings; train is built first, then test.
train_data, test_data = [
    TabularDataset(path=csv_path, format='csv', skip_header=True, fields=raw_datafields)
    for csv_path in (data_path_train, data_path_test)
]

### 5. Vocab 생성

In [None]:
import torchtext.vocab as vocab

# Both vocabularies are built from the training split only, so nothing from
# the test split leaks into them. min_freq=10 is the frequency threshold
# passed for text tokens.
TEXT.build_vocab(train_data, min_freq=10)
LABEL.build_vocab(train_data)

### 6. 사전학습 임베딩 불러오기
Reference: https://rohit-agrawal.medium.com/using-fine-tuned-gensim-word2vec-embeddings-with-torchtext-and-pytorch-17eea2883cd

In [None]:
import gensim
from gensim.models.keyedvectors import KeyedVectors
from tqdm import tqdm_notebook

# Use the morpheme-based word2vec embedding.
path = '/content/gdrive/My Drive/model/'
# The file is read as word2vec text format (binary=False) with UTF-8 encoding.
Word2Vec_300D_token_model = KeyedVectors.load_word2vec_format(path + 'Word2Vec_300D_token.model', binary=False, encoding='utf-8')

In [None]:
# Build one vector per vocabulary entry, in vocabulary index order (itos), so
# that position i of the list is the vector for token index i.
embedding_dim = Word2Vec_300D_token_model.vector_size
word2vec_vectors = []

for token in tqdm_notebook(TEXT.vocab.itos):
    # Membership is tested on the KeyedVectors object itself; this works on
    # both gensim 3.x and 4.x, whereas `.wv.vocab` no longer exists in 4.x.
    if token in Word2Vec_300D_token_model:
        # The pretrained model knows this token: reuse its vector
        word2vec_vectors.append(torch.FloatTensor(Word2Vec_300D_token_model[token]))
    else:
        # No pretrained vector for this token: start from a random one
        word2vec_vectors.append(torch.randn(embedding_dim))

# Attach the vectors to the vocabulary (exposed afterwards as TEXT.vocab.vectors)
TEXT.vocab.set_vectors(TEXT.vocab.stoi, word2vec_vectors, embedding_dim)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  This is separate from the ipykernel package so we can avoid doing imports until


HBox(children=(FloatProgress(value=0.0, max=22873.0), HTML(value='')))

  after removing the cwd from sys.path.
  """





### 7. Iterator 정의

In [None]:
BATCH_SIZE = 128

# Use the GPU when one is available, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Batch iterators for both splits; batches are created directly on `device`
train_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, test_data), 
    batch_size = BATCH_SIZE, 
    device = device,
    sort_key=lambda x: len(x.News), # key used to group examples: token count of the News field
    sort_within_batch=False)

### 8. FastText Model 정의
Reference: https://github.com/bentrevett/pytorch-sentiment-analysis/blob/master/3%20-%20Faster%20Sentiment%20Analysis.ipynb

In [None]:
import torch.nn as nn
import torch.nn.functional as F

class FastText(nn.Module):
    """Embed tokens, average the embeddings over the sequence, classify linearly."""

    def __init__(self, vocab_size, embedding_dim, output_dim, pad_idx):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)
        self.fc = nn.Linear(embedding_dim, output_dim)

    def forward(self, text):
        """Map token ids of shape [sent len, batch size] to scores of shape [batch size, output_dim]."""
        # [sent len, batch size, emb dim] -> [batch size, sent len, emb dim]
        token_vectors = self.embedding(text).permute(1, 0, 2)

        # A (sent len, 1) pooling window averages over the whole sequence axis;
        # the leftover singleton axis is squeezed away -> [batch size, emb dim]
        sent_len = token_vectors.shape[1]
        sentence_vector = F.avg_pool2d(token_vectors, (sent_len, 1)).squeeze(1)

        return self.fc(sentence_vector)

In [None]:
# Hyperparameters
INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = 300
OUTPUT_DIM = len(LABEL.vocab)
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]

# Instantiate the model with those hyperparameters
model = FastText(INPUT_DIM, EMBEDDING_DIM, OUTPUT_DIM, PAD_IDX)

In [None]:
# Count the trainable parameters
def count_parameters(model):
    """Return the total element count of all parameters that require gradients."""
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 6,864,609 trainable parameters


In [None]:
# Load the pretrained embedding values into the model
pretrained_embeddings = TEXT.vocab.vectors

# In-place copy into the embedding weights; as the last expression of the
# cell, the resulting tensor is also displayed.
model.embedding.weight.data.copy_(pretrained_embeddings)

tensor([[-1.6938e+00,  9.5898e-01, -5.5622e-01,  ...,  8.2150e-01,
         -1.9667e-01, -9.8671e-02],
        [ 7.3189e-01, -1.1772e-01, -1.2516e+00,  ..., -1.8687e-02,
         -1.1934e+00,  2.6147e-01],
        [ 9.8296e-02,  6.8959e-02, -9.7626e-02,  ...,  1.9529e-02,
         -4.3655e-02,  5.1375e-02],
        ...,
        [-5.7113e-02, -1.0315e-01,  1.0424e-01,  ..., -6.7090e-02,
          3.6692e-02,  4.1873e-02],
        [-7.6801e-02, -3.9502e-02,  7.4451e-04,  ...,  4.1259e-02,
          2.8716e-03,  5.0831e-02],
        [-1.2414e+00, -1.6570e-01, -1.3894e-01,  ..., -9.1861e-02,
         -5.1316e-01,  1.0526e+00]])

In [None]:
#<pad> 토큰에 대해 초기 임베딩값을 0으로 설정 
# Reset the padding row to zeros: the copy of the pretrained table overwrote
# every row, including the one at PAD_IDX.
model.embedding.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIM)

### 9. Training

In [None]:
# Define the optimizer and the loss function
import torch.optim as optim

optimizer = optim.Adam(model.parameters())

criterion = nn.CrossEntropyLoss()

# Move the model and the loss module to the selected device
model = model.to(device)
criterion = criterion.to(device)

In [None]:
def categorical_accuracy(preds, y):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8

    Args:
        preds: Class scores with one row per example; the predicted class is
            the argmax over dim 1.
        y: Target class indices, one per example.

    Returns:
        One-element float tensor holding the fraction of correct predictions.
    """
    predicted = preds.argmax(dim=1)  # index of the highest score per example
    correct = predicted.eq(y)
    # The denominator is created on y's own device, so the function does not
    # depend on a notebook-level `device` variable being defined.
    batch_size = torch.tensor([y.shape[0]], dtype=torch.float, device=y.device)
    return correct.sum() / batch_size

In [None]:
def train(model, iterator, optimizer, criterion):
    """Run one optimisation pass over `iterator`.

    Each batch supplies its inputs as `batch.News` and its targets as
    `batch.Topic`. Returns a tuple (loss, accuracy), each being the sum of the
    per-batch values divided by the number of batches.
    """
    model.train()

    loss_total, acc_total = 0, 0

    for batch in iterator:
        optimizer.zero_grad()

        logits = model(batch.News)
        batch_loss = criterion(logits, batch.Topic)
        batch_acc = categorical_accuracy(logits, batch.Topic)

        batch_loss.backward()
        optimizer.step()

        loss_total += batch_loss.item()
        acc_total += batch_acc.item()

    n_batches = len(iterator)
    return loss_total / n_batches, acc_total / n_batches

In [None]:
def evaluate(model, iterator, criterion):
    """Score the model on `iterator` without updating any weights.

    The model is put in eval mode and gradients are disabled. Returns a tuple
    (loss, accuracy), each being the sum of the per-batch values divided by
    the number of batches.
    """
    model.eval()

    loss_total, acc_total = 0, 0

    with torch.no_grad():
        for batch in iterator:
            logits = model(batch.News)
            loss_total += criterion(logits, batch.Topic).item()
            acc_total += categorical_accuracy(logits, batch.Topic).item()

    n_batches = len(iterator)
    return loss_total / n_batches, acc_total / n_batches

In [None]:
import time

def epoch_time(start_time, end_time):
    """Split the elapsed time between two timestamps (in seconds) into
    whole minutes and the remaining whole seconds."""
    duration = end_time - start_time
    minutes = int(duration / 60)
    seconds = int(duration - minutes * 60)
    return minutes, seconds

In [None]:
N_EPOCHS = 40

best_train_loss = float('inf')

for epoch in range(N_EPOCHS):

    start_time = time.time()

    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)

    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    # Checkpoint only when the epoch improved on the best loss seen so far.
    # The save call must live inside the `if`; otherwise the file is rewritten
    # every epoch and the best-loss tracking has no effect.
    # Selection is by training loss because no validation split is made here.
    if train_loss < best_train_loss:
        best_train_loss = train_loss
        torch.save(model.state_dict(), 'tut5-model.pt')

    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')

Epoch: 01 | Epoch Time: 0m 2s
	Train Loss: 2.172 | Train Acc: 14.99%
Epoch: 02 | Epoch Time: 0m 1s
	Train Loss: 2.099 | Train Acc: 32.08%
Epoch: 03 | Epoch Time: 0m 1s
	Train Loss: 1.954 | Train Acc: 53.40%
Epoch: 04 | Epoch Time: 0m 1s
	Train Loss: 1.729 | Train Acc: 61.29%
Epoch: 05 | Epoch Time: 0m 1s
	Train Loss: 1.464 | Train Acc: 70.16%
Epoch: 06 | Epoch Time: 0m 1s
	Train Loss: 1.221 | Train Acc: 75.69%
Epoch: 07 | Epoch Time: 0m 1s
	Train Loss: 1.036 | Train Acc: 79.32%
Epoch: 08 | Epoch Time: 0m 1s
	Train Loss: 0.890 | Train Acc: 80.90%
Epoch: 09 | Epoch Time: 0m 1s
	Train Loss: 0.780 | Train Acc: 82.87%
Epoch: 10 | Epoch Time: 0m 1s
	Train Loss: 0.690 | Train Acc: 84.67%
Epoch: 11 | Epoch Time: 0m 1s
	Train Loss: 0.613 | Train Acc: 85.73%
Epoch: 12 | Epoch Time: 0m 1s
	Train Loss: 0.555 | Train Acc: 87.09%
Epoch: 13 | Epoch Time: 0m 1s
	Train Loss: 0.500 | Train Acc: 88.33%
Epoch: 14 | Epoch Time: 0m 1s
	Train Loss: 0.454 | Train Acc: 89.30%
Epoch: 15 | Epoch Time: 0m 1s
	Tra

### 10. Testing

In [None]:
# Restore the checkpoint written by the training loop. map_location remaps the
# stored tensors onto the current `device`, so a checkpoint saved during a GPU
# session can still be loaded when this runtime only has a CPU.
model.load_state_dict(torch.load('tut5-model.pt', map_location=device))

test_loss, test_acc = evaluate(model, test_iterator, criterion)

print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')

Test Loss: 0.457 | Test Acc: 84.58%




### 11. User Input

####  뉴스 labels
    -  'IT/과학': 0, '경제': 1, '문화': 2, '미용/건강': 3, '사회': 4, '생활': 5, '스포츠': 6, '연예': 7, '정치': 8

In [None]:
def predict_news(model, sentence, min_len=5):
    """Predict the topic class index for a single piece of news text.

    Args:
        model: Trained classifier; called with a LongTensor of token ids of
            shape [sent len, 1].
        sentence: Raw news text.
        min_len: Minimum number of tokens fed to the model; shorter inputs are
            right-padded with the TEXT pad token.

    Returns:
        The predicted class index as a Python int (an index into LABEL.vocab).
    """
    model.eval()

    # Apply the same pipeline the TEXT field uses for training data: noun
    # tokenization followed by bigram augmentation. Without the bigrams the
    # model would see a different feature set than it was trained on.
    tokenized = generate_bigrams(tokenizer_morphs(sentence))

    # Always feed at least one token so the average pooling in the model never
    # receives an empty sequence, even when min_len <= 0 and no noun was found.
    pad_to = max(min_len, 1)
    if len(tokenized) < pad_to:
        tokenized += [TEXT.pad_token] * (pad_to - len(tokenized))

    indexed = [TEXT.vocab.stoi[t] for t in tokenized]
    tensor = torch.LongTensor(indexed).to(device).unsqueeze(1)  # [sent len, 1]

    # Inference only: no gradient bookkeeping needed
    with torch.no_grad():
        preds = model(tensor)

    return preds.argmax(dim=1).item()