# RNN 

- Data : NSMC 네이버 영화 리뷰 데이터셋
- Model : RNN, LSTM, GRU
- Task : 긍정/부정 분류하기 (Classification)
- Loss : BCE loss

## 1. Data preprocessing functions

In [3]:
!pip install git+https://github.com/ssut/py-hanspell.git

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting git+https://github.com/ssut/py-hanspell.git
  Cloning https://github.com/ssut/py-hanspell.git to /tmp/pip-req-build-hotc78p0
  Running command git clone -q https://github.com/ssut/py-hanspell.git /tmp/pip-req-build-hotc78p0
Building wheels for collected packages: py-hanspell
  Building wheel for py-hanspell (setup.py) ... [?25l[?25hdone
  Created wheel for py-hanspell: filename=py_hanspell-1.1-py3-none-any.whl size=4868 sha256=4913cdc5438dc09b57f5fc121061a75d9e1ee1d7152f81bcf11ec8af33edce83
  Stored in directory: /tmp/pip-ephem-wheel-cache-wy8cmh5r/wheels/ab/f5/7b/d4124bb329c905301baed80e2ae45aa14e824f62ebc3ec2cc4
Successfully built py-hanspell
Installing collected packages: py-hanspell
Successfully installed py-hanspell-1.1


In [4]:
from hanspell import spell_checker
#from pykospacing import Spacing

# sort, unique -> 리눅스 창에서 하는 게 나음
# 특수문자는 감정과 관련될 수 있어서 제거하지 않음

# basic 전처리
def basic_preprocess(sentence):
    """Normalize symbols in a sentence, then lowercase and trim it.

    Each symbol in the table below is substituted with its plain-text
    stand-in (some map to the empty string, i.e. they are dropped).

    Args:
        sentence: Raw input string.

    Returns:
        The normalized, lowercased string without leading/trailing whitespace.
    """
    # Symbol normalization table: typographic variant -> plain replacement.
    replacements = {
        "‘": "'",
        "₹": "e",
        "´": "'",
        "°": "",
        "€": "e",
        "™": "tm",
        "√": " sqrt ",
        "×": "x",
        "²": "2",
        "—": "-",
        "–": "-",
        "’": "'",
        "_": "-",
        "`": "'",
        '“': '"',
        '”': '"',
        "£": "e",
        '∞': 'infinity',
        'θ': 'theta',
        '÷': '/',
        'α': 'alpha',
        '•': '.',
        'à': 'a',
        '−': '-',
        'β': 'beta',
        '∅': '',
        '³': '3',
        'π': 'pi',
    }
    normalized = sentence
    for symbol, substitute in replacements.items():
        normalized = normalized.replace(symbol, substitute)
    # Unify letter case, then drop surrounding whitespace.
    return normalized.lower().strip()

# 맞춤법 교정
def spell_check(sentence):
    """Return the spelling-corrected version of `sentence`.

    Delegates to `spell_checker.check` (py-hanspell) and extracts the
    'checked' entry from the result's dictionary form.
    """
    checked = spell_checker.check(sentence).as_dict()
    return checked['checked']

# 띄어쓰기 교정
#def spacing_check(sentence):
#    spacing = Spacing()
#    new_sentence = spacing(sentence) 
#    return new_sentence

# TODO: 이상한 문자 제거 (아직 구현되지 않음)


# 전체 전처리 함수
def preprocess(sentence):
    """Run the full preprocessing pipeline on a single sentence.

    Only the basic normalization step is currently active; the spelling and
    spacing correction steps are left disabled as commented-out hooks.

    Args:
        sentence: Raw input string.

    Returns:
        The preprocessed string.
    """
    # Step 1: basic normalization.
    cleaned = basic_preprocess(sentence)

    # Step 2 (disabled): spelling correction.
    # cleaned = spell_check(cleaned)

    # Step 3 (disabled): spacing correction.
    # cleaned = spacing_check(cleaned)

    return cleaned

In [5]:
import torch
# Use the CUDA device when one is available; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

## 2. Prepare Dataset

### install

In [6]:
!python3 -m pip install konlpy

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting konlpy
  Downloading konlpy-0.6.0-py2.py3-none-any.whl (19.4 MB)
[K     |████████████████████████████████| 19.4 MB 1.1 MB/s 
[?25hCollecting JPype1>=0.7.0
  Downloading JPype1-1.4.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl (453 kB)
[K     |████████████████████████████████| 453 kB 52.0 MB/s 
Installing collected packages: JPype1, konlpy
Successfully installed JPype1-1.4.0 konlpy-0.6.0


In [7]:
!bash <(curl -s https://raw.githubusercontent.com/konlpy/konlpy/master/scripts/mecab.sh)

Installing automake (A dependency for mecab-ko)
Get:1 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  InRelease [1,581 B]
Get:2 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/ InRelease [3,626 B]
Get:3 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/ Packages [85.6 kB]
Get:4 http://security.ubuntu.com/ubuntu bionic-security InRelease [88.7 kB]
Get:5 http://ppa.launchpad.net/c2d4u.team/c2d4u4.0+/ubuntu bionic InRelease [15.9 kB]
Ign:6 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  InRelease
Get:7 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  Packages [824 kB]
Hit:8 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  Release
Hit:10 http://archive.ubuntu.com/ubuntu bionic InRelease
Get:11 http://archive.ubuntu.com/ubuntu bionic-updates InRelease [88.7 kB]
Hit:12 http://ppa.launchpad.net/cran/libgit2/ubuntu bionic InRelease
Get:13 htt

### code

In [8]:
import torch
import numpy as np
from torch.utils.data import Dataset
from konlpy.tag import Mecab
from collections import Counter
from itertools import chain
from functools import partial
from tqdm import tqdm
import pickle

#from preprocessing import basic_preprocess, preprocess
#def tokenize(tokenizer):

class Vocab():
    """Load, preprocess and tokenize the NSMC train/test files and build a vocabulary.

    On construction the instance:
      1. reads ./nsmc/ratings_train.txt and ./nsmc/ratings_test.txt,
      2. preprocesses and tokenizes every sentence,
      3. builds id<->token tables from the train AND test tokens
         (tokens seen fewer than 2 times are left out and map to <UNK>),
      4. dumps the corpora, labels and both tables to ./vocab.pkl.

    Attributes set by __init__:
        tokenizer: object exposing `morphs(sentence)`.
        train_corpus / test_corpus: lists of token lists.
        train_labels / test_labels: lists of int labels.
        id2token: list mapping id -> token.
        token2id: dict mapping token -> id.
        vocab_len: number of entries in token2id.
        obj_to_save: the objects written to ./vocab.pkl, in write order.
    """
    PAD_TOKEN = '<PAD>'
    PAD_TOKEN_ID = 0
    UNK_TOKEN = '<UNK>'
    UNK_TOKEN_ID = 1
    SOS_TOKEN = '<SOS>'
    SOS_TOKEN_ID = 2
    EOS_TOKEN = '<EOS>'
    EOS_TOKEN_ID = 3

    def __init__(self, tokenizer=Mecab, do_preprocess=True) -> None:
        """Build the corpora and the vocabulary, then cache them to ./vocab.pkl.

        Args:
            tokenizer: A tokenizer instance exposing `morphs`, or a tokenizer
                class (such as the default), which is instantiated here.
            do_preprocess: If true use `preprocess`, else only `basic_preprocess`.
        """
        # The default is the Mecab *class*; calling `.morphs` on the class
        # itself would fail, so instantiate it when a class is given.
        self.tokenizer = tokenizer() if isinstance(tokenizer, type) else tokenizer

        self.train_corpus, self.train_labels = self._load_split(
            "./nsmc/ratings_train.txt", do_preprocess, desc="get_train_data")
        self.test_corpus, self.test_labels = self._load_split(
            "./nsmc/ratings_test.txt", do_preprocess, desc="get_test_data")

        self.train_corpus = list(map(self._tokenize_sent, self.train_corpus))
        self.test_corpus = list(map(self._tokenize_sent, self.test_corpus))

        # NOTE(review): the original author left an open question here about
        # whether the test corpus should be part of the vocabulary. It
        # currently is; behavior is kept unchanged.
        self.id2token, self.token2id = self._build_vocab(
            self.train_corpus + self.test_corpus, min_freq=2)
        self.vocab_len = len(self.token2id)

        # Cache: objects are pickled one after another into a single file and
        # must be read back in the same order.
        self.obj_to_save = [self.train_corpus, self.train_labels, self.test_corpus,
                            self.test_labels, self.token2id, self.id2token]
        with open('./vocab.pkl', 'wb') as f:
            for obj in self.obj_to_save:
                pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

    def _load_split(self, path, do_preprocess, desc):
        """Read one tab-separated data file and return (sentences, labels).

        The first line is skipped as a header. Every other line must split
        into exactly three tab-separated fields; the second is the sentence
        and the first character of the third is the integer label. Rows whose
        sentence is empty after preprocessing are dropped.
        """
        corpus, labels = [], []
        # Context manager so the file handle is always closed.
        with open(path, 'r', encoding="utf-8") as f:
            lines = f.readlines()
        for i, line in enumerate(tqdm(lines, desc=desc)):
            if i == 0:
                continue  # header row
            _, sentence, label = line.split(sep='\t')
            label = label[0]  # the label field still carries the trailing newline
            sentence = self._preprocess(sentence, do_preprocess)
            if sentence:
                corpus.append(sentence)
                labels.append(int(label))
        return corpus, labels

    def _preprocess(self, text, do_preprocess):
        """Apply the full or the basic preprocessing function to `text`."""
        if do_preprocess:
            new_text = preprocess(text)
        else:
            new_text = basic_preprocess(text)
        return new_text

    def _tokenize_sent(self, sentence):
        """Split a sentence into tokens with the tokenizer's `morphs` method."""
        tokens = self.tokenizer.morphs(sentence)
        return tokens

    def _build_vocab(self, tokens, min_freq=1):
        """Build the id->token list and token->id dict.

        Args:
            tokens: Iterable of token lists (one list per sentence).
            min_freq: Minimum occurrence count for a token to be included.

        Returns:
            (id2token, token2id); the four special tokens occupy ids 0-3.
        """
        SPECIAL_TOKENS = [self.PAD_TOKEN, self.UNK_TOKEN, self.SOS_TOKEN, self.EOS_TOKEN]
        counts = Counter(chain.from_iterable(tokens))
        id2token = SPECIAL_TOKENS + [word for word, count in counts.items() if count >= min_freq]
        token2id = {word: idx for idx, word in enumerate(id2token)}

        assert id2token[self.UNK_TOKEN_ID] == self.UNK_TOKEN and token2id[self.UNK_TOKEN] == self.UNK_TOKEN_ID, \
            "[UNK] 토큰을 적절히 삽입하세요"
        assert len(id2token) == len(token2id), \
            "id2word과 word2id의 크기는 같아야 합니다"
        return id2token, token2id

class NSMCDataset(Dataset):
    """Dataset of (token-id tensor, label) pairs built from a tokenized corpus.

    Each sentence (a sequence of tokens) is converted once, at construction
    time, into a 1-D long tensor of token ids; tokens missing from
    `token2id` are mapped to the unknown-token id.
    """

    # Id used for out-of-vocabulary tokens.
    UNK_TOKEN_ID = 1

    def __init__(self, corpus, labels, token2id, id2token, tokenizer=Mecab()):
        """Store the inputs and encode the whole corpus.

        Args:
            corpus: Sequence of token sequences (one per sentence).
            labels: Sequence of labels, index-aligned with `corpus`.
            token2id: Dict mapping token -> id.
            id2token: Sequence mapping id -> token (stored, not used here).
            tokenizer: Stored on the instance but not used by this class.
                Note the default is created once, when the class is defined.
        """
        super().__init__()
        print(len(token2id))
        self.corpus = corpus
        self.labels = labels
        self.token2id = token2id
        self.id2token = id2token
        self.tokenizer = tokenizer

        self.vocab_len = len(self.token2id)
        self._encoding_corpus()
        # From here on `corpus` holds the encoded tensors, not the tokens.
        self.corpus = self.input_ids

    def __getitem__(self, index):
        """Return the (token-id tensor, label) pair at `index`."""
        return self.corpus[index], self.labels[index]

    def __len__(self):
        """Return the number of sentences."""
        return len(self.corpus)

    def _encoding_corpus(self):
        """Encode every sentence in `self.corpus` into `self.input_ids`."""
        self.input_ids = list(map(partial(self._encode_sent, token2id=self.token2id), self.corpus))

    def _encode_sent(self, sentence, token2id):
        """Map one token sequence to a 1-D long tensor of ids."""
        unk_id = self.UNK_TOKEN_ID
        # dtype is pinned: without it an empty token list would produce a
        # float32 tensor instead of an integer one.
        token_ids = torch.tensor(
            [token2id.get(token, unk_id) for token in sentence], dtype=torch.long)
        return token_ids






## 3. Model

In [22]:
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

class RNNClassification(nn.Module):
    """Binary classifier: embedding -> bidirectional multi-layer RNN -> linear -> sigmoid.

    The RNN output at each sequence's last valid time step (index
    `length - 1`) is fed to the linear layer, so padding never reaches it.
    """

    def __init__(
        self,
        vocab_size,
        emb_size,
        hidden_size=128,
        num_layers=4):
        """Create the layers.

        Args:
            vocab_size: Number of rows in the embedding table.
            emb_size: Embedding dimension (RNN input size).
            hidden_size: RNN hidden size per direction.
            num_layers: Number of stacked RNN layers.
        """
        super().__init__()
        self.emb = nn.Embedding(vocab_size, emb_size)
        self.rnn = nn.RNN(
            input_size=emb_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            bidirectional=True)
        # Two directions are concatenated, hence hidden_size * 2 inputs.
        self.fc = nn.Linear(hidden_size * 2, 1)
        self.sigmoid = nn.Sigmoid()
        self._initialize_weights()

    def _initialize_weights(self):
        """Xavier-initialize the weight of the output layer."""
        nn.init.xavier_uniform_(self.fc.weight)

    def forward(self, x, lens):
        """Compute one probability per sequence.

        Args:
            x: Padded batch of token ids, batch-first.
            lens: Per-sequence lengths, in batch order (need not be sorted).

        Returns:
            Tensor with one row per sequence and a single sigmoid output column.
        """
        embedded = self.emb(x)
        print("emb:", embedded.size())
        packed = pack_padded_sequence(embedded, lens, batch_first=True, enforce_sorted=False)
        packed_out, _ = self.rnn(packed)
        padded_out, _ = pad_packed_sequence(packed_out, batch_first=True)
        print("rnn:", padded_out.size())
        # Select, for every row, the output at its last valid time step.
        rows = torch.arange(padded_out.size(0), device=padded_out.device)
        last_steps = torch.as_tensor(lens, dtype=torch.long, device=padded_out.device) - 1
        last_out = padded_out[rows, last_steps]
        return self.sigmoid(self.fc(last_out))

class LSTMClassification(nn.Module):
    """Binary classifier: embedding -> bidirectional multi-layer LSTM -> linear -> sigmoid.

    The LSTM output at each sequence's last valid time step (index
    `length - 1`) is fed to the linear layer, so padding never reaches it.
    """

    def __init__(
        self,
        vocab_size,
        emb_size,
        hidden_size=128,
        num_layers=4):
        """Create the layers.

        Args:
            vocab_size: Number of rows in the embedding table.
            emb_size: Embedding dimension (LSTM input size).
            hidden_size: LSTM hidden size per direction.
            num_layers: Number of stacked LSTM layers.
        """
        # Must initialise *this* class; naming another class in super()
        # raises TypeError because `self` is not an instance of it.
        super().__init__()
        self.emb = nn.Embedding(vocab_size, emb_size)
        self.lstm = nn.LSTM(
            input_size=emb_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            bidirectional=True)
        # Two directions are concatenated, hence hidden_size * 2 inputs.
        self.fc = nn.Linear(hidden_size * 2, 1)
        self.sigmoid = nn.Sigmoid()
        self._initialize_weights()

    def _initialize_weights(self):
        """Xavier-initialize the weight of the output layer."""
        nn.init.xavier_uniform_(self.fc.weight)

    def forward(self, x, lens):
        """Compute one probability per sequence.

        Args:
            x: Padded batch of token ids, batch-first.
            lens: Per-sequence lengths, in batch order (need not be sorted).

        Returns:
            Tensor with one row per sequence and a single sigmoid output column.
        """
        outs = self.emb(x)
        print("emb:", outs.size())
        outs_packed = pack_padded_sequence(outs, lens, batch_first=True, enforce_sorted=False)
        y, (hidden, cell) = self.lstm(outs_packed)
        y, lens_unpacked = pad_packed_sequence(y, batch_first=True)
        print("rnn:", y.size())
        # Select, for every row, the output at its last valid time step.
        rows = torch.arange(y.size(0), device=y.device)
        last_steps = torch.as_tensor(lens, dtype=torch.long, device=y.device) - 1
        y = y[rows, last_steps]
        y = self.sigmoid(self.fc(y))
        return y


class GRUClassification(nn.Module):
    """Binary classifier: embedding -> bidirectional multi-layer GRU -> linear -> sigmoid.

    The GRU output at each sequence's last valid time step (index
    `length - 1`) is fed to the linear layer, so padding never reaches it.
    """

    def __init__(
        self,
        vocab_size,
        emb_size,
        hidden_size=128,
        num_layers=4):
        """Create the layers.

        Args:
            vocab_size: Number of rows in the embedding table.
            emb_size: Embedding dimension (GRU input size).
            hidden_size: GRU hidden size per direction.
            num_layers: Number of stacked GRU layers.
        """
        # Must initialise *this* class; naming another class in super()
        # raises TypeError because `self` is not an instance of it.
        super().__init__()
        self.emb = nn.Embedding(vocab_size, emb_size)
        self.gru = nn.GRU(
            input_size=emb_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            bidirectional=True)
        # Two directions are concatenated, hence hidden_size * 2 inputs.
        self.fc = nn.Linear(hidden_size * 2, 1)
        self.sigmoid = nn.Sigmoid()
        self._initialize_weights()

    def _initialize_weights(self):
        """Xavier-initialize the weight of the output layer."""
        nn.init.xavier_uniform_(self.fc.weight)

    def forward(self, x, lens):
        """Compute one probability per sequence.

        Args:
            x: Padded batch of token ids, batch-first.
            lens: Per-sequence lengths, in batch order (need not be sorted).

        Returns:
            Tensor with one row per sequence and a single sigmoid output column.
        """
        outs = self.emb(x)
        print("emb:", outs.size())
        outs_packed = pack_padded_sequence(outs, lens, batch_first=True, enforce_sorted=False)
        y, hidden = self.gru(outs_packed)
        y, lens_unpacked = pad_packed_sequence(y, batch_first=True)
        print("rnn:", y.size())
        # Select, for every row, the output at its last valid time step.
        rows = torch.arange(y.size(0), device=y.device)
        last_steps = torch.as_tensor(lens, dtype=torch.long, device=y.device) - 1
        y = y[rows, last_steps]
        y = self.sigmoid(self.fc(y))
        return y



## 4. Main

In [None]:
from tqdm import tqdm
import os
import pickle
import torch
import torch.nn as nn

from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader
from konlpy.tag import Mecab

#from dataset import Vocab, NSMCDataset
#from models.rnn import RNNClassification
#from models.lstm import LSTMClassification

# ---- Config ----
model_name = "RNN"
batch_size = 1024
learning_rate = 1e-3
emb_size = 1000
hidden_size = 256
num_layers = 4
num_epochs = 200

# ---- Data ----
print("Preparing Data ...")
# Vocabulary and tokenized corpora: built from scratch when no cache exists,
# otherwise read back from the pickle cache.
if not os.path.isfile('./vocab.pkl'):
    vocab = Vocab(tokenizer=Mecab(), do_preprocess=True)
    train_corpus, train_labels = vocab.train_corpus, vocab.train_labels
    test_corpus, test_labels = vocab.test_corpus, vocab.test_labels
    token2id, id2token = vocab.token2id, vocab.id2token
else:
    # The cache holds several objects pickled back to back; keep loading
    # until the end of the file is reached.
    # NOTE(review): pickle must only be used on trusted files.
    vocab_objs = []
    with open('./vocab.pkl', 'rb') as f:
        while True:
            try:
                data = pickle.load(f)
            except EOFError:
                break
            vocab_objs.append(data)
    (train_corpus, train_labels,
     test_corpus, test_labels,
     token2id, id2token) = vocab_objs

# Datasets for the two splits share the same vocabulary tables.
train_dataset = NSMCDataset(train_corpus, train_labels, token2id, id2token, tokenizer=Mecab())
test_dataset = NSMCDataset(test_corpus, test_labels, token2id, id2token, tokenizer=Mecab())
print("len train_dataset : ", len(train_dataset))


# dataloader
def collate_fn(batched_samples):
    """Collate (token ids, label) samples into a padded batch.

    Args:
        batched_samples: Non-empty sequence of (token_ids, label) pairs, where
            token_ids is a 1-D tensor or sequence of integer ids.

    Returns:
        A tuple (src_sentences, labels, input_lengths):
            src_sentences: long tensor, one row per sample, right-padded with
                PAD_TOKEN_ID to the longest sample in the batch.
            labels: tensor of labels with a trailing singleton dimension.
            input_lengths: list of the unpadded length of each sample.
    """
    PAD_TOKEN_ID = 0
    corpus, labels = zip(*batched_samples)

    input_lengths = [len(sent) for sent in corpus]

    # as_tensor keeps ids as integers throughout; the legacy torch.Tensor(...)
    # constructor builds float32 from sequence input, which cannot represent
    # every large integer id exactly.
    src_sentences = pad_sequence(
        [torch.as_tensor(sentence, dtype=torch.long) for sentence in corpus],
        batch_first=True,
        padding_value=PAD_TOKEN_ID,
    )

    labels = torch.tensor(labels).unsqueeze(-1)
    return src_sentences, labels, input_lengths

# Training batches are reshuffled every epoch; evaluation keeps file order.
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

vocab_size = train_dataset.vocab_len
print("vocab_size : ", vocab_size)

# ---- Model ----
print("Set Model, Loss, Optimizer")
if model_name == "RNN":
    model = RNNClassification(vocab_size, emb_size, hidden_size, num_layers).to(device)
elif model_name == "LSTM":
    model = LSTMClassification(vocab_size, emb_size, hidden_size, num_layers).to(device)
elif model_name == "GRU":
    model = GRUClassification(vocab_size, emb_size, hidden_size, num_layers).to(device)
else:
    # Fail here with a clear message instead of a NameError on `model` below.
    raise ValueError(f"Unknown model_name {model_name!r}; expected 'RNN', 'LSTM' or 'GRU'")

# ---- Loss, Optimizer ----
# BCELoss expects probabilities; the models end with a sigmoid.
criterion = torch.nn.BCELoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=learning_rate)

print("Training...")
# Training loop: each epoch runs one optimization pass over train_loader
# followed by a loss-only evaluation pass over test_loader.
for epoch in range(num_epochs):
    print("EPOCH: ", epoch+1)
    model.train()
    total_loss = 0
    print("-------Train------")
    for i, (sentences, labels, lens) in enumerate(train_loader):
        # forward (lens stays on the host; only tensors are moved)
        sentences, labels = sentences.to(device), labels.to(device)
        outputs = model(sentences, lens)

        # loss (BCELoss needs float targets)
        loss = criterion(outputs.to(torch.float32), labels.to(torch.float32))

        # backward
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        print(f"Epoch {epoch+1}/{num_epochs}, iter {i}/{len(train_loader)} :  train_loss = {loss.item():.4f}")
        print()

    print(f"Epoch {epoch+1}/{num_epochs} :  train_avg_loss = {total_loss/len(train_loader):.4f}")
    print()

    print("-------Eval------")
    model.eval()
    # No gradients are needed for evaluation; disabling autograd avoids
    # building a graph for every evaluation batch.
    with torch.no_grad():
        for i, (sentences, labels, lens) in enumerate(test_loader):
            # forward
            sentences, labels = sentences.to(device), labels.to(device)
            outputs = model(sentences, lens)

            # loss
            loss = criterion(outputs.to(torch.float32), labels.to(torch.float32))

            # print
            print(f"Epoch {epoch+1}/{num_epochs}, iter {i}/{len(test_loader)} : eval_loss = {loss.item():.4f}")
            print()


Set Model, Loss, Optimizer
vocab_size :  34377
Training...
EPOCH:  0
-------Train------
emb: torch.Size([1024, 90, 1000])
rnn: torch.Size([1024, 90, 512])
Epoch 1/200, iter 0/147 :  train_loss = 0.7064

emb: torch.Size([1024, 80, 1000])
rnn: torch.Size([1024, 80, 512])
Epoch 1/200, iter 1/147 :  train_loss = 1.0565

emb: torch.Size([1024, 80, 1000])
rnn: torch.Size([1024, 80, 512])
Epoch 1/200, iter 2/147 :  train_loss = 1.3282

emb: torch.Size([1024, 79, 1000])
rnn: torch.Size([1024, 79, 512])
Epoch 1/200, iter 3/147 :  train_loss = 0.7134

emb: torch.Size([1024, 82, 1000])
rnn: torch.Size([1024, 82, 512])
Epoch 1/200, iter 4/147 :  train_loss = 1.0075

emb: torch.Size([1024, 108, 1000])
rnn: torch.Size([1024, 108, 512])
Epoch 1/200, iter 5/147 :  train_loss = 0.7056

emb: torch.Size([1024, 83, 1000])
rnn: torch.Size([1024, 83, 512])
Epoch 1/200, iter 6/147 :  train_loss = 0.7168

emb: torch.Size([1024, 82, 1000])
rnn: torch.Size([1024, 82, 512])
Epoch 1/200, iter 7/147 :  train_loss 