# IMDB 평점 데이터 분류하기
- 2023.07.19

- pytorch에서 데이터를 로드할 때 보통 전처리 -> dataset -> DataLoader에 적재 과정을 거친다고 함
- 텍스트 분류에서 필요한 전처리는 토큰화 등이 있음
- IMDB 데이터는 (label, data)로 구성되며 label은 별점이 7점 이상인 경우 2, 4점 이하인 경우 1임

### 라이브러리 임포트, 전역 변수 설정

In [11]:
# 라이브러리 임포트
import torch
import torch.nn as nn
from torchtext.datasets import IMDB
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torchtext.data.functional import to_map_style_dataset

from torch.utils.data import DataLoader, random_split

import random

from typing import Iterable, Tuple, Dict

# Select the compute device: CUDA when available, otherwise CPU
device = "cuda" if torch.cuda.is_available() else "cpu"

# Hyperparameters
# NOTE(review): `lr` is not referenced anywhere in the visible code; the
# training cell defines its own `LR` and re-assigns BATCH_SIZE and EPOCHS.
BATCH_SIZE = 64
lr = 1e-3
EPOCHS = 10

# Seed Python's and PyTorch's random number generators
SEED = 5
random.seed(SEED)
torch.manual_seed(SEED)

<torch._C.Generator at 0x2a0feb8f190>

### 데이터 로드
- IMDB 데이터를 generator로 불러오기 때문에, 여러 번 불러와도 큰 성능 저하가 발생하지 않음

In [12]:
def get_data() -> Iterable[Tuple[int, str]]:
    """
    Load the IMDB dataset as one iterator spanning both splits.

    The train and test splits are requested with ``./data/imdb/`` as the
    dataset root, joined with ``+`` and wrapped in ``iter``. The result is
    a one-shot iterator: call this function again to start over.

    Args, Exceptions
    ----------------
    None


    Returns
    -------
    dataset: Iterable, annotated as yielding Tuple[label, text]
    """
    train_split, test_split = IMDB(root="./data/imdb/")
    return iter(train_split + test_split)

### 데이터 살펴보기
- 평점이 7점 이상인 경우 2, 평점이 4점 이하인 경우 1로 레이블링 되어 있다고 함

In [5]:
def view_head(dataset: Iterable[Tuple[int, str]], length: int = 5) -> None:
    """
    Print the first ``length`` records of a dataset.

    Args
    ----
    dataset: Iterable, contains Tuple[label, data]. A one-shot iterator is
        advanced by at most ``length`` items.
    length: int, default is 5, number of records to print. Zero or a
        negative value prints nothing and leaves ``dataset`` untouched.

    Exceptions
    ----------
    None

    returns
    -------
    None
    """
    # Guard first: the loop prints before it checks the limit, so without
    # this a non-positive length would still print (and consume) one record.
    if length <= 0:
        return

    for shown, (label, data) in enumerate(dataset, start=1):
        print(f"label: {label}")
        print(f"data: {data}")

        # Stop right after the last requested record so that no extra item
        # is pulled from a one-shot iterator.
        if shown >= length:
            break


def label_value_counts(dataset: Iterable[Tuple[int, str]]) -> Dict[int, int]:
    """
    Count how many records carry each label.

    Args
    ----
    dataset: Iterable, contains Tuple[label, data]. The data part is
        ignored; a one-shot iterator is fully consumed.

    Exceptions
    ----------
    None

    returns
    -------
    counts: Dict[label: count], keyed by the label values themselves in
        order of first appearance. Empty when ``dataset`` is empty.
    """
    counts: Dict[int, int] = {}

    for label, _ in dataset:
        counts[label] = counts.get(label, 0) + 1

    return counts


In [6]:
# Peek at the first few records of a freshly loaded dataset
dataset = get_data()
view_head(dataset)

label: 1
data: I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard that at first it was seized by U.S. customs if it ever tried to enter this country, therefore being a fan of films considered "controversial" I really had to see this for myself.<br /><br />The plot is centered around a young Swedish drama student named Lena who wants to learn everything she can about life. In particular she wants to focus her attentions to making some sort of documentary on what the average Swede thought about certain political issues such as the Vietnam War and race issues in the United States. In between asking politicians and ordinary denizens of Stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men.<br /><br />What kills me about I AM CURIOUS-YELLOW is that 40 years ago, this was considered pornographic. Really, the sex and nudity scenes are few and f

In [7]:
# Reload before counting: get_data() returns a one-shot iterator, and the
# previous one was already advanced by view_head
dataset = get_data()
label_value_counts(dataset)

{1: 25000, 2: 25000}

### 데이터 전처리

#### raw dataset으로부터 어휘집: vocab 만들기

In [13]:
from typing import Generator

dataset = get_data()

# Tokenizer used to split raw strings into tokens (torchtext "basic_english")
tokenizer = get_tokenizer("basic_english")

# generator 데이터의 각 요소에 접근하여 token을 생성하는 generator
def yield_tokens(data_iter: Iterable) -> Generator:
    '''
    Lazily tokenize the text field of every record.

    Args
    ----
    data_iter: Iterable[label, text], source records; the label is ignored.

    Exceptions
    ----------
    None

    Returns
    -------
    Generator yielding ``tokenizer(text)`` for each record, in input order.
    '''
    yield from (tokenizer(text) for _label, text in data_iter)

# Build the vocabulary from the tokenized dataset, with "<unk>" as a special token
vocab = build_vocab_from_iterator(yield_tokens(dataset), specials=["<unk>"])
# Use the "<unk>" index as the default for tokens missing from the vocabulary
vocab.set_default_index(vocab["<unk>"])

#### vocab은 토큰화된 목록을 정수로 변환함

In [14]:
# Example: a tokenized sentence is mapped to a list of integer vocabulary indices
vocab(tokenizer("here is an example"))

[135, 9, 40, 471]

#### 텍스트 처리 파이프라인 준비

In [24]:
# Named functions instead of lambdas bound to names: they show up by name in
# tracebacks and, unlike lambdas, can be pickled.
def text_pipeline(x):
    """Tokenize the raw text ``x`` and map the tokens through ``vocab``."""
    return vocab(tokenizer(x))


def label_pipeline(x):
    """Shift a dataset label (1 or 2) to a zero-based class index (0 or 1)."""
    return int(x) - 1

DataLoader의 collate_fn은 샘플들을 하나의 batch로 묶어 불러올 때 호출되는 함수임

In [25]:
def collate_batch(batch):
    """
    Collate (label, text) samples into three tensors on ``device``.

    Returns a tuple ``(labels, tokens, offsets)``:
    - labels: int64 tensor with one ``label_pipeline`` result per sample.
    - tokens: the ``text_pipeline`` ids of all samples concatenated into one
      1-D int64 tensor.
    - offsets: start position of each sample inside ``tokens``.
    """
    # Encode every sample once: (class index, 1-D tensor of token ids)
    encoded = [
        (
            label_pipeline(raw_label),
            torch.tensor(text_pipeline(raw_text), dtype=torch.int64),
        )
        for raw_label, raw_text in batch
    ]

    labels = torch.tensor([cls for cls, _ in encoded], dtype=torch.int64)
    tokens = torch.cat([ids for _, ids in encoded])

    # Each sample starts where the previous ones end: running sum of the
    # lengths, led by 0 for the first sample; the last length is not needed.
    lengths = [0] + [ids.size(0) for _, ids in encoded]
    offsets = torch.tensor(lengths[:-1]).cumsum(dim=0)

    return labels.to(device), tokens.to(device), offsets.to(device)

In [26]:
# The train and test splits provided by IMDB both hold 25000 labelled samples,
# so the two are simply merged and then re-split here.

dataset = to_map_style_dataset(get_data())

num_train = int(len(dataset) * 0.7)
split_train, split_others = random_split(dataset, [num_train, len(dataset) - num_train])
# others: 3 / 10

# whole: 3/10 -> 2/10 for whole is 0.666666...
# NOTE(review): the factor below is 0.6, not 2/3, so test ends up at roughly
# 18% and validation at roughly 12% of the whole dataset — confirm the intent.
num_test = int(len(split_others) * 0.6)
split_test, split_valid = random_split(split_others, [num_test, len(split_others) - num_test])

# NOTE(review): `vaild_iter` looks like a typo for `valid_iter`; none of these
# three iterators is used in the visible code.
train_iter, test_iter, vaild_iter = map(iter, [split_train, split_test, split_valid])

# Only the training loader shuffles; every loader batches through collate_batch
train_dataloader = DataLoader(split_train, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_batch)
test_dataloader = DataLoader(split_test, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_batch)
valid_dataloader = DataLoader(split_valid, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_batch)

### RNN 모델 구현
- Word Embedding: 단어를 밀집 벡터의 형태로 표현하는 방법
- nn.Embedding은 단어를 임의의 값을 가지는 밀집 벡터로 변환한 뒤 신경망의 가중치를 학습하는 것과 같은 방식으로
단어 벡터를 학습하는 방법을 사용
- nn.EmbeddingBag는 기본 모드(mode="mean")에서 Embedding 후 각 bag(텍스트) 단위로 torch.mean을 취한 것과 동일하며, 중간 Embedding 결과를 만들지 않고 평균을 즉시 계산하여 성능 및 메모리 측면에서의 장점을 가짐

In [28]:
class RNNModel(nn.Module):
    """
    Bag-of-embeddings text classifier.

    Each text is reduced to one vector by ``nn.EmbeddingBag`` (its default
    mode averages the embeddings of a bag) and that vector is projected to
    class scores by a linear layer.

    NOTE: despite the class name, no recurrent layer is involved. An earlier
    ``nn.RNN(vocab_size, hidden_size)`` member was never called in
    ``forward`` and only allocated a vocabulary-sized weight matrix, so it
    has been removed.

    Args
    ----
    vocab_size: number of rows of the embedding table.
    hidden_size: embedding dimension, also the input width of the classifier.
    num_class: number of output classes.
    """

    def __init__(self, vocab_size, hidden_size, num_class):
        super().__init__()

        self.hidden_size = hidden_size
        self.fc = nn.Linear(hidden_size, num_class)

        self.embedding = nn.EmbeddingBag(vocab_size, hidden_size, sparse=False)
        self.init_weights()

    def init_weights(self):
        """Draw embedding and classifier weights from U(-0.5, 0.5); zero the bias."""
        initrange = 0.5

        nn.init.uniform_(self.embedding.weight, -initrange, initrange)
        nn.init.uniform_(self.fc.weight, -initrange, initrange)
        nn.init.zeros_(self.fc.bias)

    def forward(self, text, offsets):
        """
        Compute class scores for a batch of texts.

        Args
        ----
        text: 1-D integer tensor holding the token ids of all texts in the
            batch, concatenated (the layout produced by ``collate_batch``).
        offsets: 1-D integer tensor with the start index of each text
            inside ``text``.

        Returns
        -------
        Tensor with one row of ``num_class`` scores per text.
        """
        embedded = self.embedding(text, offsets)
        return self.fc(embedded)


In [32]:
# Instantiate the classifier with one embedding row per vocabulary entry,
# 128-dimensional embeddings and 2 output classes, then move it to `device`
model = RNNModel(
    vocab_size=len(vocab),
    hidden_size=128,
    num_class=2,
).to(device)

### 모델 훈련 메서드

In [33]:
import time

def train(dataloader):
    """
    Run one training pass over ``dataloader``.

    Relies on the module-level ``model``, ``optimizer`` and ``criterion``,
    and reads the module-level ``epoch`` (set by the surrounding training
    loop) when it prints a progress line.

    Args
    ----
    dataloader: iterable of ``(label, text, offsets)`` batches.

    Returns
    -------
    None. Every 500 batches a line with the accuracy over the batches since
    the previous report is printed.
    """
    model.train()
    total_acc, total_count = 0, 0
    log_interval = 500
    for idx, (label, text, offsets) in enumerate(dataloader):
        optimizer.zero_grad()
        predicted_label = model(text, offsets)
        loss = criterion(predicted_label, label)
        loss.backward()
        # Clip the total gradient norm to 0.1 before the optimizer update
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1)
        optimizer.step()
        total_acc += (predicted_label.argmax(1) == label).sum().item()
        total_count += label.size(0)
        if idx % log_interval == 0 and idx > 0:
            print('| epoch {:3d} | {:5d}/{:5d} batches '
                  '| accuracy {:8.3f}'.format(epoch, idx, len(dataloader),
                                              total_acc/total_count))
            # Reset so the next report covers only the following window
            total_acc, total_count = 0, 0

def evaluate(dataloader):
    """
    Compute the classification accuracy of ``model`` over ``dataloader``.

    Relies on the module-level ``model``; it is switched to eval mode and
    run without gradient tracking.

    Args
    ----
    dataloader: iterable of ``(label, text, offsets)`` batches.

    Returns
    -------
    Fraction of samples whose arg-max prediction equals the label.

    Exceptions
    ----------
    ZeroDivisionError if ``dataloader`` yields no samples.
    """
    model.eval()
    total_acc, total_count = 0, 0

    with torch.no_grad():
        for label, text, offsets in dataloader:
            predicted_label = model(text, offsets)
            total_acc += (predicted_label.argmax(1) == label).sum().item()
            total_count += label.size(0)
    return total_acc/total_count

### 모델 훈련

In [35]:
# Hyperparameters
EPOCHS = 10 # epoch
LR = 5  # learning rate
# NOTE(review): the DataLoaders were already built before this cell, so this
# re-assignment does not change their batch size.
BATCH_SIZE = 64 # batch size for training

criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=LR)
# step_size is documented as an int; multiply the learning rate by 0.1 on
# every scheduler step
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1, gamma=0.1)
# Best validation accuracy seen so far (None until the first epoch finishes)
total_accu = None

print("start training.")
for epoch in range(1, EPOCHS + 1):
    epoch_start_time = time.time()
    train(train_dataloader)
    accu_val = evaluate(valid_dataloader)
    # Decay the learning rate only when validation accuracy dropped below the
    # best value so far; otherwise record the new best
    if total_accu is not None and total_accu > accu_val:
        scheduler.step()
    else:
        total_accu = accu_val
    print('-' * 59)
    print('| end of epoch {:3d} | time: {:5.2f}s | '
          'valid accuracy {:8.3f} '.format(epoch, time.time() - epoch_start_time, accu_val))
    print('-' * 59)

print("training finished.")

start training.
| epoch   1 |   500/  547 batches | accuracy    0.734
-----------------------------------------------------------
| end of epoch   1 | time: 20.45s | valid accuracy    0.804 
-----------------------------------------------------------
| epoch   2 |   500/  547 batches | accuracy    0.805
-----------------------------------------------------------
| end of epoch   2 | time: 19.63s | valid accuracy    0.806 
-----------------------------------------------------------
| epoch   3 |   500/  547 batches | accuracy    0.829
-----------------------------------------------------------
| end of epoch   3 | time: 18.60s | valid accuracy    0.828 
-----------------------------------------------------------
| epoch   4 |   500/  547 batches | accuracy    0.842
-----------------------------------------------------------
| end of epoch   4 | time: 19.11s | valid accuracy    0.864 
-----------------------------------------------------------
| epoch   5 |   500/  547 batches | accuracy

In [36]:
# Take the first (label, text) record of a fresh dataset iterator as a sample
some_data = next(iter(get_data()))

In [49]:
# Encode the sample text as a 1-D tensor of token ids. It is created on
# `device` because the model was moved there; a CPU tensor would fail with a
# device mismatch when CUDA is in use.
some_x = text_pipeline(some_data[1])
some_x = torch.tensor(some_x, dtype=torch.int64, device=device)
model.eval()

with torch.no_grad():
    # The offsets tensor holds a single 0: the whole text is one sample
    result = model(some_x, torch.zeros(size=(1, ), dtype=torch.int64, device=device))
    print(result)

tensor([[ 0.1489, -0.1999]])
