In [0]:
import torch
from torchtext import data

In [0]:
# Review text field. No tokenizer is passed, so the library default is used
# (plain string.split, per the original note); sequence lengths are not requested.
TEXT = data.Field(include_lengths = False) # tokenize -> default: string.split
# Sentiment label field, typed as float because the labels are later fed to
# nn.BCEWithLogitsLoss together with the model's logits.
LABEL = data.LabelField(dtype = torch.float)

In [21]:
# Mount Google Drive into the Colab runtime; the dataset files loaded later
# are read from paths under /content/gdrive.
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [0]:
# Both rating files share one column layout: the first column ('id') is
# discarded (None), the second is parsed by TEXT and the third by LABEL.
nsmc_fields = [('id', None), ('text', TEXT), ('label', LABEL)]

train_data = data.TabularDataset(
    path='/content/gdrive/My Drive/Colab Notebooks/pytorch/nsmc/ratings_train.txt',
    format='tsv',
    fields=nsmc_fields,
)
test_data = data.TabularDataset(
    path='/content/gdrive/My Drive/Colab Notebooks/pytorch/nsmc/ratings_test.txt',
    format='tsv',
    fields=nsmc_fields,
)

In [6]:
# Show the first parsed example of each loaded dataset as a sanity check.
for loaded_split in (train_data, test_data):
    print(vars(loaded_split[0]))

{'text': ['아', '더빙..', '진짜', '짜증나네요', '목소리'], 'label': '0'}
{'text': ['굳', 'ㅋ'], 'label': '1'}


In [0]:
import random

# Make the train/validation split reproducible.
# `random.seed(0)` returns None, so passing its result as `random_state`
# (as this cell previously did) handed `split` no state at all. Seed the
# generator first, then pass the resulting generator state explicitly.
random.seed(0)

# NOTE: this rebinds `train_data` to the training portion of the split, so
# re-running the cell without reloading the data splits the already-split set.
train_data, valid_data = train_data.split(random_state = random.getstate())

In [0]:
# Upper bound handed to build_vocab as max_size for the text vocabulary.
MAX_VOCAB_SIZE = 25000

# Vocabularies are built from the training split only; the validation and
# test splits are not consulted.
TEXT.build_vocab(train_data, max_size = MAX_VOCAB_SIZE)

LABEL.build_vocab(train_data)

In [0]:
BATCH_SIZE = 64

# Run on the GPU when torch can see one, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# One iterator per split, unpacked in the same order as the dataset tuple.
# NOTE(review): no sort_key is supplied and sort=False, so batches are
# presumably not grouped by length — confirm this is intended.
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size=BATCH_SIZE,
    device=device,
    sort=False,
)

In [0]:
import torch.nn as nn

In [0]:
class RNN(nn.Module):
    """Embedding -> single-layer RNN -> linear head.

    Args:
        input_dim: number of rows in the embedding table.
        embedding_dim: width of each embedding vector (RNN input size).
        hidden_dim: width of the RNN hidden state.
        output_dim: number of values produced per sequence by the linear head.
    """

    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim):
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=input_dim, embedding_dim=embedding_dim)
        self.rnn = nn.RNN(input_size=embedding_dim, hidden_size=hidden_dim)
        self.fc = nn.Linear(in_features=hidden_dim, out_features=output_dim)

    def forward(self, text):
        """Map a tensor of token indices to one output row per sequence.

        The RNN is built without batch_first, so `text` is indexed as
        (time step, batch); the result has shape (batch, output_dim).
        """
        token_vectors = self.embedding(text)
        all_steps, final_state = self.rnn(token_vectors)
        last_hidden = final_state.squeeze(0)

        # Sanity check: the final hidden state must equal the output at the
        # last time step.
        # NOTE(review): if `text` is padded, that last step may be a padding
        # token rather than the real end of the sequence — confirm.
        assert torch.equal(all_steps[-1, :, :], last_hidden)
        return self.fc(last_hidden)

In [0]:
INPUT_DIM = len(TEXT.vocab)  # embedding table rows: one per TEXT vocabulary entry
EMBEDDING_DIM = 100  # width of each token embedding
HIDDEN_DIM = 256  # width of the RNN hidden state
OUTPUT_DIM = 1  # one logit per example, consumed by BCEWithLogitsLoss

In [0]:
# Build the network from the hyperparameter constants and place it on `device`.
model = RNN(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM).to(device)

In [0]:
import torch.optim as optim

In [0]:
# Adam with its default settings over every model parameter.
optimizer = optim.Adam(model.parameters())
# Binary cross-entropy computed on raw logits, placed on the same device
# the model was moved to.
criterion = nn.BCEWithLogitsLoss().to(device)

In [0]:
def binary_accuracy(preds, y):
    """Return the fraction of predictions that match the targets.

    Args:
        preds: raw logits; they are squashed with a sigmoid and rounded
            to 0/1 before comparison.
        y: target values compared element-wise against the rounded predictions.

    Returns:
        A tensor holding (number of matches) / (length of the first dimension).
    """
    probabilities = torch.sigmoid(preds)
    hits = torch.eq(probabilities.round(), y).float()
    return hits.sum() / len(hits)

In [0]:
def train(model, iterator, optimizer, criterion):
    """Run one optimisation pass over `iterator`.

    Each batch must expose `.text` (model input) and `.label` (targets).

    Args:
        model: module to optimise; switched to training mode first.
        iterator: sized iterable of batches.
        optimizer: optimizer stepping the model's parameters.
        criterion: loss taking (predictions, labels).

    Returns:
        (mean loss per batch, mean accuracy per batch).
    """
    loss_total, acc_total = 0, 0

    model.train()

    for batch in iterator:
        optimizer.zero_grad()

        # The model returns one column per example; drop that column axis
        # so predictions line up with the label vector.
        logits = model(batch.text).squeeze(1)
        targets = batch.label

        batch_loss = criterion(logits, targets)
        batch_acc = binary_accuracy(logits, targets)

        batch_loss.backward()
        optimizer.step()

        loss_total += batch_loss.item()
        acc_total += batch_acc.item()

    n_batches = len(iterator)
    return loss_total / n_batches, acc_total / n_batches

In [0]:
def evaluate(model, iterator, criterion):
    """Compute mean loss and accuracy over `iterator` without updating the model.

    Each batch must expose `.text` (model input) and `.label` (targets).

    Args:
        model: module to evaluate; it is switched to evaluation mode and
            left in that mode on return.
        iterator: sized iterable of batches.
        criterion: loss taking (predictions, labels).

    Returns:
        (mean loss per batch, mean accuracy per batch).
    """
    epoch_loss = 0
    epoch_acc = 0

    # Evaluation mode, not training mode: layers such as dropout or batch
    # norm must use their inference behaviour while measuring metrics.
    model.eval()

    # No gradients are needed for evaluation.
    with torch.no_grad():

        for batch in iterator:

            predictions = model(batch.text).squeeze(1)
            loss = criterion(predictions, batch.label)
            acc = binary_accuracy(predictions, batch.label)
            epoch_loss += loss.item()
            epoch_acc += acc.item()

    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [20]:
# Train for five passes over the training iterator, reporting the mean
# batch loss and accuracy after each pass.
for epoch in range(5):
    epoch_metrics = train(model, train_iterator, optimizer, criterion)
    train_loss, train_acc = epoch_metrics
    print(f'{epoch+1} : {train_loss}, {train_acc*100:.2f}%')

1 : 0.69652831641586, 50.37%
2 : 0.6950905021554736, 50.29%
3 : 0.6942867297431741, 50.45%
4 : 0.6943764006997085, 50.74%
5 : 0.6929369126421007, 50.68%
