In [0]:
import torch
from torchtext import data

In [0]:
# Field for the review text and field for the label.
# LabelField is created with dtype torch.float so the labels can be fed
# directly to a float-valued loss.
TEXT = data.Field(include_lengths = False) # tokenize is left at its default (string.split, i.e. whitespace tokens)
LABEL = data.LabelField(dtype = torch.float)

In [3]:
# Mount Google Drive inside the Colab runtime; the dataset paths used by
# later cells live under /content/gdrive.
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [0]:
# Directory on the mounted drive that holds the NSMC rating files.
NSMC_DIR = '/content/gdrive/My Drive/Colab Notebooks/pytorch/nsmc'


def load_nsmc(file_name):
    """Read one tab-separated ratings file from NSMC_DIR into a TabularDataset.

    The header row is skipped, the 'id' column is dropped (its field is None),
    and the 'text' / 'label' columns are processed by TEXT / LABEL.
    """
    return data.TabularDataset(
        path = NSMC_DIR + '/' + file_name,
        format = 'tsv',
        fields = [('id', None), ('text', TEXT), ('label', LABEL)],
        skip_header = True,
    )


train_data = load_nsmc('ratings_train.txt')
test_data = load_nsmc('ratings_test.txt')

In [5]:
# Show the first parsed example of each split as a plain dict.
for dataset in (train_data, test_data):
    print(vars(dataset[0]))

{'text': ['아', '더빙..', '진짜', '짜증나네요', '목소리'], 'label': '0'}
{'text': ['굳', 'ㅋ'], 'label': '1'}


In [0]:
import random

# random.seed() returns None, so passing its return value as random_state
# gives split() no explicit state at all. Seed the global RNG first and then
# pass the resulting generator state explicitly, so the split is visibly
# reproducible.
random.seed(0)

# NOTE: this rebinds train_data to the training portion of the split, so
# re-running this cell on its own splits the already-reduced set again.
# Re-run the dataset loading cell first if this cell needs to be repeated.
train_data, valid_data = train_data.split(random_state = random.getstate())

In [7]:
# Number of examples in the training and validation splits.
for dataset in (train_data, valid_data):
    print(len(dataset))

105000
45000


In [0]:
# Upper bound on the number of distinct tokens kept in the text vocabulary.
MAX_VOCAB_SIZE = 50000

# Build the text vocabulary from the training split only and attach
# pretrained 200-dimensional vectors; tokens without a pretrained vector are
# initialised with torch.Tensor.normal_.
# NOTE(review): the sample rows printed from this dataset are Korean, while
# "glove.6B" is presumably an English-trained vector set -- confirm how many
# vocabulary entries actually receive a pretrained vector.
TEXT.build_vocab(
    train_data,
    max_size = MAX_VOCAB_SIZE,
    vectors = "glove.6B.200d",
    unk_init = torch.Tensor.normal_,
)

# The label vocabulary is likewise derived from the training split.
LABEL.build_vocab(train_data)

In [0]:
BATCH_SIZE = 64

# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# One iterator per split, all sharing the batch size and target device.
# Sorting is switched off; no sort_key is supplied in this call.
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size = BATCH_SIZE,
    device = device,
    sort = False,
)

In [0]:
import torch.nn as nn
import torch.nn.functional as F

In [0]:
class RNN(nn.Module):
    """Recurrent classifier: embedding -> nn.RNN -> linear head.

    Args:
        input_dim: number of rows in the embedding table (vocabulary size).
        embedding_dim: size of each token embedding.
        hidden_dim: size of the recurrent hidden state.
        output_dim: number of values produced per example.
    """

    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, embedding_dim)
        self.rnn = nn.RNN(embedding_dim, hidden_dim)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, text):
        """Return the linear head applied to the RNN's final hidden state.

        `text` holds token indices; nn.RNN is built with its default layout
        (batch_first is not set), so the first axis is the time axis.
        """
        token_vectors = self.embedding(text)
        all_steps, final_state = self.rnn(token_vectors)
        final_state = final_state.squeeze(0)

        # Sanity check: the last time step of the output sequence must be the
        # very same tensor values as the returned final hidden state.
        assert torch.equal(all_steps[-1, :, :], final_state)
        return self.fc(final_state)

In [0]:
class CNN(nn.Module):
    """Convolutional text classifier with several parallel filter widths.

    Token indices are embedded, passed through one Conv1d per entry of
    `filter_sizes`, max-pooled over the sequence axis, concatenated,
    regularised with dropout and projected by a linear layer.

    Args:
        input_dim: number of rows in the embedding table (vocabulary size).
        embedding_dim: size of each token embedding (Conv1d input channels).
        n_filters: output channels of every convolution.
        filter_sizes: sized sequence of kernel widths, one convolution each.
        output_dim: number of values produced per example.
        dropout: dropout probability applied to the pooled features.
        pad_idx: index of the padding token in the embedding table.
    """

    def __init__(self, input_dim, embedding_dim, n_filters, filter_sizes,
                 output_dim, dropout, pad_idx):

        super().__init__()

        self.embedding = nn.Embedding(input_dim, embedding_dim, padding_idx = pad_idx)
        self.convs = nn.ModuleList([
                                    nn.Conv1d(in_channels = embedding_dim,
                                              out_channels = n_filters,
                                              kernel_size = fs)
                                    for fs in filter_sizes
                                    ])
        self.fc = nn.Linear(len(filter_sizes) * n_filters, output_dim)
        self.dropout = nn.Dropout(dropout)

        # Conv1d raises when the sequence is shorter than its kernel, so the
        # widest kernel defines the minimum sequence length forward() needs.
        self.min_seq_len = max(filter_sizes, default = 0)

    def forward(self, text):
        """Return one row of `output_dim` values per example in the batch.

        `text` is a 2-D tensor of token indices laid out as
        (sequence length, batch); it is transposed to (batch, sequence
        length) before embedding.
        """
        text = text.permute(1, 0)

        # Right-pad batches whose sequences are all shorter than the widest
        # kernel; without this such a batch would crash inside Conv1d.
        missing = self.min_seq_len - text.shape[1]
        if missing > 0:
            pad_value = self.embedding.padding_idx
            if pad_value is None:
                # No padding index configured: fall back to index 0.
                pad_value = 0
            filler = text.new_full((text.shape[0], missing), pad_value)
            text = torch.cat([text, filler], dim = 1)

        embedded = self.embedding(text)
        # Conv1d expects channels (the embedding axis) before the length axis.
        embedded = embedded.permute(0, 2, 1)
        conved = [F.relu(conv(embedded)) for conv in self.convs]
        # Max over the whole remaining length axis, then drop that axis.
        pooled = [F.max_pool1d(conv, conv.shape[2]).squeeze(2) for conv in conved]
        cat = self.dropout(torch.cat(pooled, dim = 1))
        return self.fc(cat)

In [0]:
# --- values derived from the text vocabulary ---
INPUT_DIM = len(TEXT.vocab)                 # one embedding row per vocabulary entry
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]   # index of the padding token

# --- model sizes ---
EMBEDDING_DIM = 200
HIDDEN_DIM = 256
OUTPUT_DIM = 1

# --- CNN-specific settings ---
N_FILTERS = 100
FILTER_SIZES = [3,4,5]
DROPOUT = 0.5

In [0]:
# Build the CNN classifier from the hyperparameters above and move it to the
# selected device in one step.
model = CNN(
    input_dim = INPUT_DIM,
    embedding_dim = EMBEDDING_DIM,
    n_filters = N_FILTERS,
    filter_sizes = FILTER_SIZES,
    output_dim = OUTPUT_DIM,
    dropout = DROPOUT,
    pad_idx = PAD_IDX,
).to(device)

In [0]:
def count_parameters(model):
    """Return the total number of scalar values in the model's trainable parameters.

    Parameters whose `requires_grad` flag is False are not counted.
    """
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total

In [16]:
# Overwrite the freshly initialised embedding table with the vectors attached
# to the text vocabulary. copy_ works in place and returns the destination
# tensor, which is what this cell displays.
pretrained_embeddings = TEXT.vocab.vectors
embedding_weights = model.embedding.weight.data
embedding_weights.copy_(pretrained_embeddings)

tensor([[-2.0504,  0.1857, -0.3798,  ...,  1.1637, -0.0191, -0.3997],
        [ 0.5016,  0.4119,  0.7296,  ..., -0.9372,  1.2298, -0.2130],
        [-0.8778,  0.6478, -0.3435,  ..., -2.3311,  0.3443,  1.9747],
        ...,
        [-0.6338,  1.3917, -0.3653,  ..., -0.2321, -1.0468, -0.8476],
        [ 1.4290, -0.3819, -0.4043,  ...,  2.0446,  0.0976,  0.6396],
        [-0.8930,  1.5280,  0.6618,  ...,  0.4135,  0.7469, -0.0852]],
       device='cuda:0')

In [0]:
UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]

# Reset the embedding rows of the unknown and padding tokens to all zeros.
for special_idx in (UNK_IDX, PAD_IDX):
    model.embedding.weight.data[special_idx] = torch.zeros(EMBEDDING_DIM)

In [0]:
import torch.optim as optim

In [0]:
# Adam with its default settings over every model parameter.
optimizer = optim.Adam(model.parameters())

# Binary cross-entropy computed on raw logits, placed on the same device as
# the model.
criterion = nn.BCEWithLogitsLoss().to(device)

In [0]:
def binary_accuracy(preds, y):
    """Return the fraction of predictions that match the targets, as a tensor.

    `preds` are raw logits: they are squashed with a sigmoid and rounded to
    0/1 before being compared element-wise with `y`. The count of matches is
    divided by the length of the first axis.
    """
    hits = torch.round(torch.sigmoid(preds)).eq(y).float()
    return hits.sum() / len(hits)

In [0]:
def train(model, iterator, optimizer, criterion):
    """Run one optimisation pass over `iterator` and report averaged metrics.

    For every batch the gradients are cleared, the model is run on
    `batch.text`, the loss against `batch.label` is back-propagated and the
    optimizer takes a step.

    Returns:
        (mean loss, mean accuracy), where each mean is the sum of the
        per-batch values divided by the number of batches, so every batch
        carries the same weight.
    """
    model.train()

    running_loss, running_acc = 0, 0

    for batch in iterator:
        optimizer.zero_grad()

        logits = model(batch.text).squeeze(1)
        loss = criterion(logits, batch.label)
        acc = binary_accuracy(logits, batch.label)

        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        running_acc += acc.item()

    n_batches = len(iterator)
    return running_loss / n_batches, running_acc / n_batches

In [0]:
def evaluate(model, iterator, criterion):
    """Score the model on `iterator` without updating any parameters.

    The model is put in eval mode and all batches are processed under
    torch.no_grad().

    Returns:
        (mean loss, mean accuracy), where each mean is the sum of the
        per-batch values divided by the number of batches, so every batch
        carries the same weight.
    """
    model.eval()

    running_loss, running_acc = 0, 0

    with torch.no_grad():
        for batch in iterator:
            logits = model(batch.text).squeeze(1)
            running_loss += criterion(logits, batch.label).item()
            running_acc += binary_accuracy(logits, batch.label).item()

    n_batches = len(iterator)
    return running_loss / n_batches, running_acc / n_batches

In [23]:
# Number of full passes over the training iterator.
N_EPOCHS = 5

for epoch in range(N_EPOCHS):
    # One optimisation pass, then a gradient-free pass over the validation split.
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)

    print(f'{epoch+1} : train {train_loss}, {train_acc*100:.2f}% | valid {valid_loss}, {valid_acc*100:.2f}%')

1 : train 0.5774281379981857, 68.16% | valid 0.4956072386438874, 74.48%
2 : train 0.43163621460134705, 78.83% | valid 0.45745061606761406, 76.91%
3 : train 0.34606267078703024, 83.58% | valid 0.46329613626849925, 77.73%
4 : train 0.28673413360921585, 86.47% | valid 0.48848702417771245, 77.70%
5 : train 0.2424088269653733, 88.45% | valid 0.5518882861572572, 77.46%
