In [1]:
import torch
from torchtext import data, datasets

# Review text field: tokens are lower-cased, batches come out batch-major,
# and each batch is returned together with the unpadded sequence lengths.
TEXT = data.Field(
    batch_first=True,
    include_lengths=True,
    lower=True,
)
# Label field: a single value per example rather than a token sequence.
LABEL = data.Field(sequential=False)

# IMDB train/test partitions, parsed with the two fields above.
train, test = datasets.IMDB.splits(TEXT, LABEL)


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from torchtext.vocab import GloVe

# Text vocabulary built from the training split only: at most 25,000 tokens
# (max_size), each associated with a 300-dimensional GloVe 6B vector.
TEXT.build_vocab(train, max_size=25000, vectors=GloVe(name='6B', dim=300))
# Label vocabulary; one call is enough, repeating it just rebuilds the same
# vocabulary from the same data.
LABEL.build_vocab(train)

In [3]:
# Batches of 64 examples for both splits; sort_within_batch orders the
# examples inside each batch by length.
train_iterator, test_iterator = data.BucketIterator.splits(
    (train, test),
    batch_size=64,
    sort_within_batch=True,
)

# Peek at a single test batch to inspect the tensor layout, then stop.
for amostra in test_iterator:
    (batch, tamanhos), rotulo = amostra.text, amostra.label

    for info in (batch.shape, tamanhos, rotulo.shape):
        print(info)
    break

torch.Size([64, 30])
tensor([30, 30, 30, 30, 30, 30, 30, 29, 29, 29, 29, 29, 29, 29, 29, 29, 28, 28,
        28, 28, 28, 27, 27, 27, 26, 26, 26, 25, 25, 25, 25, 25, 24, 24, 24, 24,
        24, 24, 24, 23, 23, 23, 23, 22, 22, 21, 21, 20, 20, 19, 19, 19, 18, 18,
        17, 16, 16, 14, 10,  9,  8,  8,  6,  4])
torch.Size([64])


In [24]:
from torch import nn, optim
import torch.nn.functional as F

class RNN(nn.Module):
    """Single-layer GRU classifier on top of pre-initialised embeddings.

    Args:
        vocab_size: Number of rows in the embedding table.
        embed_dim: Size of each embedding vector.
        embed_vectors: Tensor of shape ``(vocab_size, embed_dim)`` copied into
            the embedding table as its initial weights.
        ind_unk: Index of the unknown token; its embedding row is zeroed.
        ind_pad: Index of the padding token; its embedding row is zeroed.
        hidden_size: Size of the GRU hidden state.
        output_size: Number of logits produced per example.
    """

    def __init__(self, vocab_size, embed_dim, embed_vectors, ind_unk, ind_pad, hidden_size, output_size):
        super(RNN, self).__init__()

        self.embedding = nn.Embedding(vocab_size, embed_dim)
        # Initialise the weights without going through autograd (and without
        # the discouraged ``.data`` attribute).
        with torch.no_grad():
            self.embedding.weight.copy_(embed_vectors)
            self.embedding.weight[ind_unk].zero_()
            self.embedding.weight[ind_pad].zero_()

        self.hidden_size = hidden_size
        self.rnn = nn.GRU(embed_dim, hidden_size, batch_first=True)
        self.linear = nn.Linear(hidden_size, output_size)

    def forward(self, X, tamanhos):
        """Classify a batch of padded token-index sequences.

        Args:
            X: Integer tensor of token indices, shape ``(batch, seq_len)``.
            tamanhos: Tensor with the unpadded length of each sequence in ``X``.

        Returns:
            Logits of shape ``(batch, output_size)``.
        """
        embed = self.embedding(X)

        # Zero initial state, matching the embeddings' dtype and device.
        hidden = torch.zeros(1, X.size(0), self.hidden_size, dtype=embed.dtype, device=embed.device)

        # Packing makes the GRU stop at each sequence's true length, so the
        # final hidden state is not polluted by padding positions.
        packed_input = nn.utils.rnn.pack_padded_sequence(embed, tamanhos.cpu(), batch_first=True, enforce_sorted=False)

        # Only the final hidden state is needed; per-step outputs are dropped.
        _, hidden = self.rnn(packed_input, hidden)

        # Drop only the leading layer axis. A bare squeeze() would also drop
        # the batch axis when the batch holds a single example.
        return self.linear(hidden.squeeze(0))
        

# Model hyperparameters, taken from the vocabulary built on TEXT.
embed_vectors = TEXT.vocab.vectors
vocab_size = len(TEXT.vocab)
embed_dim = embed_vectors.shape[1]
ind_unk, ind_pad = (
    TEXT.vocab.stoi[token] for token in (TEXT.unk_token, TEXT.pad_token)
)
hidden_size, output_size = 256, 2

model = RNN(
    vocab_size=vocab_size,
    embed_dim=embed_dim,
    embed_vectors=embed_vectors,
    ind_unk=ind_unk,
    ind_pad=ind_pad,
    hidden_size=hidden_size,
    output_size=output_size,
)
# Show the architecture, then the values it was built from.
print(model)
print(vocab_size, embed_dim, embed_vectors, ind_unk, ind_pad, hidden_size)

RNN(
  (embedding): Embedding(25002, 300)
  (rnn): GRU(300, 256, batch_first=True)
  (linear): Linear(in_features=256, out_features=2, bias=True)
)
25002 300 tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0466,  0.2132, -0.0074,  ...,  0.0091, -0.2099,  0.0539],
        ...,
        [ 0.1110, -0.4602,  0.4668,  ...,  0.5124,  0.0567, -0.1228],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.2599,  0.8047, -0.6322,  ..., -0.5477, -0.3557, -0.0078]]) 0 1 256


In [25]:
# Smoke test: push one test batch through the untrained model, then stop.
for sample in test_iterator:
    (texto, tamanhos), label = sample.text, sample.label
    saida = model(texto, tamanhos)
    break

In [26]:
# Cross-entropy over the model's logits, optimised with Adam plus weight decay.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(
    params=model.parameters(),
    lr=1e-4,
    weight_decay=5e-5,
)

In [27]:
import numpy as np

def forward(iterator, num_samples, etapa, epoca):
    """Run one full pass over ``iterator`` and report loss and accuracy.

    Uses the notebook-level ``model``, ``criterion`` and ``optimizer``.

    Args:
        iterator: Iterable of batches exposing ``.text`` as a
            ``(tokens, lengths)`` pair and ``.label``.
        num_samples: Number of examples covered by ``iterator``; used as the
            denominator of the accuracy.
        etapa: ``'Treino'`` puts the model in training mode and applies
            optimizer updates; any other value runs in evaluation mode
            without updates.
        epoca: Epoch index, used only in the printed summary.

    Returns:
        Tuple ``(mean_loss, accuracy)`` of Python floats: the mean of the
        per-batch losses over this pass and the fraction of correct
        predictions.
    """
    treinando = etapa == 'Treino'
    if treinando:
        model.train()
    else:
        model.eval()

    acertos = 0
    loss_epoca = []

    # Gradients are only needed while training; skipping them during
    # evaluation avoids building autograd graphs for nothing.
    with torch.set_grad_enabled(treinando):
        for sample in iterator:
            texto, tamanhos = sample.text
            # Shift labels down by one so they index the logits from 0.
            # NOTE(review): presumably label index 0 is reserved (e.g. <unk>)
            # by the label vocabulary - confirm against how LABEL is built.
            rotulo = sample.label - 1

            saida = model(texto, tamanhos.cpu())

            loss = criterion(saida, rotulo)
            loss_epoca.append(loss.item())

            pred = saida.argmax(dim=-1)
            acertos += (pred == rotulo).sum().item()

            if treinando:
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

    loss_epoca = np.array(loss_epoca)
    acuracia = acertos / float(num_samples)
    print('\n', '*'*15 + etapa + '*'*15)
    # Report statistics over every batch of the pass, not just the last one.
    print('Epoca: {:}, Loss: {:.2f} +- {:.2f}, Acc: {:.2f}'.format(epoca, loss_epoca.mean(), loss_epoca.std(), acuracia*100))

    return float(loss_epoca.mean()), float(acuracia)

In [None]:
# Per-epoch history of loss and accuracy for each phase.
loss_treino, acc_treino = [], []
loss_teste, acc_teste = [], []

# Each epoch runs a training pass followed by a pass over the test split.
etapas = (
    ('Treino', train_iterator, len(train), loss_treino, acc_treino),
    ('Teste', test_iterator, len(test), loss_teste, acc_teste),
)

for epoca in range(25):
    for etapa, iterador, total, hist_loss, hist_acc in etapas:
        loss, acc = forward(iterador, total, etapa, epoca)
        hist_loss.append(loss)
        hist_acc.append(acc)