In [1]:
import time
import copy
from collections import OrderedDict

import torch
import torch.nn as nn
from torchtext.datasets import IMDB
from torchtext.vocab import vocab
from torch.utils.data import random_split, TensorDataset, DataLoader
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
# Load the raw IMDB splits and carve a validation set out of the training data.
train_data = IMDB(split='train')
# Materialised like the training split so it can safely be iterated more than once.
test_data = list(IMDB(split='test'))
# Seed the split so the same reviews land in train/validation on every
# Restart & Run All; an unseeded split makes validation scores incomparable.
train_data, valid_data = random_split(
    list(train_data),
    [20_000, 5_000],
    generator=torch.Generator().manual_seed(42),
)



In [3]:
def prepare_data_loader(data, batch_size=64, vectorizer=None):
    """Build a shuffled ``DataLoader`` of bag-of-words features and binary labels.

    Args:
        data: Iterable of ``(label, text)`` pairs. It is consumed in a single
            pass, so one-shot iterators are accepted.
        batch_size: Number of samples per batch.
        vectorizer: Optional, already fitted vectorizer exposing ``transform``.
            When omitted, a fresh ``CountVectorizer`` is fitted on ``data``,
            which means every call gets its own vocabulary. Pass the
            vectorizer fitted on the training texts to give other splits the
            same feature columns.

    Returns:
        ``DataLoader`` over ``TensorDataset(features, labels)`` where
        ``features`` is a float16 ``(num_samples, vocabulary_size)`` matrix of
        term counts and ``labels`` is a float32 vector of 0.0 / 1.0.

    NOTE(review): the features are dense count vectors, not sequences of
    integer token ids. ``nn.Embedding`` (used by ``RNN`` in this notebook)
    only accepts Long/Int indices, so these loaders cannot feed that model
    as-is -- confirm which representation is intended.
    """
    # Negative reviews are labelled 'neg'; newer torchtext releases appear to
    # use the integer 1 instead (TODO confirm against the installed version).
    negative_labels = ('neg', 1)

    # Single pass: walking `data` twice silently yields nothing the second
    # time when `data` is a one-shot iterator.
    targets, texts = [], []
    for label, text in data:
        targets.append(0.0 if label in negative_labels else 1.0)
        texts.append(text)

    if vectorizer is None:
        counts = CountVectorizer().fit_transform(texts)
    else:
        counts = vectorizer.transform(texts)

    features = torch.tensor(counts.toarray(), dtype=torch.float16)
    # float32 so the targets share the dtype of the model outputs that
    # nn.BCELoss compares them against.
    labels = torch.tensor(targets, dtype=torch.float32)

    # Honour the caller's batch size (it used to be hard-coded to 64).
    return DataLoader(TensorDataset(features, labels),
                      batch_size=batch_size, shuffle=True)

In [4]:
# One loader per split, built independently with the default batch size.
train_dl = prepare_data_loader(data=train_data)
valid_dl = prepare_data_loader(data=valid_data)
test_dl = prepare_data_loader(data=test_data)

In [5]:
class RNN(nn.Module):
    """LSTM-based binary classifier.

    Data flows through an embedding lookup, a single batch-first LSTM and a
    two-layer fully connected head (LeakyReLU in between) that ends in a
    sigmoid, producing one value per sequence.

    Args:
        vocab_size: Number of rows in the embedding table.
        embed_dim: Size of each embedding vector.
        rnn_hidden_size: Hidden size of the LSTM.
        fc_hidden_size: Width of the hidden fully connected layer.
    """

    def __init__(self, vocab_size, embed_dim, rnn_hidden_size, fc_hidden_size):
        super().__init__()
        # Index 0 is declared as the padding index of the embedding table.
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.rnn = nn.LSTM(embed_dim, rnn_hidden_size, batch_first=True)
        # Classification head: hidden state -> hidden layer -> single sigmoid unit.
        self.fc = nn.Sequential(
            nn.Linear(rnn_hidden_size, fc_hidden_size),
            nn.LeakyReLU(),
            nn.Linear(fc_hidden_size, 1),
            nn.Sigmoid(),
        )

    def forward(self, x):
        """Return the head's output for the final hidden state of the last LSTM layer."""
        embedded = self.embedding(x)
        # Only the final hidden state is needed; per-step outputs and the
        # cell state are discarded.
        _, (final_hidden, _) = self.rnn(embedded)
        last_layer_state = final_hidden[-1]
        return self.fc(last_layer_state)

In [6]:
def train_model(model, criterion, optimizer, data_loader, num_epochs, epoch_print=1):
    """Train ``model`` and return it with its best validation weights loaded.

    Every epoch runs a training phase followed by a validation phase. The
    weights from the epoch with the highest validation accuracy are kept and
    restored into ``model`` before returning. Progress and a final summary are
    printed to stdout.

    Args:
        model: Module whose output has one score per sample in column 0.
        criterion: Loss called as ``criterion(outputs, labels)``.
        optimizer: Optimizer updating ``model``'s parameters.
        data_loader: Mapping with the keys ``'train'`` and ``'val'``, each an
            iterable of ``(inputs, labels)`` batches exposing ``.dataset``.
        num_epochs: Number of epochs to run.
        epoch_print: Metrics are printed for the first epoch and for every
            epoch that is a multiple of this value.

    Returns:
        The same ``model`` object, loaded with the best validation weights.
    """
    since = time.time()

    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.0

    for epoch in range(1, num_epochs + 1):
        verbose = epoch == 1 or epoch % epoch_print == 0
        if verbose:
            print(f'\nEpoch {epoch}/{num_epochs}')
            print('-' * 10)

        # Each epoch has a training and a validation phase.
        for phase in ['train', 'val']:
            is_train = phase == 'train'
            model.train(is_train)  # train(False) is equivalent to eval()

            running_loss = 0.0
            running_corrects = 0

            for inputs, labels in data_loader[phase]:
                if is_train:
                    # Gradients only exist (and need clearing) while training.
                    optimizer.zero_grad()

                # Track history only in the training phase.
                with torch.set_grad_enabled(is_train):
                    outputs = model(inputs)[:, 0]
                    loss = criterion(outputs, labels)

                    if is_train:
                        loss.backward()
                        optimizer.step()

                # The loss is a batch mean, so weight it by the batch size.
                running_loss += loss.item() * inputs.size(0)
                # A score above 0.5 counts as a positive prediction.
                running_corrects += torch.sum((outputs > 0.5) == labels).item()

            num_samples = len(data_loader[phase].dataset)
            epoch_loss = running_loss / num_samples
            epoch_acc = running_corrects / num_samples

            if verbose:
                print(f'{phase} Loss: {epoch_loss:.4f} Acc: {epoch_acc:.4f}')

            # Snapshot the weights whenever validation accuracy improves.
            if phase == 'val' and epoch_acc > best_acc:
                best_acc = epoch_acc
                best_model_wts = copy.deepcopy(model.state_dict())

    time_elapsed = time.time() - since
    print(f'Training complete in {time_elapsed // 60:.0f}m {time_elapsed % 60:.0f}s')
    # '.4f' (four decimals); the previous '4f' was a width, not a precision.
    print(f'Best val Acc: {best_acc:.4f}')

    # Load the best model weights.
    model.load_state_dict(best_model_wts)
    return model


In [7]:
# Model hyperparameters. The vocabulary size is taken from the second
# dimension of the training feature tensor.
vocab_size = train_dl.dataset.tensors[0].size(1)
embed_dim, rnn_hidden_size, fc_hidden_size = 20, 64, 64

In [8]:
# Instantiate the classifier together with its loss and optimiser.
model = RNN(
    vocab_size=vocab_size,
    embed_dim=embed_dim,
    rnn_hidden_size=rnn_hidden_size,
    fc_hidden_size=fc_hidden_size,
)
criterion = nn.BCELoss()  # binary cross-entropy on the model's sigmoid outputs
optimizer = torch.optim.AdamW(model.parameters())  # library-default settings

In [9]:
# Train for a single epoch; the returned model carries the best validation weights.
model = train_model(
    model,
    criterion,
    optimizer,
    data_loader={'train': train_dl, 'val': valid_dl},
    num_epochs=1,
)


Epoch 1/1
----------


RuntimeError: Expected tensor for argument #1 'indices' to have one of the following scalar types: Long, Int; but got torch.HalfTensor instead (while checking arguments for embedding)

In [None]:
def check_accuracy(loader, model):
    """Print how many samples from ``loader`` the model classifies correctly.

    The model is switched to evaluation mode (and left there) and gradients
    are disabled while iterating. A prediction counts as positive when the
    score in the model's first output column exceeds 0.5.

    Args:
        loader: Iterable yielding ``(inputs, labels)`` batches.
        model: Module returning one score per sample in column 0.
    """
    num_correct = 0
    num_samples = 0
    model.eval()

    with torch.no_grad():
        # Evaluation only: no training happens inside this loop.
        for x, labels in loader:
            outputs = model(x)[:, 0]
            num_correct += torch.sum((outputs > 0.5) == labels).item()
            num_samples += outputs.size(0)

        print(f'Got {num_correct} / {num_samples} with accuracy {float(num_correct)/float(num_samples)*100:.2f}')

In [None]:
# Report accuracy of the trained model on the held-out test split.
check_accuracy(loader=test_dl, model=model)