In [4]:
import torch
import numpy as np
from torch import nn
import pandas as pd
import transformers
from torch.utils.data import Dataset, DataLoader, random_split
import torch.nn.functional as F
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

import pdb

In [5]:
# Run on the GPU when PyTorch can see one; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [None]:
# Echo the selected device string so it is visible in the notebook output.
device

In [None]:
# Load the review table from a CSV in the working directory and preview the
# first rows. ReviewDataset later reads its 'review' and 'class' columns.
dataset = pd.read_csv('TestReviews.csv')
dataset.head()

In [None]:
# Load the pretrained "bert-base-uncased" tokenizer and encoder.
# NOTE(review): a later cell rebinds `model` to the LSTM classifier, so this cell
# has to be re-run before ReviewDataset is constructed again.
tokenizer = transformers.AutoTokenizer.from_pretrained("bert-base-uncased")
model = transformers.AutoModel.from_pretrained("bert-base-uncased")  # NOTE(review): an earlier note here said max_length=1024, but ReviewDataset truncates at 512 tokens — confirm the intended limit

In [None]:
# Smoke test: encode two sentences of different length as one padded batch and
# display the shape of the encoder's last hidden state.
sentence = [
    "I love dragons",
    "I hate snakes (why?) ok ok ok",
]
model(**tokenizer(sentence, padding=True, return_tensors='pt'))['last_hidden_state'].shape

In [9]:
class ReviewDataset(Dataset):
    """Dataset yielding one encoder embedding and its label per review.

    Args:
        dataset: table indexable by column name; the 'review' column holds the
            texts and the 'class' column holds the labels.
        tokenizer: callable that turns a review string into the keyword
            arguments expected by ``model``.
        model: encoder whose output exposes a 'last_hidden_state' entry.
    """

    def __init__(self, dataset, tokenizer, model):
        self.reviews = dataset['review'].values
        self.labels = dataset['class'].values
        self.tokenizer = tokenizer
        self.model = model

    def __len__(self):
        """Return the number of reviews."""
        return len(self.reviews)

    def __getitem__(self, idx):
        """Return ``(embedding, label)`` for the review at position ``idx``.

        The embedding is the encoder's 'last_hidden_state' for that single
        review, truncated to at most 512 tokens, and carries no autograd graph.
        """
        review = self.reviews[idx]
        label = self.labels[idx]
        inputs = self.tokenizer(review, return_tensors='pt', padding=True, truncation=True, max_length=512)
        # The encoder is used purely as a feature extractor. Without no_grad()
        # every item would keep the encoder's autograd graph alive and the
        # downstream backward() would needlessly propagate into the encoder.
        with torch.no_grad():
            embedding = self.model(**inputs)['last_hidden_state']
        return embedding, label


In [None]:
# Build the embedding dataset and split it 80/20 into train/validation subsets.
# NOTE: `model` must still be the BERT encoder here; a later cell rebinds that
# name to the LSTM classifier.
review_dataset = ReviewDataset(dataset, tokenizer, model)
# A seeded generator keeps the split identical across kernel restarts.
train_set, valid_set = random_split(
    review_dataset, [0.8, 0.2], generator=torch.Generator().manual_seed(42)
)
# Fetch the first item once (every access runs the encoder), then show the
# embedding shape and the label.
first_embedding, first_label = train_set[0]
first_embedding.shape, first_label

In [11]:
def collate_fn(batch):
    """Pad a list of ``(embedding, label)`` items into a single batch.

    Every embedding is zero-padded along dimension 1 up to the largest size in
    the batch; the padded embeddings are concatenated along dimension 0 and
    moved to ``device``.

    Returns:
        ``(padded_embeddings, sizes, targets)``: the padded batch, a tensor
        holding each item's original size along dimension 1, and the labels as
        a float32 tensor reshaped to one column.
    """
    embeddings, targets = zip(*batch)
    sizes = torch.tensor([emb.size(1) for emb in embeddings])
    longest = sizes.max()

    padded = []
    for emb in embeddings:
        # F.pad lists pad widths from the last dimension backwards: (0, 0)
        # leaves the last dimension alone, (0, missing) extends dimension 1.
        missing = longest - emb.size(1)
        padded.append(F.pad(emb, (0, 0, 0, missing), 'constant', 0))

    padded_embeddings = torch.cat(padded).to(device)
    targets = torch.tensor(targets).reshape(-1, 1).to(torch.float32)
    return padded_embeddings, sizes, targets


# Mini-batch size shared by both loaders.
batch_size = 16

# Only the training loader shuffles; validation keeps a fixed order.
train_loader = DataLoader(
    train_set,
    batch_size=batch_size,
    shuffle=True,
    collate_fn=collate_fn,
)
valid_loader = DataLoader(
    valid_set,
    batch_size=batch_size,
    shuffle=False,
    collate_fn=collate_fn,
)

In [None]:
# Pull one batch from the training loader and inspect the padded embedding
# shape, the per-item sizes, and the targets that collate_fn returns.
sample = next(iter(train_loader))
sample[0].shape, sample[1], sample[2]

In [13]:
class Net(torch.nn.Module):
    """Three-layer bidirectional LSTM followed by a linear head with sigmoid.

    Args:
        n_dim: size of the last dimension of the input sequences.

    NOTE(review): the two Linear layers have no activation between them, so
    the head is equivalent to a single linear map; left as-is to keep the
    parameter layout unchanged.
    """

    def __init__(self, n_dim):
        super().__init__()
        # batch_first=True matches the batch-first tensors packed in forward().
        self.rnn1 = nn.LSTM(n_dim, 128, batch_first=True, num_layers=3, bidirectional=True)
        self.layers = nn.Sequential(
            nn.Linear(256, 128),
            nn.Linear(128, 1),
            nn.Sigmoid()
        )

    def forward(self, x, input_lengths):
        """Return one sigmoid score per sequence, shape ``(batch, 1)``.

        Args:
            x: zero-padded, batch-first input of shape (batch, max_len, n_dim).
            input_lengths: true length of every sequence in ``x``
                (``pack_padded_sequence`` expects these on the CPU).
        """
        packed_inputs = pack_padded_sequence(x, input_lengths, batch_first=True, enforce_sorted=False)
        _, (h_n, _) = self.rnn1(packed_inputs)
        # Indexing the padded output at [:, -1] would read padding (zeros) for
        # every sequence shorter than the longest one in the batch. The final
        # hidden states are taken at each sequence's true end instead:
        # h_n[-2] is the last layer's forward direction, h_n[-1] its backward one.
        last_state = torch.cat((h_n[-2], h_n[-1]), dim=1)
        return self.layers(last_state)

In [14]:
# Instantiate the classifier for 768-dimensional inputs and place it on `device`.
# NOTE(review): this rebinds `model`, which earlier cells use for the BERT
# encoder; re-run the encoder-loading cell before rebuilding ReviewDataset.
model = Net(n_dim=768).to(device)

In [None]:
# Smoke test: push one training batch through the classifier and show its output.
padded_embeddings, sizes, targets = next(iter(train_loader))
model(padded_embeddings, sizes)

In [18]:
# Helpers for training and evaluating a model.
# Tensors may need to be converted to float32 before they are passed to the
# criterion, and moved to the right device with .to(device).

import matplotlib.pyplot as plt
from IPython.display import clear_output
import copy

def compute_error(model, data_loader, criterion, c_sum=False):
    """Average a per-batch criterion over every example in ``data_loader``.

    Args:
        model: module called as ``model(x, sizes)``. It is switched to eval
            mode and left in eval mode.
        data_loader: iterable of ``(x, sizes, y)`` batches.
        criterion: callable ``criterion(outputs, y)``.
        c_sum: set to True when ``criterion`` already returns a sum over the
            batch; when False its result is treated as a batch mean and
            re-weighted by the batch size.

    Returns:
        The accumulated criterion divided by the number of examples (a tensor
        when ``criterion`` returns tensors).

    Raises:
        ValueError: if ``data_loader`` yields no examples.
    """
    model.eval()
    # Send batches to wherever the model lives instead of relying on a global;
    # a parameter-free model leaves the tensors where they are.
    first_param = next(model.parameters(), None)
    total, num_of_el = 0, 0
    with torch.no_grad():
        for x, sizes, y in data_loader:
            if first_param is not None:
                x = x.to(first_param.device)
                y = y.to(first_param.device)
            # `sizes` is deliberately not moved; it is passed to the model unchanged.
            outputs = model(x, sizes)
            loss = criterion(outputs, y)
            if not c_sum:
                loss = loss * len(y)
            total = total + loss
            num_of_el += len(y)
    if num_of_el == 0:
        raise ValueError("compute_error received a data_loader with no examples")
    return total / num_of_el


def train_model(model: nn.Module,
              train_loader: DataLoader,
              valid_loader: DataLoader,
              num_epochs: int,
              optimizer: torch.optim.Optimizer,
              criterion,
              verbose: bool = True,
              verbose_plot: bool = False
              ) -> float:
    """Train ``model`` and restore the weights with the best validation loss.

    Args:
        model: module called as ``model(inputs, sizes)``.
        train_loader: yields ``(inputs, sizes, targets)`` training batches.
        valid_loader: yields validation batches in the same format.
        num_epochs: number of passes over ``train_loader``.
        optimizer: optimizer already bound to ``model``'s parameters.
        criterion: callable ``criterion(outputs, targets)`` returning a
            batch-mean loss.
        verbose: print the loss of every mini-batch and a per-epoch summary.
        verbose_plot: additionally evaluate the training loss after every
            epoch and plot both loss curves at the end.

    Returns:
        The best validation loss as a Python float (``inf`` when no epoch ran
        or no epoch produced a comparable loss).
    """
    # Send batches to wherever the model lives; a parameter-free model leaves
    # the tensors where they are.
    first_param = next(model.parameters(), None)

    best_epoch = None
    best_state = None
    best_val_loss = np.inf
    train_losses, valid_losses = [], []

    for epoch in range(num_epochs):
        # compute_error() leaves the model in eval mode, so re-enable training
        # mode at the start of every epoch.
        model.train()
        for step, (inputs, sizes, targets) in enumerate(train_loader, start=1):
            if first_param is not None:
                inputs = inputs.to(first_param.device)
                targets = targets.to(first_param.device)

            optimizer.zero_grad()
            outputs = model(inputs, sizes)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()

            if verbose:
                print(f"Minibatch {step:>6}    |  loss {loss.item():>5.2f}  |")

        # Converted to a plain float so comparison, formatting, plotting and
        # the return value all match the annotated return type.
        val_loss = float(compute_error(model, valid_loader, criterion))

        if val_loss < best_val_loss:
            best_epoch = epoch
            best_val_loss = val_loss
            # Snapshot the full state dict (parameters and buffers) on the CPU;
            # clone() guarantees a copy even when the tensors already live there.
            best_state = {name: tensor.detach().cpu().clone()
                          for name, tensor in model.state_dict().items()}

        if verbose:
            clear_output(True)
            m = f"After epoch {epoch:>2} | valid loss: {val_loss:>5.2f}"
            print("{0}\n{1}\n{0}".format("-" * len(m), m))

        if verbose_plot:
            train_losses.append(float(compute_error(model, train_loader, criterion)))
            valid_losses.append(val_loss)

    if best_state is not None:
        if verbose:
            print(f"\nLoading best params on validation set in epoch {best_epoch} with loss {best_val_loss:.2f}")
        # load_state_dict copies into the existing tensors, so the optimizer's
        # references to the parameters stay valid.
        model.load_state_dict(best_state)

    if verbose_plot:
        plt.figure(figsize=(6, 3))
        plt.plot(train_losses, c='b', label='train')
        plt.plot(valid_losses, c='r', label='valid')
        plt.xlabel('epoch')
        plt.ylabel('loss')
        plt.grid(ls=':')
        plt.legend()
        plt.show()

    return best_val_loss

In [None]:
# Binary cross-entropy on the sigmoid outputs, optimised with Adam.
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Train for a single epoch with per-batch logging and the loss plot enabled.
train_model(
    model,
    train_loader,
    valid_loader,
    num_epochs=1,
    optimizer=optimizer,
    criterion=criterion,
    verbose=True,
    verbose_plot=True,
)

In [None]:
def accuracy_binary(outputs, y, threshold=0.5):
    """Count predictions that agree with the binary targets.

    Args:
        outputs: tensor of scores; a score above ``threshold`` is a positive
            prediction.
        y: tensor of targets, compared element-wise with the predictions.
        threshold: decision threshold applied to ``outputs``.

    Returns:
        Tensor with the number of matches, reduced over dimension 0. This is a
        count rather than a ratio; pass it to ``compute_error`` with
        ``c_sum=True`` to obtain the accuracy.
    """
    pred = outputs > threshold
    # Vectorised reduction instead of the Python builtin sum(), which iterates
    # row by row and returns a plain int for an empty batch.
    return (pred == y).sum(dim=0)

# Validation accuracy: accuracy_binary returns per-batch match counts, so
# c_sum=True makes compute_error divide their total by the number of examples.
compute_error(model, valid_loader, criterion=accuracy_binary, c_sum=True)