In [None]:
import re
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertModel
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim

In [None]:
import logging

# The tokenizer logs a warning for every over-long sequence it encodes;
# only let errors from that module through so the notebook output stays readable.
_tokenizer_logger = logging.getLogger("transformers.tokenization_utils")
_tokenizer_logger.setLevel(logging.ERROR)

In [None]:
# --- Notebook configuration -------------------------------------------------
DATA_PATH = "data/"                      # directory that holds Reviews.csv
pretrained_type = 'bert-base-uncased'    # checkpoint used for tokenizer and encoder
seed = 42                                # shared seed for the split and the RNGs below

test_size = 0.1                          # fraction of samples held out for validation
# Use the GPU when one is visible, otherwise fall back to the CPU so the
# notebook still runs end-to-end on machines without CUDA.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Seed the global generators so weight initialisation and DataLoader shuffling
# are repeatable across "Restart & Run All".
np.random.seed(seed)
torch.manual_seed(seed)

In [None]:
# Read the raw review table from the configured data directory.
reviews_csv_path = DATA_PATH + "Reviews.csv"
df = pd.read_csv(reviews_csv_path)

In [None]:
# Upper bound on the number of tokens kept per review; collate_fn truncates
# longer token sequences to this length when it builds a batch.
max_length_sequence = 256

In [None]:
# Review bodies as a plain Python list.
text = df["Text"].tolist()

# Collapse the integer score into a binary target: 1 when the score is above 3, else 0.
scores = df["Score"].values.astype(int)
labels = (scores > 3).astype(int) # Binary classification

In [None]:
# Work on the leading slice of the data only, to keep the run short.
n_samples = 1000
text, labels = text[:n_samples], labels[:n_samples]

In [None]:
# Hold out `test_size` of the samples for validation; the fixed random_state
# makes the split identical on every run.
text_train, text_val, labels_train, labels_val = train_test_split(
    text,
    labels,
    test_size=test_size,
    random_state=seed,
)

In [None]:
class Reviews(Dataset):
    """Map-style dataset that yields (token ids, label) pairs for review texts.

    Each text is stripped of HTML-like tags and tokenized lazily, when the
    item is requested, rather than up front.
    """

    def __init__(self, text, labels):
        """Store the texts and labels and load the pretrained tokenizer.

        Args:
            text: indexable collection of review strings.
            labels: array-like of numeric labels; stored as a float tensor.
        """
        self.text = text
        self.labels = torch.tensor(labels, dtype=torch.float)
        self.len = len(text)
        self.tokenizer = BertTokenizer.from_pretrained(pretrained_type)

    def __len__(self):
        """Return the number of reviews in the dataset."""
        return self.len

    def __getitem__(self, idx):
        """Return the token-id tensor (long) and the label for review `idx`."""
        raw_review = self.text[idx]
        cleaned_review = re.sub('<[^<]+?>', '', raw_review) # Removing html-tags
        token_ids = self.tokenizer.encode(cleaned_review, add_special_tokens=True)
        return torch.tensor(token_ids, dtype=torch.long), self.labels[idx]


def collate_fn(batch, max_length=None):
    """Pad/truncate a list of (token ids, label) samples into batch tensors.

    Args:
        batch: list of ``(tokens, label)`` pairs, where ``tokens`` is a 1-D
            tensor of token ids and ``label`` is a scalar.
        max_length: maximum number of tokens kept per sample (expected >= 1).
            When omitted, the module-level ``max_length_sequence`` is used,
            so ``DataLoader`` can keep calling ``collate_fn(batch)``.

    Returns:
        Tuple ``(idx_tensor, attention_mask, labels)``:
        ``idx_tensor`` is a long tensor of shape (batch, width) padded with 0,
        ``attention_mask`` is an int tensor of the same shape with 1 on real
        tokens and 0 on padding, and ``labels`` is a 1-D tensor of the labels.
        ``width`` is the longest (possibly truncated) sequence in the batch.
    """
    if max_length is None:
        max_length = max_length_sequence

    labels = torch.tensor([b[1] for b in batch])

    # Per-sample length after truncation; the batch is only as wide as needed.
    lengths = [min(len(b[0]), max_length) for b in batch]
    width = max(lengths)

    attention_mask = torch.zeros((len(batch), width), dtype=torch.int)
    idx_tensor = torch.zeros((len(batch), width), dtype=torch.long)

    for i, (tokens, _) in enumerate(batch):
        keep = lengths[i]

        attention_mask[i, 0:keep] = 1
        idx_tensor[i, 0:keep] = tokens[0:keep]

        if len(tokens) > keep:
            # The sequence was truncated: a plain slice would drop the
            # terminal token (the end-of-sequence marker added by the
            # tokenizer), so put it back in the last kept position.
            idx_tensor[i, keep - 1] = tokens[-1]

    return idx_tensor, attention_mask, labels

In [None]:
num_workers = 4
batch_size = 128

# Settings shared by both loaders; only the shuffling differs between them.
loader_kwargs = dict(batch_size=batch_size, num_workers=num_workers, collate_fn=collate_fn)

# Training data is reshuffled every epoch.
train_dataset = Reviews(text_train, labels_train)
train_dataloader = DataLoader(train_dataset, shuffle=True, **loader_kwargs)

# Validation data is iterated in a fixed order.
val_dataset = Reviews(text_val, labels_val)
val_dataloader = DataLoader(val_dataset, shuffle=False, **loader_kwargs)

In [None]:
def nanmean(v, *args, inplace=False, **kwargs):
    """
    Mean of a tensor that ignores NaN entries.

    NaNs are replaced by zero before summing and the sum is divided by the
    number of non-NaN entries. Extra positional/keyword arguments are passed
    to both ``sum`` calls (e.g. ``dim=1``). With ``inplace=True`` the NaNs in
    the caller's tensor are overwritten with zeros instead of working on a copy.
    A slice that contains only NaNs yields NaN (0 / 0).
    """

    work = v if inplace else v.clone()
    nan_positions = torch.isnan(work)
    work[nan_positions] = 0

    valid_count = (~nan_positions).float().sum(*args, **kwargs)
    total = work.sum(*args, **kwargs)
    return total / valid_count

def nanmax(v, *args, inplace=False, **kwargs):
    """
    Max of a tensor that ignores NaN entries.

    NaNs are replaced by ``-inf`` before reducing, so they never win the
    comparison. Extra positional/keyword arguments are passed to
    ``Tensor.max`` (e.g. ``dim=1``). With ``inplace=True`` the NaNs in the
    caller's tensor are overwritten with ``-inf`` instead of working on a copy.

    Returns the maximum values only (no indices). A slice that contains only
    NaNs yields ``-inf``.
    """

    if not inplace:
        v = v.clone()
    is_nan = torch.isnan(v)
    v[is_nan] = -float("inf")

    result = v.max(*args, **kwargs)
    # Tensor.max returns a (values, indices) pair only when reducing along a
    # dim; a full reduction returns a bare tensor, which must not be indexed.
    if torch.is_tensor(result):
        return result
    return result[0]

In [None]:
class SentimentClassifier(nn.Module):
    """Binary sentiment head on top of a pretrained BERT encoder.

    The encoder's token embeddings are pooled with a masked mean and a masked
    max over the sequence axis, concatenated, and mapped to a single logit by
    a linear layer. The encoder weights are frozen on construction.
    """

    def __init__(self, encoder_dim=768):
        """
        Args:
            encoder_dim: size of the encoder's per-token output vectors; the
                linear head takes ``2 * encoder_dim`` inputs (mean + max pool).
        """
        super().__init__()
        self.bert_model = BertModel.from_pretrained(pretrained_type)
        self.change_freezing()
        self.linear = nn.Linear(2 * encoder_dim, 1)

    def change_freezing(self, mode=False):
        """Set ``requires_grad`` of every encoder parameter to ``mode``.

        ``mode=False`` (default) freezes the encoder, ``mode=True`` unfreezes it.
        """
        for param in self.bert_model.parameters():
            param.requires_grad = mode

    def train(self, mode=True):
        """Switch train/eval mode but keep the encoder's Dropout and LayerNorm
        modules in eval mode regardless of ``mode``.

        Returns ``self``, matching ``nn.Module.train`` so that chained calls
        and ``nn.Module.eval`` (which delegates here) keep working.
        """
        super().train(mode)

        for m in self.bert_model.modules():
            if isinstance(m, (nn.Dropout, nn.LayerNorm)):
                m.eval()

        return self

    def forward(self, x, mask):
        """Compute one logit per sample.

        Args:
            x: token-id tensor of shape (batch, seq).
            mask: tensor of the same shape, non-zero on real tokens and zero
                on padding.

        Returns:
            Tensor of shape (batch,) with the raw (pre-sigmoid) logits.
        """
        # The mask has to reach the encoder as well, otherwise real tokens
        # attend to the padding positions.
        seq = self.bert_model(x, attention_mask=mask)[0]
        # Mark padded positions as NaN so the NaN-aware pools below skip them.
        seq[~mask.bool()] = float("nan")

        output = torch.cat([
            nanmean(seq, axis=1),
            nanmax(seq, axis=1)
            ], dim=1)

        output = self.linear(output)
        return output[:, 0]

In [None]:
# Build the classifier and move all of its parameters to the target device.
model = SentimentClassifier().to(device)

In [None]:
epochs = 2
lr = 1e-3

# Hand the optimizer only the parameters that still require gradients,
# so parameters with requires_grad=False are never tracked by Adam.
trainable_params = [p for p in model.parameters() if p.requires_grad]
optimizer = optim.Adam(trainable_params, lr=lr)

In [None]:
# Binary cross-entropy that takes raw logits (the sigmoid is applied inside
# the loss), so the model output is passed to it without an activation.
loss_fct = nn.BCEWithLogitsLoss()

In [None]:
for epoch in range(epochs):
    # Running sums are weighted by batch size, so a smaller final batch does
    # not count as much as a full one in the epoch averages.
    train_loss_sum, train_seen = 0.0, 0
    val_loss_sum, val_correct, val_seen = 0.0, 0, 0

    # ---- training pass ----
    model.train()
    for idx_tensor, attention_mask, labels in train_dataloader:
        idx_tensor, attention_mask, labels = idx_tensor.to(device), attention_mask.to(device), labels.to(device)
        optimizer.zero_grad()

        output = model(idx_tensor, attention_mask)

        batch_loss = loss_fct(output, labels)
        batch_loss.backward()
        optimizer.step()

        n_batch = labels.size(0)
        # The loss is a per-batch mean; scale it back to a sum over samples.
        train_loss_sum += batch_loss.item() * n_batch
        train_seen += n_batch

    # ---- validation pass (no gradients) ----
    model.eval()
    with torch.no_grad():
        for idx_tensor, attention_mask, labels in val_dataloader:
            idx_tensor, attention_mask, labels = idx_tensor.to(device), attention_mask.to(device), labels.to(device)

            output = model(idx_tensor, attention_mask)
            batch_loss = loss_fct(output, labels)

            n_batch = labels.size(0)
            val_loss_sum += batch_loss.item() * n_batch
            val_seen += n_batch

            # Threshold the predicted probability at 0.5 and compare in the
            # labels' own dtype to avoid a mixed-dtype comparison.
            y_hat = (torch.sigmoid(output) > 0.5).to(labels.dtype)
            val_correct += (y_hat == labels).sum().item()

    # max(..., 1) guards against an empty loader.
    train_loss = np.round(train_loss_sum / max(train_seen, 1), 6)
    val_loss = np.round(val_loss_sum / max(val_seen, 1), 6)
    val_acc = np.round(val_correct / max(val_seen, 1), 6)

    print(f"----------- Epoch {epoch} -----------")
    print(f"Train loss: {train_loss}")
    print(f"Validation loss: {val_loss}")
    print(f"Validation accuracy: {val_acc}")