In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertModel
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
from bs4 import BeautifulSoup 

In [2]:
DATA_PATH = "data/"
pretrained_type = 'bert-base-uncased'
seed = 42

test_size = 0.1
device = "cuda"

In [3]:
df = pd.read_csv(DATA_PATH + "Reviews.csv")

In [4]:
max_length_sequence = 256

In [5]:
text = df["Text"].values.tolist()
labels = df["Score"].values.astype(int)

In [6]:
# Since we would like to do positive / negative sentiment prediction we will remove review value 3 since
# it can be viewed as neutral

text = [text[i] for i in range(len(text)) if labels[i] != 3]
labels = np.array([labels[i] for i in range(len(labels)) if labels[i] != 3])
labels = (labels > 3).astype(int) # Binary classification

In [7]:
# Sub-sampling since it would take too long to train otherwise on my computer
num_samples = 10000

np.random.seed(seed)
idx = np.random.choice(np.arange(len(text)), size=num_samples, replace=False)
text = [text[i] for i in idx]
labels = labels[idx]

In [8]:
text_train, text_val, labels_train, labels_val = train_test_split(text, labels, test_size=test_size, random_state=seed)

In [9]:
class Reviews(Dataset):
    def __init__(self, text, labels):
        self.text = text
        self.labels = torch.tensor(labels, dtype=torch.float)
        self.len = len(text)
        self.tokenizer = BertTokenizer.from_pretrained(pretrained_type)

    def __len__(self):
        return self.len

    def __getitem__(self, idx):
        txt = BeautifulSoup(self.text[idx]).get_text().lower() 
        tokens = self.tokenizer.encode(txt, add_special_tokens=True, max_length=max_length_sequence)
        tokens = torch.tensor(tokens, dtype=torch.long)
        label = self.labels[idx]
        return tokens, label

def collate_fn(batch):
    labels = torch.tensor([b[1] for b in batch])

    lengths = [len(b[0]) for b in batch]
    max_length = min([max(lengths), max_length_sequence])

    attention_mask = torch.zeros((len(batch), max_length), dtype=torch.int)
    idx_tensor = torch.zeros((len(batch), max_length), dtype=torch.long)

    for i in range(len(batch)):
        batch_len = lengths[i]
        batch_len = min([max_length_sequence, batch_len])

        attention_mask[i, 0:batch_len] = 1
        idx_tensor[i, 0:batch_len] = batch[i][0][0:batch_len]

    return idx_tensor, attention_mask, labels

In [10]:
num_workers = 4
batch_size = 7

train_dataset = Reviews(text_train, labels_train)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True,
                              num_workers=num_workers, collate_fn=collate_fn)

val_dataset = Reviews(text_val, labels_val)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False,
                              num_workers=num_workers, collate_fn=collate_fn)

In [11]:
class SentimentClassifier(nn.Module):
    def __init__(self, encoder_dim=768):
        super().__init__()
        self.bert_model = BertModel.from_pretrained(pretrained_type)
        self.linear = nn.Linear(encoder_dim, 1)

    def forward(self, x, attention_mask):
        output = self.bert_model(x, attention_mask=attention_mask)[1]
        output = self.linear(output)
        return output[:, 0]

In [12]:
torch.manual_seed(seed)
model = SentimentClassifier()
model = model.to(device)

In [13]:
epochs = 1
lr = 1e-5

optimizer = optim.Adam(model.parameters(), lr=lr)

In [14]:
loss_fct = nn.BCEWithLogitsLoss()

In [15]:
torch.manual_seed(seed)
for epoch in range(epochs):
    train_loss, val_loss, val_acc = 0.0, 0.0, 0.0

    model.train()
    for _, (idx_tensor, attention_mask, labels) in enumerate(train_dataloader):
        idx_tensor, attention_mask, labels = idx_tensor.to(device), attention_mask.to(device), labels.to(device)
        optimizer.zero_grad()

        output = model(idx_tensor, attention_mask)

        batch_loss = loss_fct(output, labels)        
        batch_loss.backward()
        optimizer.step()

        train_loss += batch_loss.detach().cpu().numpy()

    model.eval()
    with torch.no_grad():
        for _, (idx_tensor, attention_mask, labels) in enumerate(val_dataloader):
            idx_tensor, attention_mask, labels = idx_tensor.to(device), attention_mask.to(device), labels.to(device)

            output = model(idx_tensor, attention_mask)
            batch_loss = loss_fct(output, labels)
            val_loss += batch_loss.cpu().numpy()

            y_hat = (torch.sigmoid(output) > 0.5).long()
            batch_acc = (y_hat == labels).float().mean()
            val_acc += batch_acc.cpu().numpy()

    train_loss = np.round(train_loss / len(train_dataloader), 6)
    val_loss = np.round(val_loss / len(val_dataloader), 6)
    val_acc = np.round(val_acc / len(val_dataloader), 6)

    print(f"----------- Epoch {epoch} -----------")
    print(f"Train loss: {train_loss}")
    print(f"Validation loss: {val_loss}")
    print(f"Validation accuracy: {val_acc}")

----------- Epoch 0 -----------
Train loss: 0.176774
Validation loss: 0.139315
Validation accuracy: 0.944888
