In [1]:
import spacy
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim

In [2]:
DATA_PATH = "data/"
seed = 42
device = "cuda"

embedding_dim = 300
test_size = 0.1
max_length_sequence = 128

In [3]:
df = pd.read_csv(DATA_PATH + "Reviews.csv")

In [4]:
text = df["Text"].values.tolist()
labels = df["Score"].values.astype(int)

In [5]:
# Since we would like to do positive / negative sentiment prediction we will remove review value 3 since
# it can be viewed as neutral

text = [text[i] for i in range(len(text)) if labels[i] != 3]
labels = np.array([labels[i] for i in range(len(labels)) if labels[i] != 3])
labels = (labels > 3).astype(int) # Binary classification

In [6]:
# Sub-sampling since it would take too long to train otherwise on my computer
num_samples = 10000

np.random.seed(seed)
idx = np.random.choice(np.arange(len(text)), size=num_samples, replace=False)
text = [text[i] for i in idx]
labels = labels[idx]

In [7]:
text_train, text_val, labels_train, labels_val = train_test_split(text, labels, test_size=test_size, random_state=seed)

In [8]:
class Reviews(Dataset):
    def __init__(self, text, labels):
        self.len = len(text)
        self.tokenizer = spacy.load("en_core_web_md")
        self.vectors = self.text_to_vectors(text)
        self.labels = torch.tensor(labels, dtype=torch.float)

    def text_to_vectors(self, text):
        vector_list = []
        
        for txt in text:
            txt = BeautifulSoup(txt).get_text().lower() 
            tokens = self.tokenizer(txt)

            vectors = []
            for token in tokens:
                if not token.is_oov:
                    vectors.append(token.vector)

            vectors = np.vstack(vectors)
            vectors = torch.from_numpy(vectors).float()
            vector_list.append(vectors)
        return vector_list

    def __len__(self):
        return self.len

    def __getitem__(self, idx):
        vector = self.vectors[idx]
        label = self.labels[idx]
        return vector, label


def collate_fn(batch):
    labels = torch.tensor([b[1] for b in batch])

    lengths = [len(b[0]) for b in batch]
    max_length = min([max(lengths), max_length_sequence])

    vector_tensor = torch.zeros((len(batch), max_length, embedding_dim))
    mask = torch.zeros((len(batch), max_length, 1), dtype=torch.int)

    for i in range(len(batch)):
        batch_len = lengths[i]
        batch_len = min([max_length_sequence, batch_len])
        
        mask[i, (max_length - batch_len):, :] = 1
        vector_tensor[i, (max_length - batch_len):, :] = batch[i][0][0:batch_len, :]

    return vector_tensor, mask, labels

In [9]:
num_workers = 4
batch_size = 128

train_dataset = Reviews(text_train, labels_train)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True,
                              num_workers=num_workers, collate_fn=collate_fn)

val_dataset = Reviews(text_val, labels_val)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False,
                              num_workers=num_workers, collate_fn=collate_fn)

In [10]:
class Model(nn.Module):
    def __init__(self, hidden_dim=128, p=0.2):
        super().__init__()
        
        self.lstm = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_dim, batch_first=True)
        self.dropout = nn.Dropout(p=p)
        self.linear = nn.Linear(3 * hidden_dim, 1)

    def forward(self, x, mask):
        output, _ = self.lstm(x)
        mask = mask.repeat(1, 1, output.shape[-1])
        
        # Concat the last hidden output, mean & max over all hidden outputs.
        output = torch.cat([
            torch.sum(mask * output, dim=1) / torch.sum(mask, dim=1),
            torch.max(mask * output, dim=1)[0], # Assuming 0 won't be max
            output[:, -1, :]
        ], dim=1)
        
        output = self.dropout(output)
        output = self.linear(output)
        return output[:, 0]

In [11]:
torch.manual_seed(seed)
model = Model()
model = model.to(device)

In [12]:
epochs = 5
lr = 1e-3
wd = 1e-4

optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=wd)

In [13]:
loss_fct = nn.BCEWithLogitsLoss()

In [14]:
torch.manual_seed(seed)
for epoch in range(epochs):
    train_loss, val_loss, val_acc = 0.0, 0.0, 0.0

    model.train()
    for _, (vectors_tensor, mask, labels) in enumerate(train_dataloader):
        vectors_tensor, mask, labels = vectors_tensor.to(device), mask.to(device), labels.to(device)
        optimizer.zero_grad()

        output = model(vectors_tensor, mask)

        batch_loss = loss_fct(output, labels)        
        batch_loss.backward()
        optimizer.step()

        train_loss += batch_loss.detach().cpu().numpy()

    model.eval()
    with torch.no_grad():
        for _, (vectors_tensor, mask, labels) in enumerate(val_dataloader):
            vectors_tensor, mask, labels = vectors_tensor.to(device), mask.to(device), labels.to(device)

            output = model(vectors_tensor, mask)
            batch_loss = loss_fct(output, labels)
            val_loss += batch_loss.cpu().numpy()

            y_hat = (torch.sigmoid(output) > 0.5).long()
            batch_acc = (y_hat == labels).float().mean()
            val_acc += batch_acc.cpu().numpy()

    train_loss = np.round(train_loss / len(train_dataloader), 6)
    val_loss = np.round(val_loss / len(val_dataloader), 6)
    val_acc = np.round(val_acc / len(val_dataloader), 6)

    print(f"----------- Epoch {epoch} -----------")
    print(f"Train loss: {train_loss}")
    print(f"Validation loss: {val_loss}")
    print(f"Validation accuracy: {val_acc}")

----------- Epoch 0 -----------
Train loss: 0.428948
Validation loss: 0.387902
Validation accuracy: 0.837515
----------- Epoch 1 -----------
Train loss: 0.32849
Validation loss: 0.312759
Validation accuracy: 0.857948
----------- Epoch 2 -----------
Train loss: 0.271902
Validation loss: 0.269043
Validation accuracy: 0.887019
----------- Epoch 3 -----------
Train loss: 0.235976
Validation loss: 0.260857
Validation accuracy: 0.878681
----------- Epoch 4 -----------
Train loss: 0.222214
Validation loss: 0.236063
Validation accuracy: 0.896484
