# LSTM for Amazon Reviews

In [2]:
# import packages
import bz2
import re
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
from torchtext.vocab import build_vocab_from_iterator 
from torchtext.data.utils import get_tokenizer;

Read the review files

In [3]:
# Read the compressed files containing the train and test sentences from Amazon reviews.
# These files can be downloaded from:
# https://www.kaggle.com/datasets/bittlingmayer/amazonreviews
num_train = 800000  # We're training on the first 800,000 reviews in the dataset
num_test = 400000  # Using 400,000 reviews from the test set (the entire set)


def read_reviews(path, max_reviews):
    """Return up to the first `max_reviews` lines of a bz2 file, decoded as UTF-8.

    The file is read line by line and reading stops as soon as enough lines
    were collected, so the rest of the archive is never decompressed. The
    `with` block closes the file handle even if decoding fails.
    """
    reviews = []
    with bz2.BZ2File(path) as compressed:
        for raw_line in compressed:
            if len(reviews) >= max_reviews:
                break
            reviews.append(raw_line.decode('utf-8'))
    return reviews


train_set = read_reviews('./train.ft.txt.bz2', num_train)
test_set = read_reviews('./test.ft.txt.bz2', num_test)

In [4]:
# Inspect one raw training line; change `ind` to look at other reviews.
# Question: which label (1 or 2) marks a positive review and which a negative one?
ind = 8
sample_review = train_set[ind]
print(sample_review)

__label__2 A FIVE STAR BOOK: I just finished reading Whisper of the Wicked saints. I fell in love with the caracters. I expected an average romance read, but instead I found one of my favorite books of all time. Just when I thought I could predict the outcome I was shocked ! The writting was so descriptive that my heart broke when Julia's did and I felt as if I was there with them instead of just a distant reader. If you are a lover of romance novels then this is a must read. Don't let the cover fool you this book is spectacular!



# answer
Which label (1 or 2) is for positive review and which is for negative?

2 is a positive review label, 1 is for negative

The next step is to split the labels from the sentences and prepare the sentences for tokenization, e.g., by replacing URLs with a `<url>` token

In [7]:
def pre_processing(dataset):
    """Split fastText-style review lines into binary labels and cleaned text.

    Args:
        dataset: iterable of strings shaped like "__label__N <review text>\\n".

    Returns:
        A `(labels, sentences)` pair of equally long lists. `labels` holds 0 for
        lines tagged "__label__1" and 1 for any other tag. `sentences` holds the
        lowercased review text with every digit replaced by "0" and URL-like
        tokens replaced by "<url>".
    """
    # Raw strings keep "\d" and "\." from being parsed as (invalid) string escapes.
    digit_pattern = re.compile(r'\d')
    url_pattern = re.compile(r"([^ ]+(?<=\.[a-z]{3}))")
    url_hints = ('www.', 'http:', 'https:', '.com')

    labels = []
    sentences = []
    for line in dataset:
        # Drop only a real trailing newline, so a final line without one
        # keeps its last character.
        if line.endswith('\n'):
            line = line[:-1]

        # partition() tolerates a line that carries a label but no review text.
        tag, _, text = line.partition(' ')
        labels.append(0 if tag == '__label__1' else 1)

        # Note the transformation to lowercase; digits are then collapsed to "0".
        text = digit_pattern.sub('0', text.lower())

        # Only run the URL substitution on sentences that look like they contain one.
        if any(hint in text for hint in url_hints):
            text = url_pattern.sub("<url>", text)

        sentences.append(text)

    return labels, sentences


# Apply the same label extraction and text cleaning to both splits
(train_labels, train_sentences), (test_labels, test_sentences) = (
    pre_processing(split) for split in (train_set, test_set)
)

After preparing the sentences we tokenize them

In [8]:
# torchtext's built-in tokenizer for basic English
tokenizer = get_tokenizer("basic_english")

# Show one sentence before and after tokenization; change `ind` to try others
ind = 0
example_sentence = train_sentences[ind]
print('%--------Original sentence--------%')
print(example_sentence)
print('\n%--------Tokenized sentence--------%')
print(tokenizer(example_sentence))

%--------Original sentence--------%
stuning even for the non-gamer: this sound track was beautiful! it paints the senery in your mind so well i would recomend it even to people who hate vid. game music! i have played the game chrono cross but out of all of the games i have ever played it has the best music! it backs away from crude keyboarding and takes a fresher step with grate guitars and soulful orchestras. it would impress anyone who cares to listen! ^_^

%--------Tokenized sentence--------%
['stuning', 'even', 'for', 'the', 'non-gamer', 'this', 'sound', 'track', 'was', 'beautiful', '!', 'it', 'paints', 'the', 'senery', 'in', 'your', 'mind', 'so', 'well', 'i', 'would', 'recomend', 'it', 'even', 'to', 'people', 'who', 'hate', 'vid', '.', 'game', 'music', '!', 'i', 'have', 'played', 'the', 'game', 'chrono', 'cross', 'but', 'out', 'of', 'all', 'of', 'the', 'games', 'i', 'have', 'ever', 'played', 'it', 'has', 'the', 'best', 'music', '!', 'it', 'backs', 'away', 'from', 'crude', 'keyboar

Then we build a dictionary containing words/tokens that we have in the training set

In [15]:
# Upper bound on the number of distinct tokens kept in the vocabulary
max_num_tokens = 50_000

# Generator that feeds tokenized texts to the vocabulary builder
def yield_tokens(data_iter):
    """Yield the token list produced by `tokenizer` for each text in `data_iter`."""
    yield from map(tokenizer, data_iter)


# Build a vocabulary from an iterable of training sentences
def get_vocab(train_datapipe, _max_tokens=None):
    """Create a torchtext vocabulary from the sentences in `train_datapipe`.

    The sentences are tokenized through `yield_tokens`. The vocabulary is built
    with `min_freq=2`, the special tokens "<pad>" and "<unk>", and at most
    `_max_tokens` entries. The index of "<unk>" is installed as the default
    index of the returned vocabulary.
    """
    special_tokens = ["<pad>", "<unk>"]
    token_stream = yield_tokens(train_datapipe)
    vocab = build_vocab_from_iterator(
        token_stream,
        min_freq=2,
        specials=special_tokens,
        max_tokens=_max_tokens,
    )

    # Use the index of <unk> as the vocabulary's default index
    unk_index = vocab["<unk>"]
    vocab.set_default_index(unk_index)

    return vocab

# Build the vocabulary from the Amazon Reviews training sentences
training_sentence_iter = iter(train_sentences)
vocab = get_vocab(training_sentence_iter, _max_tokens=max_num_tokens)

print('Size of the vocabulary: ', len(vocab))

Size of the vocabulary:  50000


In [19]:
# Look up the vocabulary index of a few tokens: 'what', 'day', 'yellow', 'perpetual'
tokens = ['what', 'day', 'yellow', 'perpetual']
for token in tokens:
    token_index = vocab[token]
    print(f"the token {token}:", token_index)
# Question: in what order is the dictionary arranged?

the token what: 54
the token day: 224
the token yellow: 2514
the token perpetual: 16460


# answer 
The tokens are ordered by frequency: the more often a token appeared in the reviews, the lower its index.

In [20]:
# Show which tokens occupy the first 10 vocabulary indices
for ind in range(10):
    print(f"token number {ind} is:", vocab.lookup_token(ind))

token number 0 is: <pad>
token number 1 is: <unk>
token number 2 is: .
token number 3 is: the
token number 4 is: ,
token number 5 is: i
token number 6 is: and
token number 7 is: a
token number 8 is: to
token number 9 is: it


In [21]:
# Indices assigned to the special tokens
print("special tokens:")
for special in ('<pad>', '<unk>'):
    print(special + ": ", vocab[special])

special tokens:
<pad>:  0
<unk>:  1


<generator object yield_tokens at 0x7f7ce31d5540>

Given the dictionary, we will set the final tokenization of the training set and the test set. Note that from now on we will use the indices of the tokens rather than the tokens themselves.

In [27]:
# Convert a list of sentences into lists of vocabulary indices
def get_inds_from_vocab(sentences_list, vocab):
    """Tokenize every sentence and map its tokens to indices with `vocab.forward`.

    Returns a list with one list of indices per input sentence, in input order.
    """
    return [vocab.forward(tokenizer(sentence)) for sentence in sentences_list]

# Token indices for the training sentences and for the test sentences
train_tokenized_sentences_inds = get_inds_from_vocab(sentences_list=train_sentences, vocab=vocab)
test_tokenized_sentences_inds = get_inds_from_vocab(sentences_list=test_sentences, vocab=vocab)

After we tokenized the corpus, we set the sentences to a fixed length either by cutting long sentences or padding short sentences

In [28]:
# Bring every sentence to exactly `seq_len` entries: long ones are cut, short ones are zero-padded
def pad_input(sentences, seq_len):
    """Return an integer array of shape (len(sentences), seq_len).

    Each row holds the first `seq_len` entries of the matching sentence,
    right-aligned, with zeros filling the unused positions on the left.
    Empty sentences give an all-zero row.
    """
    padded = np.zeros((len(sentences), seq_len), dtype=int)
    for row, token_ids in enumerate(sentences):
        if len(token_ids) == 0:
            continue
        kept = np.array(token_ids)[:seq_len]
        # Right-align so the last kept token sits in the last column
        padded[row, seq_len - len(kept):] = kept
    return padded

# Every review is padded or shortened to this many tokens
seq_len = 200

# Fixed-length index matrices for both splits
train_reviews = pad_input(train_tokenized_sentences_inds, seq_len)
test_reviews = pad_input(test_tokenized_sentences_inds, seq_len)

# Labels as numpy arrays
train_labels, test_labels = np.array(train_labels), np.array(test_labels)

Split the test dataset into validation and test sets

In [29]:
split_frac = 0.5  # 50% validation, 50% test
split_id = int(split_frac * len(test_reviews))

# The first `split_id` rows become the validation set, the rest stay as the test set
val_part, test_part = slice(None, split_id), slice(split_id, None)
val_reviews, test_reviews = test_reviews[val_part], test_reviews[test_part]
val_labels, test_labels = test_labels[val_part], test_labels[test_part]


Define parameters for training

In [30]:
# Training-loop settings
batch_size = 400
epochs = 1
counter = 0  # number of optimisation steps taken so far
print_every = 250  # run validation every this many steps
clip = 5  # maximum gradient norm passed to clip_grad_norm_
valid_loss_min = np.inf  # best validation loss so far (`np.Inf` was removed in NumPy 2.0)

# Model settings
output_size = 1
embedding_dim = 400
hidden_dim = 512
n_layers = 2
lr = 0.005
# NOTE(review): len(vocab) already counts <pad> and <unk>, so the +1 appears to
# add one unused embedding row; kept so the embedding shape is unchanged.
vocab_size = len(vocab) + 1
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


Define datasets and dataloaders

In [31]:
# Wrap the numpy arrays as (reviews, labels) tensor datasets
train_data = TensorDataset(*map(torch.from_numpy, (train_reviews, train_labels)))
val_data = TensorDataset(*map(torch.from_numpy, (val_reviews, val_labels)))
test_data = TensorDataset(*map(torch.from_numpy, (test_reviews, test_labels)))

# Training batches are shuffled and the incomplete last batch is dropped
train_loader = DataLoader(
    train_data,
    batch_size=batch_size,
    shuffle=True,
    drop_last=True,
    num_workers=4,
    persistent_workers=True,
    pin_memory=True,
)
# Validation and test use batches six times larger, in fixed order
val_loader = DataLoader(
    val_data,
    batch_size=6*batch_size,
    shuffle=False,
    drop_last=False,
    num_workers=4,
    persistent_workers=True,
    pin_memory=True,
)
test_loader = DataLoader(
    test_data,
    batch_size=6*batch_size,
    shuffle=False,
    drop_last=False,
    pin_memory=True,
)

Define the neural network

In [37]:
class SentimentNet(nn.Module):
    """Embedding -> stacked LSTM -> dropout -> linear classifier.

    Args:
        vocab_size: number of rows in the embedding table.
        output_size: number of outputs of the final linear layer.
        embedding_dim: size of each token embedding.
        hidden_dim: hidden size of the LSTM.
        n_layers: number of stacked LSTM layers.
        drop_prob: dropout probability applied before the linear layer.
    """

    def __init__(self, vocab_size, output_size, embedding_dim, hidden_dim, n_layers, drop_prob=0.5):
        super().__init__()
        self.output_size = output_size
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim

        # Embedding table; index 0 is the padding index
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)

        # Stacked LSTM working on (batch, seq, feature) inputs
        self.lstm = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_dim, num_layers=n_layers, batch_first=True)

        # Dropout layer
        self.dropout = nn.Dropout(drop_prob)

        # Final linear layer for classification
        self.fc = nn.Linear(hidden_dim, output_size)

    def forward(self, x):
        """Return the classifier output for the last time step of each sequence.

        Args:
            x: integer tensor of token indices, shape (batch, seq_len).

        Returns:
            Tensor of shape (batch,) when `output_size` is 1, otherwise
            (batch, output_size).
        """
        # Recast to type long
        x = x.long()

        embeds = self.embedding(x)

        # The returned hidden state is not needed
        lstm_out, _ = self.lstm(embeds)

        # Only the final time step feeds the classifier, so select it before
        # dropout and the linear layer instead of projecting every step.
        last_step = lstm_out[:, -1, :]
        out = self.fc(self.dropout(last_step))

        # Squeeze only the output axis: a bare squeeze() would also drop a
        # batch (or sequence) axis of size 1.
        return out.squeeze(-1)

In [38]:
# Build the network on the selected device, with its loss and optimizer
model = SentimentNet(
    vocab_size=vocab_size,
    output_size=output_size,
    embedding_dim=embedding_dim,
    hidden_dim=hidden_dim,
    n_layers=n_layers,
).to(device)
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

Training

In [39]:
# Train the model. Every `print_every` steps the validation set is evaluated,
# and the weights are saved whenever the mean validation loss does not get worse.
model.train()
for i in range(epochs):

    for inputs, labels in train_loader:
        counter += 1
        inputs, labels = inputs.to(device), labels.to(device)
        optimizer.zero_grad()
        output = model(inputs)
        loss = criterion(output.squeeze(), labels.float())
        loss.backward()
        # Limit the gradient norm to `clip` before the update
        nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()

        if counter % print_every == 0:
            val_losses = []
            num_correct = 0
            model.eval()
            with torch.no_grad():
                for inp, lab in val_loader:
                    inp, lab = inp.to(device), lab.to(device)
                    out = model(inp).squeeze()
                    val_losses.append(criterion(out, lab.float()).item())
                    # A positive logit is a prediction of class 1
                    pred = (out > 0).int()
                    num_correct += pred.eq(lab.view_as(pred)).sum().item()
            # Back to training mode for the next optimisation steps
            model.train()

            mean_val_loss = np.mean(val_losses)
            val_acc = num_correct/len(val_loader.dataset)

            print("Epoch: {}/{}...".format(i+1, epochs),
                  "Step: {}...".format(counter),
                  "Loss: {:.6f}...".format(loss.item()),
                  "Val Loss: {:.6f}...".format(mean_val_loss),
                  "Val accuracy: {:.3f}%".format(val_acc*100))
            if mean_val_loss <= valid_loss_min:
                torch.save(model.state_dict(), './best_model.pt')
                # Single-line literal: a backslash continuation inside the
                # string would embed the source indentation in the message.
                print('Validation loss decreased ({:.6f} --> {:.6f}). '
                      'Saving model ...'.format(valid_loss_min, mean_val_loss))
                valid_loss_min = mean_val_loss

Epoch: 1/1... Step: 2250... Loss: 0.180460... Val Loss: 0.209140... Val accuracy: 91.786%
Validation loss decreased (0.683126 --> 0.209140).                       Saving model ...
Epoch: 1/1... Step: 2500... Loss: 0.171479... Val Loss: 0.181286... Val accuracy: 92.955%
Validation loss decreased (0.209140 --> 0.181286).                       Saving model ...
Epoch: 1/1... Step: 2750... Loss: 0.183301... Val Loss: 0.174152... Val accuracy: 93.285%
Validation loss decreased (0.181286 --> 0.174152).                       Saving model ...
Epoch: 1/1... Step: 3000... Loss: 0.184490... Val Loss: 0.167213... Val accuracy: 93.659%
Validation loss decreased (0.174152 --> 0.167213).                       Saving model ...
Epoch: 1/1... Step: 3250... Loss: 0.133886... Val Loss: 0.164325... Val accuracy: 93.770%
Validation loss decreased (0.167213 --> 0.164325).                       Saving model ...
Epoch: 1/1... Step: 3500... Loss: 0.149206... Val Loss: 0.155926... Val accuracy: 94.115%
Validation

Evaluate performance on the test set

In [40]:
# Loading the best model; map_location lets a checkpoint saved on GPU load on a CPU-only run
model.load_state_dict(torch.load('./best_model.pt', map_location=device))
model.eval()

test_losses = []
num_correct = 0
with torch.no_grad():
    for inputs, labels in test_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        output = model(inputs).squeeze()
        test_losses.append(criterion(output, labels.float()).item())
        # A positive logit is a prediction of class 1
        pred = (output > 0).int()
        num_correct += pred.eq(labels.view_as(pred)).sum().item()

# Mean of the per-batch losses and overall accuracy on the test set
print("Test loss: {:.3f}".format(np.mean(test_losses)))
test_acc = num_correct/len(test_loader.dataset)
print("Test accuracy: {:.3f}%".format(test_acc*100))

Test loss: 0.152
Test accuracy: 94.343%
