In [26]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch as th

In [74]:
# define device
device = th.device("mps") if th.backends.mps.is_available() else th.device("cuda") if th.cuda.is_available() else th.device("cpu")

# print device properties according to device type
if device.type == "cuda":
    print(th.cuda.get_device_name(device))
elif device.type == "mps":
    print(th.backends.mps.get_device_name(device))
elif device.type == "cpu":
    print("CPU")

NVIDIA GeForce RTX 2070 SUPER


In [75]:
# paths
data_dir = "datasets/"
data_all = "All_Amazon_Review_5.json"
data_video = "Amazon_Instant_Video_5.json"

# read data with pandas
df = pd.read_json(data_dir + data_video, lines=True)

# lower case all headers
df.columns = map(str.lower, df.columns)

# keep only the review text, rating, and summary
df = df[['reviewtext', 'overall', 'summary']]
print(df.head())

# find max length of review text with numpy
max_review_len = np.max(df['reviewtext'].apply(len))
print("\nMax length of review text: ", max_review_len)
# find max length of summary with numpy
max_summary_len = np.max(df['summary'].apply(len))
print("Max length of summary: ", max_summary_len)


                                          reviewtext  overall  \
0  I had big expectations because I love English ...        2   
1  I highly recommend this series. It is a must f...        5   
2  This one is a real snoozer. Don't believe anyt...        1   
3  Mysteries are interesting.  The tension betwee...        4   
4  This show always is excellent, as far as briti...        5   

                          summary  
0      A little bit boring for me  
1           Excellent Grown Up TV  
2           Way too boring for me  
3     Robson Green is mesmerizing  
4  Robson green and great writing  

Max length of review text:  18152
Max length of summary:  151


In [76]:
# torch dataset from pandas dataframe
# defines a voacbulary of words and converts the review text to a list of indices

In [95]:
# torch dataset from pandas dataframe
# defines a voacbulary of words and converts the review text to a list of indices
# beware of symbols like ., !, ? etc.
# pad the review text and summary to max_review_len and max_summary_len respectively

class ReviewDataset(th.utils.data.Dataset):
    def __init__(self, df):
        self.df = df
        self.vocab = set()

        # call the function to create the vocabulary
        self.create_vocab()

        self.vocab = sorted(self.vocab)
        self.vocab2idx = {word: i for i, word in enumerate(self.vocab)}
        self.idx2vocab = {i: word for i, word in enumerate(self.vocab)}
        self.vocab_size = len(self.vocab)
    
    def create_vocab(self):
        # create the shared vocabulary
        for review in self.df['reviewtext']:
            for word in review.split():
                self.vocab.add(word)
        for summary in self.df['summary']:
            for word in summary.split():
                self.vocab.add(word)

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        review = self.df.iloc[idx, 0].split()
        review = [self.vocab2idx[word] for word in review]
        review = th.tensor(review, dtype=th.long)
        review = th.nn.functional.pad(review, (0, max_review_len - len(review)))
        rating = self.df.iloc[idx, 1]
        rating = th.tensor(rating, dtype=th.long)
        summary = self.df.iloc[idx, 2].split()
        summary = [self.vocab2idx[word] for word in summary]
        summary = th.tensor(summary, dtype=th.long)
        summary = th.nn.functional.pad(summary, (0, max_summary_len - len(summary)))

        # move tensors to device
        review = review.to(device)
        rating = rating.to(device)
        summary = summary.to(device)
        
        return review, rating, summary

In [85]:
# test the dataset
dataset = ReviewDataset(df)
print(dataset[0])

(tensor([ 45253, 122627,  88635,  ...,      0,      0,      0], device='cuda:0'), tensor(2, device='cuda:0'), tensor([ 19675, 136056,  88914,  90146, 117201, 138804,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
          

In [86]:
"""
Model
uses context aware word embedding
multi-task network

Input: takes in a review string
Task 1: output a summary string of the input review with a max length defined by the dataset
Task 2: output a rating of the input review as a float 0-1

Use an encoder decoder setup with one decoder for each task
"""
class Summariser(th.nn.Module):
    def __init__(self, vocab_size, embedding_dim, max_review_len, max_summary_len):
        super(Summariser, self).__init__()
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.max_review_len = max_review_len
        self.max_summary_len = max_summary_len
        self.embedding = th.nn.Embedding(vocab_size, embedding_dim)
        self.encoder = th.nn.LSTM(embedding_dim, embedding_dim, num_layers=2, bidirectional=True, batch_first=True)
        self.decoder1 = th.nn.LSTM(embedding_dim, embedding_dim, num_layers=2, batch_first=True)
        self.decoder2 = th.nn.LSTM(embedding_dim, embedding_dim, num_layers=2, batch_first=True)
        self.linear1 = th.nn.Linear(embedding_dim, vocab_size)
        self.linear2 = th.nn.Linear(embedding_dim, 1)
        self.softmax = th.nn.Softmax(dim=2)
        self.sigmoid = th.nn.Sigmoid()

    def forward(self, x):
        x = self.embedding(x)
        x, _ = self.encoder(x)
        x1, _ = self.decoder1(x)
        x2, _ = self.decoder2(x)
        x1 = self.linear1(x1)
        x1 = self.softmax(x1)
        x2 = self.linear2(x2)
        x2 = self.sigmoid(x2)
        return x1, x2

In [100]:
"""
Dataset preparation
Use the ReviewDataset to create a DataLoader
Splitting the train, validation, and test sets
"""
# initialise the dataset
dataset = ReviewDataset(df)
dataset_size = len(dataset)

# shrink dataset for testing
dataset_size = 500
dataset = th.utils.data.Subset(dataset, range(dataset_size))

# split the dataset
train_size = int(0.8 * dataset_size)
val_size = int(0.1 * dataset_size)
test_size = dataset_size - train_size - val_size
train_dataset, val_dataset, test_dataset = th.utils.data.random_split(dataset, [train_size, val_size, test_size])

# create the dataloaders
batch_size = 32
train_loader = th.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = th.utils.data.DataLoader(val_dataset, batch_size=batch_size, shuffle=True)
test_loader = th.utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle=True)

In [101]:
# test the dataloader
train_loader_iter = iter(train_loader)
x, y, z = next(train_loader_iter)
print(x.shape, y.shape, z.shape)


torch.Size([32, 18152]) torch.Size([32]) torch.Size([32, 151])


In [102]:
"""
Training
"""
# initialise the model
# take into account if it is a subset of the dataset
model = Summariser(dataset.dataset.vocab_size, 256, max_review_len, max_summary_len)
model = model.to(device)

# define the loss functions
loss_fn1 = th.nn.CrossEntropyLoss()
loss_fn2 = th.nn.BCELoss()

# define the optimiser
optimiser = th.optim.Adam(model.parameters(), lr=0.001)

# define the number of epochs
epochs = 10

# train the model
for epoch in range(epochs):
    for review, rating, summary in train_loader:
        # zero the gradients
        optimiser.zero_grad()

        # forward pass
        y_pred1, y_pred2 = model(review)

        # calculate the loss
        loss1 = loss_fn1(y_pred1, summary)
        loss2 = loss_fn2(y_pred2, rating.unsqueeze(1).float())
        loss = loss1 + loss2

        # backward pass
        loss.backward()

        # update the weights
        optimiser.step()

    # print the loss
    print(f'Epoch {epoch + 1}/{epochs}, Loss: {loss.item():.4f}')


RuntimeError: CUDA out of memory. Tried to allocate 6.69 GiB (GPU 0; 8.00 GiB total capacity; 3.93 GiB already allocated; 0 bytes free; 7.13 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF