In [1]:
# Math and data
import numpy as np
import pandas as pd
import polars as pl
# Neural network frameworks
import torch as th
from torch import nn
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2Tokenizer, RobertaTokenizer
# Utilities
import re
from enum import Enum
import contractions as ct
import utility as util
import json
# Plotting
import matplotlib.pyplot as plt

# Pytorch device
device = th.device("mps") if th.backends.mps.is_available() else th.device("cuda") if th.cuda.is_available() else th.device("cpu")
if device.type == "cuda":
    print(th.cuda.get_device_name(device))
else:
    print(device)

  from .autonotebook import tqdm as notebook_tqdm


cpu


In [2]:
# instantiate roberta tokenizer
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

# write dictionary to file
def write_dict_to_file(dictionary, file_name):
    with open(file_name, 'w') as f:
        json.dump(dictionary, f)

write_dict_to_file(tokenizer.get_vocab(), "roberta_vocab.txt")

In [3]:
# Use simple GPT2 tokenizer for counting tokens
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# Load dataset -> Prune dataset -> Tokenize dataset
df = util.load_dataset(util.Paths.gift, util.DatasetType.ORIGINAL)
df = util.prune(df)
util.save_dataset(df, util.Paths.gift, util.DatasetType.PRUNED)
df = util.tokenize(df, tokenizer)

# Find max token length of review text with numpy
max_review_len = np.max(list(df['reviewText'].apply(list).apply(len)))
print("\nMax token length of review text: ", max_review_len)
# Find max token length of summary with numpy
max_summary_len = np.max((list(df['summary'].apply(list).apply(len))))
print("Max token length of summary: ", max_summary_len)


Max token length of review text:  553
Max token length of summary:  27


In [4]:
# torch dataset from pandas dataframe
# defines a voacbulary of words and converts the review text to a list of indices
# beware of symbols like ., !, ? etc.
# pad the review text and summary to max_review_len and max_summary_len respectively

"""
ReviewDataset pytorch dataset interface
- expects a polars dataframe with columns reviewText, summary, overall
- expects it in the DatasetType.PRUNED format
- expects a GPT2Tokenizer
"""
class ReviewDataset(Dataset):
    def __init__(self, df: pl.DataFrame, tokenizer: GPT2Tokenizer, dataset_type = util.DatasetType.PRUNED, max_review_len = 2000, max_summary_len = 200, lower_case = True, device = "cpu"):
        self.df = util.load_dataset(util.Paths.gift, dataset_type)
        self.dataset_type = dataset_type
        self.max_review_len = max_review_len
        self.max_summary_len = max_summary_len
        self.tokenizer = tokenizer
        self.lower_case = lower_case
        self.device = device

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        review = self.df["reviewText"][idx]
        summary = self.df["summary"][idx]
        rating = th.tensor(self.df["overall"][idx])

        # Tokenize the review and summary strings
        review = self.tokenizer.encode(review, add_special_tokens = True, padding = "max_length", truncation = True, max_length=self.max_review_len, return_tensors = "pt").squeeze()
        summary = self.tokenizer.encode(summary, add_special_tokens = True, padding = "max_length", truncation = True, max_length=self.max_summary_len, return_tensors = "pt").squeeze()

        # move tensors to device
        review = review.to(self.device)
        summary = summary.to(self.device)
        rating = rating.to(self.device)
        
        return review, summary, rating
    
    def detokenize(self, x: th.Tensor):
        return self.tokenizer.decode(x, skip_special_tokens=True, clean_up_tokenization_spaces=True)

In [5]:
# Test the dataset
# Setup
t = GPT2Tokenizer.from_pretrained("gpt2", add_bos_token=True, add_prefix_space=True, trim_offsets=True)
t.pad_token = t.eos_token
t.add_special_tokens({"bos_token": util.BOS_token})

# Create the dataset
dataset = ReviewDataset(util.Paths.gift, t, max_review_len = max_review_len, max_summary_len = max_summary_len, lower_case = False, device=device)

data_idx = 45
# print(f"Review: {dataset[data_idx][0]}")

# decode
print(f"Review: {ReviewDataset.detokenize(dataset, dataset[data_idx][0])}")
print(f"Summary: {ReviewDataset.detokenize(dataset, dataset[data_idx][1])}")
print(f"Rating: {int(dataset[data_idx][2])}")

# max length is the max index of the vocabulary
MAX_LENGTH = len(t)
print(f"MAX_LENGTH: {MAX_LENGTH}")

Review:  what is to go wrong with a gift card. as long as it enters into a person's account as a credit it is just what you paid for.
Summary:  gift card
Rating: 1
MAX_LENGTH: 50258


In [6]:
"""
Model
"""

class EncoderRNN(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)

    def forward(self, input, hidden):
        embedded = self.embedding(input).view(1, 1, -1)
        output = embedded
        output, hidden = self.gru(output, hidden)
        return output, hidden

    def initHidden(self):
        return th.zeros(1, 1, self.hidden_size, device=device)


class AttnDecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size, dropout_p=0.1, max_length=MAX_LENGTH):
        super(AttnDecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.dropout_p = dropout_p
        self.max_length = max_length

        self.embedding = nn.Embedding(self.output_size, self.hidden_size)
        self.attn = nn.Linear(self.hidden_size * 2, self.max_length)
        self.attn_combine = nn.Linear(self.hidden_size * 2, self.hidden_size)
        self.dropout = nn.Dropout(self.dropout_p)
        self.gru = nn.GRU(self.hidden_size, self.hidden_size)
        self.out = nn.Linear(self.hidden_size, self.output_size)

    def forward(self, input, hidden, encoder_outputs):
        embedded = self.embedding(input).view(1, 1, -1)
        embedded = self.dropout(embedded)

        attn_weights = nn.functional.softmax(
            self.attn(th.cat((embedded[0], hidden[0]), 1)), dim=1)
        attn_applied = th.bmm(attn_weights.unsqueeze(0), encoder_outputs.unsqueeze(0))

        output = th.cat((embedded[0], attn_applied[0]), 1)
        output = self.attn_combine(output).unsqueeze(0)

        output = nn.functional.relu(output)
        output, hidden = self.gru(output, hidden)

        output = nn.functional.log_softmax(self.out(output[0]), dim=1)
        return output, hidden, attn_weights

    def initHidden(self):
        return th.zeros(1, 1, self.hidden_size, device=device)

In [10]:
# Test the model with a single forward pass
hidden_size = 256
encoder = EncoderRNN(MAX_LENGTH, hidden_size).to(device)
decoder = AttnDecoderRNN(hidden_size, MAX_LENGTH, dropout_p=0.1).to(device)

# dl = DataLoader(dataset)
# dl_it = iter(dl)

# Take input from the dataset
input_tensor, target_tensor, rating_tensor = dataset[data_idx]
# print(input_tensor.get_device())

# Create the encoder hidden state
encoder_hidden = encoder.initHidden()

# Initialise the encoder output
encoder_outputs = th.zeros(MAX_LENGTH, encoder.hidden_size, device=device)

# Run the encoder
for token in input_tensor:
    # print(token)
    encoder_output, encoder_hidden = encoder(token, encoder_hidden)
    encoder_outputs[token] = encoder_output[0, 0]

print(f"Encoder output shape: {encoder_outputs.shape}")
bos = th.tensor(t.bos_token_id).to(device)

# Create the decoder input
decoder_input = th.tensor([bos], device=device, dtype=th.long) # We don't use any BOS token
# Create the decoder output
decoder_output = th.zeros(MAX_LENGTH, device=device, dtype=th.long)

print(f"Decoder output: {decoder_output[0]}")



# Create the decoder hidden state
decoder_hidden = encoder_hidden

# Run the decoder
for i, word in enumerate(target_tensor):
    decoder_output, decoder_hidden, decoder_attention = decoder(decoder_input, decoder_hidden, encoder_outputs)
    topv, topi = decoder_output.topk(1)
    decoder_input = topi.squeeze().detach() # detach from history as input

    print(f"Decoder input: {decoder_input.item()}")
    print(f"Target: {word.item()}")
    print(f"t.eos_token_id: {t.eos_token_id}")

    # Append the output
    decoder_output[i] = decoder_input

    if decoder_input.item() == t.eos_token_id:
        print(f"EOS token found at {i}th iteration")
        break

print(f"Decoder output shape: {decoder_output.shape}")

# Print the output before detokenization
print(f"Output: {decoder_output.argmax(dim=1)}")

# Print the detokenized output
print(f"Detokenized output: {ReviewDataset.detokenize(dataset, decoder_output.argmax(dim=1))}")

Encoder output shape: torch.Size([50258, 256])
Decoder output: 0
Decoder input: 33730
Target: 50257
t.eos_token_id: 50256
Decoder input: 37042
Target: 6979
t.eos_token_id: 50256


IndexError: index 1 is out of bounds for dimension 0 with size 1