In [2]:
from transformers import GPT2Model, GPT2Tokenizer, GPT2Config

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch as th
import polars as pl
# import dask.dataframe as dd

# Select the compute device in priority order: Apple MPS, then CUDA, then CPU.
if th.backends.mps.is_available():
    device = th.device("mps")
elif th.cuda.is_available():
    device = th.device("cuda")
else:
    device = th.device("cpu")

In [3]:
# Sample text used to inspect how the GPT-2 tokenizer splits a string containing a quote.
# NOTE(review): `input` shadows the Python builtin of the same name; the name is kept
# because other cells may reference it.
input = "a\"a"
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

vocab = tokenizer.get_vocab()
print("Vocab size: ", tokenizer.vocab_size)
# Output the vocab to a file. The context manager closes (and therefore flushes) the
# handle as soon as the write is done, so the dump is complete on disk and no file
# descriptor stays open for the lifetime of the kernel.
with open("vocab.txt", "w", encoding="utf-8") as f:
    f.write(str(vocab))

# Tokenize input
tokens = tokenizer(input)
print("Tokens: ", tokens)

Vocab size:  50257
Tokens:  {'input_ids': [64, 1, 64], 'attention_mask': [1, 1, 1]}


In [4]:
# Build a GPT-2 model from the default configuration. The model is constructed from
# the config object only; no pretrained checkpoint is loaded in this cell.
config = GPT2Config()
model = GPT2Model(config)

# Wrap the token ids from the tokenizer cell in a leading batch dimension of size one.
input_tensor = th.tensor(tokens['input_ids']).unsqueeze(0)

output = model(input_tensor)

# Length of the innermost axis of the first model output (vector size per token).
output[0].shape[-1]

768

In [28]:
# paths
data_dir = "datasets/"
data_all = "All_Amazon_Review_5.json"
data_video = "Amazon_Instant_Video_5.json"


def _max_text_len(texts):
    """Return the length of the longest entry in a text Series as a plain int.

    Missing values are ignored instead of raising (``len(NaN)`` is a TypeError),
    and a Series with no usable entries yields 0.
    """
    lengths = texts.str.len().dropna()
    return int(lengths.max()) if not lengths.empty else 0


# read data with pandas (JSON Lines: one record per line)
df = pd.read_json(data_dir + data_video, lines=True)

# lower case all headers; assign a concrete Index rather than a one-shot map() iterator
df.columns = df.columns.str.lower()

# keep only the review text, rating, and summary
df = df[['reviewtext', 'overall', 'summary']]
print(df.head())

# find max length (in characters) of review text
max_review_len = _max_text_len(df['reviewtext'])
print("\nMax length of review text: ", max_review_len)
# find max length (in characters) of summary
max_summary_len = _max_text_len(df['summary'])
print("Max length of summary: ", max_summary_len)

                                          reviewtext  overall  \
0  I had big expectations because I love English ...        2   
1  I highly recommend this series. It is a must f...        5   
2  This one is a real snoozer. Don't believe anyt...        1   
3  Mysteries are interesting.  The tension betwee...        4   
4  This show always is excellent, as far as briti...        5   

                          summary  
0      A little bit boring for me  
1           Excellent Grown Up TV  
2           Way too boring for me  
3     Robson Green is mesmerizing  
4  Robson green and great writing  
I had big expectations because I love English TV, in particular Investigative and detective stuff but this guy is really boring. It didn't appeal to me at all.
28

Max length of review text:  18152
Max length of summary:  151


In [None]:
""" Prune a dataframe to only contain the columns we need """
# Columns that survive the pruning step.
pruned_columns = ["reviewtext", "summary", "overall"]

# Discard every row that has a missing value in any of those columns.
df = df.dropna(subset=pruned_columns)

# Write the pruned columns as JSON Lines (one record per line).
# NOTE(review): the output file is named after "Arts_Crafts_and_Sewing" while the
# loading cell reads the Instant Video dump - confirm the intended file name.
df[pruned_columns].to_json(
    "Pruned_Arts_Crafts_and_Sewing.json",
    orient="records",
    lines=True,
)

In [60]:
# Load the combined review dump (JSON Lines: one record per line) into a DataFrame.
all_reviews_path = data_dir + data_all
data = pd.read_json(all_reviews_path, lines=True)

TypeError: read_json() got an unexpected keyword argument 'skiprows'

In [None]:
class ReviewDataset(th.utils.data.Dataset):
    """Torch dataset of (review, rating, summary) triples read from a JSON Lines file.

    The file is loaded with pandas, column names are lower-cased, and rows with a
    missing ``reviewtext``, ``overall`` or ``summary`` are dropped. A word-level
    vocabulary is built over the review and summary text: text is lower-cased, words
    stay whole, and every punctuation symbol ('.', '!', '?', ...) becomes its own
    token. Index 0 is reserved for padding and index 1 for unknown tokens.

    Text is encoded lazily: only the raw strings are kept in memory and the conversion
    to padded index tensors happens in ``__getitem__``.

    Args:
        path: Path to a JSON Lines file with ``reviewText``, ``overall`` and
            ``summary`` fields (any letter case).
        max_review_len: Number of tokens every review is padded/truncated to.
            Defaults to the longest review in the file.
        max_summary_len: Number of tokens every summary is padded/truncated to.
            Defaults to the longest summary in the file.
        target_device: Device the returned tensors are moved to. Defaults to the
            notebook-level ``device``.
    """

    PAD_TOKEN = "<pad>"
    UNK_TOKEN = "<unk>"

    def __init__(self, path, max_review_len=None, max_summary_len=None, target_device=None):
        import re  # local import: the notebook's import cell does not bring in `re`

        # A run of word characters is one token; any other non-space symbol is a
        # token of its own, so "boring." becomes ["boring", "."].
        self._token_pattern = re.compile(r"\w+|[^\w\s]")

        frame = pd.read_json(path, lines=True)
        frame.columns = frame.columns.str.lower()
        frame = frame[["reviewtext", "overall", "summary"]].dropna()

        self.reviews = frame["reviewtext"].astype(str).tolist()
        self.summaries = frame["summary"].astype(str).tolist()
        self.ratings = frame["overall"].astype(float).tolist()

        # Single pass over the text: grow the vocabulary in first-seen order and
        # track the longest tokenized review/summary for the default pad lengths.
        self.vocab = {self.PAD_TOKEN: 0, self.UNK_TOKEN: 1}
        longest_review = 0
        longest_summary = 0
        for review_text, summary_text in zip(self.reviews, self.summaries):
            review_tokens = self._tokenize(review_text)
            summary_tokens = self._tokenize(summary_text)
            longest_review = max(longest_review, len(review_tokens))
            longest_summary = max(longest_summary, len(summary_tokens))
            for token in review_tokens + summary_tokens:
                self.vocab.setdefault(token, len(self.vocab))

        self.max_review_len = longest_review if max_review_len is None else max_review_len
        self.max_summary_len = longest_summary if max_summary_len is None else max_summary_len
        # The notebook-level `device` is only looked up when no device is passed in.
        self.device = device if target_device is None else target_device

    def _tokenize(self, text):
        """Split lower-cased text into word and punctuation tokens."""
        return self._token_pattern.findall(text.lower())

    def _encode(self, text, length):
        """Map text to a 1-D long tensor of exactly `length` vocabulary indices."""
        unknown = self.vocab[self.UNK_TOKEN]
        indices = [self.vocab.get(token, unknown) for token in self._tokenize(text)][:length]
        indices += [self.vocab[self.PAD_TOKEN]] * (length - len(indices))
        return th.tensor(indices, dtype=th.long)

    def __len__(self):
        """Return the number of usable (non-missing) reviews."""
        return len(self.reviews)

    def __getitem__(self, idx):
        """Return the padded review, the rating and the padded summary at `idx`."""
        # lazy loading: text is tokenized and padded only when the item is requested
        review = self._encode(self.reviews[idx], self.max_review_len)
        rating = th.tensor(self.ratings[idx], dtype=th.float32)
        summary = self._encode(self.summaries[idx], self.max_summary_len)

        # move tensors to device
        review = review.to(self.device)
        rating = rating.to(self.device)
        summary = summary.to(self.device)

        return review, rating, summary