## Fine-Tuning T5 for Product Review Generation

In [2]:
## required libraries to install

!pip install numpy==1.25.1
!pip install transformers[torch]
!pip install datasets===2.13.1

Collecting numpy==1.25.1
  Downloading numpy-1.25.1-cp310-cp310-win_amd64.whl.metadata (5.7 kB)
Downloading numpy-1.25.1-cp310-cp310-win_amd64.whl (15.0 MB)
   ---------------------------------------- 0.0/15.0 MB ? eta -:--:--
   ---------------------------------------- 0.1/15.0 MB 3.8 MB/s eta 0:00:04
   - -------------------------------------- 0.5/15.0 MB 3.5 MB/s eta 0:00:05
   -- ------------------------------------- 1.1/15.0 MB 5.7 MB/s eta 0:00:03
   ---- ----------------------------------- 1.8/15.0 MB 7.7 MB/s eta 0:00:02
   ---- ----------------------------------- 1.8/15.0 MB 6.5 MB/s eta 0:00:03
   ----- ---------------------------------- 2.0/15.0 MB 6.8 MB/s eta 0:00:02
   ------ --------------------------------- 2.5/15.0 MB 6.6 MB/s eta 0:00:02
   ------- -------------------------------- 3.0/15.0 MB 7.0 MB/s eta 0:00:02
   -------- ------------------------------- 3.1/15.0 MB 7.2 MB/s eta 0:00:02
   ---------- ----------------------------- 3.8/15.0 MB 7.2 MB/s eta 0:00:02
   

  You can safely remove it manually.
ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
gensim 4.3.0 requires FuzzyTM>=0.4.0, which is not installed.
numba 0.56.4 requires numpy<1.24,>=1.18, but you have numpy 1.25.1 which is incompatible.


Collecting datasets===2.13.1
  Downloading datasets-2.13.1-py3-none-any.whl.metadata (20 kB)
Collecting pyarrow>=8.0.0 (from datasets===2.13.1)
  Downloading pyarrow-16.1.0-cp310-cp310-win_amd64.whl.metadata (3.1 kB)
Collecting xxhash (from datasets===2.13.1)
  Downloading xxhash-3.4.1-cp310-cp310-win_amd64.whl.metadata (12 kB)
Collecting multiprocess (from datasets===2.13.1)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting aiohttp (from datasets===2.13.1)
  Downloading aiohttp-3.9.5-cp310-cp310-win_amd64.whl.metadata (7.7 kB)
Collecting huggingface-hub<1.0.0,>=0.11.0 (from datasets===2.13.1)
  Downloading huggingface_hub-0.23.2-py3-none-any.whl.metadata (12 kB)
Collecting aiosignal>=1.1.2 (from aiohttp->datasets===2.13.1)
  Downloading aiosignal-1.3.1-py3-none-any.whl.metadata (4.0 kB)
Collecting frozenlist>=1.1.1 (from aiohttp->datasets===2.13.1)
  Downloading frozenlist-1.4.1-cp310-cp310-win_amd64.whl.metadata (12 kB)
Collecting multidict<7.0,>=4.5 

In [3]:
## modules to utilize the T5 model

import numpy as np
import pandas as pd
from datasets import load_dataset, Dataset
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
from transformers import DataCollatorWithPadding

In [4]:
dataset_category = "Software" 
meta_ds = load_dataset("McAuley-Lab/Amazon-Reviews-2023", f"raw_meta_{dataset_category}", split='full').to_pandas()[['parent_asin', 'title']]
review_ds = load_dataset("McAuley-Lab/Amazon-Reviews-2023", f"raw_review_{dataset_category}", split='full').to_pandas()[['parent_asin', 'rating', 'text', 'verified_purchase']]

ds = meta_ds.merge(review_ds, on='parent_asin', how='inner').drop(columns="parent_asin")
ds = ds.rename(columns={"rating":"star_rating", "title":"product_title", "text":"review_body"})

ds = ds[ds['verified_purchase'] & (ds['review_body'].map(len) > 100)].sample(100_000)
ds

Downloading builder script:   0%|          | 0.00/39.6k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/19.7k [00:00<?, ?B/s]

Downloading and preparing dataset amazon-reviews-2023/raw_meta_Software to C:/Users/neo/.cache/huggingface/datasets/McAuley-Lab___amazon-reviews-2023/raw_meta_Software/0.0.0/16b76e0823d73bb8cff1e9c5e3e37dbc46ae3daee380417ae141f5e67d3ea8e8...


Downloading data:   0%|          | 0.00/256M [00:00<?, ?B/s]

Generating full split: 0 examples [00:00, ? examples/s]

Dataset amazon-reviews-2023 downloaded and prepared to C:/Users/neo/.cache/huggingface/datasets/McAuley-Lab___amazon-reviews-2023/raw_meta_Software/0.0.0/16b76e0823d73bb8cff1e9c5e3e37dbc46ae3daee380417ae141f5e67d3ea8e8. Subsequent calls will reuse this data.
Downloading and preparing dataset amazon-reviews-2023/raw_review_Software to C:/Users/neo/.cache/huggingface/datasets/McAuley-Lab___amazon-reviews-2023/raw_review_Software/0.0.0/16b76e0823d73bb8cff1e9c5e3e37dbc46ae3daee380417ae141f5e67d3ea8e8...


Downloading data:   0%|          | 0.00/1.87G [00:00<?, ?B/s]

Generating full split: 0 examples [00:00, ? examples/s]

Dataset amazon-reviews-2023 downloaded and prepared to C:/Users/neo/.cache/huggingface/datasets/McAuley-Lab___amazon-reviews-2023/raw_review_Software/0.0.0/16b76e0823d73bb8cff1e9c5e3e37dbc46ae3daee380417ae141f5e67d3ea8e8. Subsequent calls will reuse this data.


  table = cls._concat_blocks(blocks, axis=0)


Unnamed: 0,product_title,star_rating,review_body,verified_purchase
2068009,ES File Explorer File Manager,3.0,It seems to only be capable of recognizing a f...,True
4343186,Bubble Witch 2 Saga,5.0,This game has change over the years and its s...,True
2496416,[Old Version] TurboTax Deluxe Federal + E-File...,5.0,This is the first time in a long time that I r...,True
1975936,GraveStompers: Kid Zombies Save Eradiction!,4.0,"Overall a good game although it is a ""freemium...",True
2666854,Max,1.0,This has been an awful experience. I love all ...,True
...,...,...,...,...
145526,FileMaker Pro 18 Advanced Mac/Win V18,1.0,I could not import or transfer my older record...,True
3591585,Scribblenauts Remix,3.0,At 99 cents it's a great purchase for your kid...,True
221449,Dr. Panda Veggie Garden,4.0,My girls love all of the Dr. Panda games. Thes...,True
2515441,Minion Rush: Running game,5.0,I absolutely loooove this game!!! it's very ad...,True


In [7]:
# loading the dataset
dataset = Dataset.from_pandas(ds)

# encoding the 'star_rating' column
dataset = dataset.class_encode_column("star_rating")

# Splitting the dataset into training and testing sets
dataset = dataset.train_test_split(test_size=0.1, seed=42, stratify_by_column="star_rating")

train_dataset = dataset['train']
test_dataset = dataset['test']
print(train_dataset[0])

Stringifying the column:   0%|          | 0/100000 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/100000 [00:00<?, ? examples/s]

{'product_title': 'Photo Lab PRO photo editor', 'star_rating': 4, 'review_body': "Love it it's awesome I can spend hours editing pictures. I highly recommend  this app to any photographer  too", 'verified_purchase': True, '__index_level_0__': 4357427}


In [9]:
MODEL_NAME = 't5-base'
tokenizer = T5Tokenizer.from_pretrained('t5-base')

ImportError: 
T5Tokenizer requires the SentencePiece library but it was not found in your environment. Checkout the instructions on the
installation page of its repo: https://github.com/google/sentencepiece#installation and follow the ones
that match your environment. Please note that you may need to restart your runtime after installation.


In [None]:
# Defining the function to preprocess the data
def preprocess_data(examples):
    examples['prompt'] = [f"review: {example['product_title']}, {example['star_rating']} Stars!" for example in examples]
    examples['response'] = [f"{example['review_headline']} {example['review_body']}" for example in examples]

    inputs = tokenizer(examples['prompt'], padding='max_length', truncation=True, max_length=128)
    targets = tokenizer(examples['response'], padding='max_length', truncation=True, max_length=128)

    # Set -100 at the padding positions of target tokens
    target_input_ids = []
    for ids in targets['input_ids']:
        target_input_ids.append([id if id != tokenizer.pad_token_id else -100 for id in ids])

    inputs.update({'labels': target_input_ids})
    return inputs

In [None]:
train_dataset = train_dataset.map(preprocess_data, batched=True)
test_dataset = test_dataset.map(preprocess_data, batched=True)

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
# proceeding to fine-tune the T5 model on our dataset

model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME)

TRAINING_OUTPUT = "./models/t5_fine_tuned_reviews"
training_args = TrainingArguments(
    output_dir=TRAINING_OUTPUT,
    num_train_epochs=3,
    per_device_train_batch_size=12,
    per_device_eval_batch_size=12,
    save_strategy='epoch',
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    data_collator=data_collator,
)

trainer.train()

In [10]:
trainer.save_model(TRAINING_OUTPUT) # demonstrating on how to use it by saving it for later use

NameError: name 'trainer' is not defined

In [None]:
# Loading the fine-tuned model
model = T5ForConditionalGeneration.from_pretrained(TRAINING_OUTPUT)

# or get it directly trained from here:
# model = T5ForConditionalGeneration.from_pretrained("TheFuzzyScientist/T5-base_Amazon-product-reviews")

In [None]:
## we use our fine-tuned model to generate reviews for new products.
# Defining the function to generate reviews

def generate_review(text):
    inputs = tokenizer("review: " + text, return_tensors='pt', max_length=512, padding='max_length', truncation=True)
    outputs = model.generate(inputs['input_ids'], max_length=128, no_repeat_ngram_size=3, num_beams=6, early_stopping=True)
    summary = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return summary

In [None]:
# Generating reviews for random products
random_products = test_dataset.shuffle(42).select(range(10))['product_title']

print(generate_review(random_products[0] + ", 3 Stars!"))
print(generate_review(random_products[1] + ", 5 Stars!"))
print(generate_review(random_products[2] + ", 2 Stars!"))