# Import packages

In [None]:
import os
# Debugging aid: CUDA_LAUNCH_BLOCKING=1 requests synchronous CUDA kernel launches
# so that a device-side error is reported at the call that caused it.
# It is set here, before torch is imported further down.
# NOTE(review): synchronous launches are expected to slow training - consider
# removing this once the run is stable.
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

In [None]:
!pip install transformers 
!pip install datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.23.1-py3-none-any.whl (5.3 MB)
[K     |████████████████████████████████| 5.3 MB 5.2 MB/s 
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.10.1-py3-none-any.whl (163 kB)
[K     |████████████████████████████████| 163 kB 87.0 MB/s 
[?25hCollecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 79.3 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.10.1 tokenizers-0.13.1 transformers-4.23.1
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.5.2-py3-none-any.whl (432 kB)
[K     |████████████████████████████████| 432 kB 5

In [None]:
import torch
import pandas as pd
import numpy as np
from transformers import RobertaTokenizer, AutoTokenizer
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer, RobertaForSequenceClassification
from datasets import load_dataset, ClassLabel, Value, load_metric

# Load the source dataset

In [None]:
# Fine-tune on a sample only: the first 60% of the 'Video_v1_00' train split.
train = load_dataset('amazon_us_reviews', 'Video_v1_00', split='train[:60%]') 

Downloading builder script:   0%|          | 0.00/7.45k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/195k [00:00<?, ?B/s]

Downloading and preparing dataset amazon_us_reviews/Video_v1_00 (download: 132.49 MiB, generated: 339.76 MiB, post-processed: Unknown size, total: 472.25 MiB) to /root/.cache/huggingface/datasets/amazon_us_reviews/Video_v1_00/0.1.0/17b2481be59723469538adeb8fd0a68b0ba363bbbdd71090e72c325ee6c7e563...


Downloading data:   0%|          | 0.00/139M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/380604 [00:00<?, ? examples/s]

Dataset amazon_us_reviews downloaded and prepared to /root/.cache/huggingface/datasets/amazon_us_reviews/Video_v1_00/0.1.0/17b2481be59723469538adeb8fd0a68b0ba363bbbdd71090e72c325ee6c7e563. Subsequent calls will reuse this data.


In [None]:
# Inspect the first record to see which fields a review carries.
train[0]

{'marketplace': 'US',
 'customer_id': '49033728',
 'review_id': 'R1P1G5KZ05H6RD',
 'product_id': '6302503213',
 'product_parent': '748506413',
 'product_title': 'The Night They Saved Christmas [VHS]',
 'product_category': 'Video',
 'star_rating': 5,
 'helpful_votes': 0,
 'total_votes': 0,
 'vine': 0,
 'verified_purchase': 1,
 'review_headline': 'Very satisfied!!',
 'review_body': 'Fast shipping. Pleasure to deal with. Would recommend. A+++. Thanks!',
 'review_date': '2015-08-31'}

# Preprocessing

In [None]:
# drop the columns that are not used later in the notebook
train = train.remove_columns([
    'marketplace', 'review_id', 'product_parent', 'product_title', 'product_category',
    'helpful_votes', 'total_votes', 'vine', 'verified_purchase', 'review_date',
])

In [None]:
# Confirm that only the retained columns are left on a record.
train[0]

{'customer_id': '49033728',
 'product_id': '6302503213',
 'star_rating': 5,
 'review_headline': 'Very satisfied!!',
 'review_body': 'Fast shipping. Pleasure to deal with. Would recommend. A+++. Thanks!'}

# Encoding 

In [None]:
# load the tokenizer that belongs to the "roberta-base" checkpoint
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

Downloading:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/481 [00:00<?, ?B/s]

In [None]:
# tokenize each review body on its own (a single sequence, not a sentence pair)
# every example is padded or truncated to exactly 64 tokens, a trade-off between
# run time and how much of each review is covered
# NOTE: map() is called without batched=True, so the lambda receives one example
# at a time despite its parameter being named `batch`
train_tokenized = train.map(lambda batch: tokenizer(batch['review_body'], padding='max_length', truncation=True, max_length=64))

  0%|          | 0/228362 [00:00<?, ?ex/s]

In [None]:
# rename the rating column to `labels` (the training target) and the review body to `text`
train_tokenized = train_tokenized.rename_column("star_rating", "labels")
train_tokenized = train_tokenized.rename_column("review_body", "text")

In [None]:
# convert star rating that ranging from 1-5 to labels that ranging from 0-4
def to_label(x):
    """Shift the value stored under 'labels' down by one.

    Star ratings run from 1 to 5 while class ids have to run from 0 to 4.
    The example is updated in place and the same object is returned.
    """
    shifted = x['labels'] - 1
    x['labels'] = shifted
    return x

# apply the shift to every example
# NOTE: this cell is not idempotent - running it again subtracts 1 a second time
train_tokenized = train_tokenized.map(to_label)

  0%|          | 0/228362 [00:00<?, ?ex/s]

In [None]:
# when indexing, return only the model inputs and the target, as torch tensors
train_tokenized.set_format("torch", columns=["input_ids",  "attention_mask", "labels"])

In [None]:
# check one formatted example: label, token ids and attention mask
train_tokenized[0]

{'labels': tensor(4),
 'input_ids': tensor([    0, 35515,  6738,     4, 18486, 24669,     7,   432,    19,     4,
         11258,  5940,     4,    83, 42964, 27079,  4557,   328,     2,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1]),
 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])}

# Fine-tuning

In [None]:
# load the pretrained roberta-base weights into a sequence classifier with
# 5 output classes (one per star rating after the 1-5 -> 0-4 shift)

model = RobertaForSequenceClassification.from_pretrained("roberta-base", num_labels=5)

Downloading:   0%|          | 0.00/501M [00:00<?, ?B/s]

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.dense.weight', 'lm_head.dense.bias', 'roberta.pooler.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'roberta.pooler.dense.bias', 'lm_head.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.weight', 'classifie

In [None]:
# set training arguments explicitly where needed; anything not listed keeps its default
training_args = TrainingArguments(
    output_dir='./output',          # output directory (checkpoints are written here)
    num_train_epochs=5,              # total number of training epochs
    per_device_train_batch_size=64,  # batch size per device during training
    per_device_eval_batch_size=64,   # batch size for evaluation / prediction
    warmup_steps=0,                # number of warmup steps for learning rate scheduler
    learning_rate=5e-5,               # learning rate
    logging_dir='./logs',            # directory for storing logs
    logging_steps=1000,              # report the training loss every 1000 steps
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [None]:
def compute_metrics(eval_pred):
    """Score predictions with the module-level ``metric``.

    ``eval_pred`` unpacks into ``(logits, labels)``; the predicted class of
    each example is the index of its largest logit along the last axis.
    Returns whatever ``metric.compute`` returns.
    """
    logits, labels = eval_pred
    predicted_classes = np.argmax(logits, axis=-1)
    scores = metric.compute(predictions=predicted_classes, references=labels)
    return scores

In [None]:
# accuracy metric looked up by compute_metrics at call time
metric = load_metric("accuracy")

In [None]:
# Create a Trainer with the model, the training arguments, the training set and
# the evaluation function. No eval_dataset is passed because no held-out split
# exists at this point; compute_metrics (defined above, previously never wired
# in) is used whenever the trainer evaluates or predicts on labelled data.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tokenized,
    compute_metrics=compute_metrics,
)

In [None]:
# clean up gpu cache before training
import gc

# run Python garbage collection first so unreferenced objects are dropped ...
gc.collect()

# ... then ask PyTorch to release the GPU memory it is caching
torch.cuda.empty_cache()

In [None]:
# run the fine-tuning loop configured by training_args
trainer.train()    
# trainer.train(resume_from_checkpoint=True) # True if already trained, to save time by continuing on a checkpoint

The following columns in the training set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: text, review_headline, product_id, customer_id. If text, review_headline, product_id, customer_id are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 228362
  Num Epochs = 5
  Instantaneous batch size per device = 64
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 1
  Total optimization steps = 17845


Step,Training Loss
1000,0.8908
2000,0.8368
3000,0.8202
4000,0.7843
5000,0.7494
6000,0.748
7000,0.7398
8000,0.6763
9000,0.6733
10000,0.6605


Saving model checkpoint to ./output/checkpoint-500
Configuration saved in ./output/checkpoint-500/config.json
Model weights saved in ./output/checkpoint-500/pytorch_model.bin
Saving model checkpoint to ./output/checkpoint-1000
Configuration saved in ./output/checkpoint-1000/config.json
Model weights saved in ./output/checkpoint-1000/pytorch_model.bin
Saving model checkpoint to ./output/checkpoint-1500
Configuration saved in ./output/checkpoint-1500/config.json
Model weights saved in ./output/checkpoint-1500/pytorch_model.bin
Saving model checkpoint to ./output/checkpoint-2000
Configuration saved in ./output/checkpoint-2000/config.json
Model weights saved in ./output/checkpoint-2000/pytorch_model.bin
Saving model checkpoint to ./output/checkpoint-2500
Configuration saved in ./output/checkpoint-2500/config.json
Model weights saved in ./output/checkpoint-2500/pytorch_model.bin
Saving model checkpoint to ./output/checkpoint-3000
Configuration saved in ./output/checkpoint-3000/config.json
M

TrainOutput(global_step=17845, training_loss=0.6653989886462638, metrics={'train_runtime': 4826.9357, 'train_samples_per_second': 236.55, 'train_steps_per_second': 3.697, 'total_flos': 3.755386578085248e+16, 'train_loss': 0.6653989886462638, 'epoch': 5.0})

In [None]:
# save the fine-tuned model (config and weights) to the "Roberta-senti" directory
# NOTE: only the model is saved here; the tokenizer is loaded from "roberta-base" where needed

model.save_pretrained("Roberta-senti")

Configuration saved in Roberta-senti/config.json
Model weights saved in Roberta-senti/pytorch_model.bin


# Predict on the whole dataset with the fine-tuned model

In [None]:
# load the entire dataset
# NOTE: this includes the first 60% that was used for fine-tuning above
dataset = load_dataset('amazon_us_reviews', 'Video_v1_00', split='train')

# drop the columns that are not used later in the notebook
dataset = dataset.remove_columns(['marketplace', 'review_id', 'product_parent', 'product_title', 'product_category', \
                      'helpful_votes', 'total_votes', 'vine', 'verified_purchase', 'review_date'])
    
# reload the roberta-base tokenizer (the tokenizer itself is not fine-tuned)
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

# tokenize each review body on its own (a single sequence, not a sentence pair),
# padded or truncated to 64 tokens - the same settings used for the training sample
dataset_tokenized = dataset.map(lambda batch: tokenizer(batch['review_body'], padding='max_length', truncation=True, max_length=64))
dataset_tokenized = dataset_tokenized.rename_column("star_rating", "labels")
dataset_tokenized = dataset_tokenized.rename_column("review_body", "text")

# convert star rating that ranging from 1-5 to labels that ranging from 0-4
# (relies on to_label defined in an earlier cell)
dataset_tokenized = dataset_tokenized.map(to_label)

# when indexing, return only the model inputs and the target, as torch tensors
dataset_tokenized.set_format("torch", columns=["input_ids", "attention_mask", "labels"])

loading file vocab.json from cache at /root/.cache/huggingface/hub/models--roberta-base/snapshots/ff46155979338ff8063cdad90908b498ab91b181/vocab.json
loading file merges.txt from cache at /root/.cache/huggingface/hub/models--roberta-base/snapshots/ff46155979338ff8063cdad90908b498ab91b181/merges.txt
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at None
loading file tokenizer_config.json from cache at None
loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--roberta-base/snapshots/ff46155979338ff8063cdad90908b498ab91b181/config.json
Model config RobertaConfig {
  "_name_or_path": "roberta-base",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_

  0%|          | 0/380604 [00:00<?, ?ex/s]

  0%|          | 0/380604 [00:00<?, ?ex/s]

In [None]:
# run inference on every review; element 0 of the result is used below as the
# logits and element 1 as the reference labels
pred_output = trainer.predict(dataset_tokenized)

The following columns in the test set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: text, review_headline, product_id, customer_id. If text, review_headline, product_id, customer_id are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 380604
  Batch size = 64


In [None]:
# get prediction and evaluate
# pred: class id with the largest score per example; truth: the reference labels
# NOTE(review): the evaluated data contains the 60% sample used for fine-tuning,
# so the scores below are not a held-out estimate
pred = pred_output[0].argmax(axis=1)
truth = pred_output[1]
accuracy = load_metric('accuracy')
f1 = load_metric('f1')
accuracy.compute(predictions=pred, references=truth)

Downloading builder script:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

{'accuracy': 0.7869123813727654}

In [None]:
# F1 over the five classes, combined with average='weighted'
f1.compute(predictions=pred, references=truth, average='weighted')

{'f1': 0.779370249430155}

# Save the dataset augmented with the ratings predicted by sentiment analysis

In [None]:
# expand the original dataset with the predicted rating
# dataset[:] materialises all columns of the untokenized dataset;
# adding 1 maps the predicted class ids 0-4 back to star ratings 1-5
data = dataset[:]
data['senti_rating_finetune'] = pred + 1

In [None]:
df = pd.DataFrame.from_dict(data)

# extract variables needed for CF recommender
# Selecting the columns and renaming them in one chain returns a new frame, so
# no in-place rename is performed on a column-selected frame (that in-place
# rename is what raised the SettingWithCopyWarning).
df = (
    df[['customer_id', 'product_id', 'star_rating', 'senti_rating_finetune']]
    .rename(columns={'customer_id': 'user', 'product_id': 'item', 'star_rating': 'rating'})
)
df.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


Unnamed: 0,user,item,rating,senti_rating_finetune
0,49033728,6302503213,5,5
1,17857748,B000059PET,5,5
2,25551507,0788812807,4,5
3,21025041,6302509939,5,5
4,40943563,B00JENS2BI,3,3


In [None]:
# save as a csv file one directory above the notebook's working directory
# NOTE: no index argument is given, so the DataFrame index is written as well
df.to_csv('../amazon_video_roberta.csv')