<a href="https://colab.research.google.com/github/Iispar/review-summary-API/blob/main/BERT-finetuned-model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [26]:
!pip3 install -q transformers datasets evaluate
!pip install optuna
import datasets
import numpy as np
import transformers
from transformers import AutoTokenizer
from transformers import DataCollatorWithPadding
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
import evaluate
import optuna

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


# Preprocessing

The dataset includes reviews from multiple languages so we only import the english ones. The dataset also includes alot of useless data for us, we only need the reviews and their ratings so lets process everything else out.

In [33]:
dataset = datasets.load_dataset('amazon_reviews_multi', name='en'); # imports the dataset.
# check it works
print(dataset);



  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['review_id', 'product_id', 'reviewer_id', 'stars', 'review_body', 'review_title', 'language', 'product_category'],
        num_rows: 200000
    })
    validation: Dataset({
        features: ['review_id', 'product_id', 'reviewer_id', 'stars', 'review_body', 'review_title', 'language', 'product_category'],
        num_rows: 5000
    })
    test: Dataset({
        features: ['review_id', 'product_id', 'reviewer_id', 'stars', 'review_body', 'review_title', 'language', 'product_category'],
        num_rows: 5000
    })
})


In [34]:
dataset = dataset.shuffle() # shuffle the dataset for safety.
dataset = dataset.remove_columns(['review_id', 'product_id', 'reviewer_id', 'language', 'product_category']) # removes everything that we don't need
dataset = dataset.rename_column('stars', 'label') # rename stars to label so it is a bit more understandable
# an error was coming up because of the labels were 1-5 and not 0-4 so let's change that for all.
# at the same time lets add the title to the start of the review with an :.

def addTitle_and_changeLables(example):
  example['label'] = example['label'] - 1; # lower the label by one so we get 0-4
  example['review_body'] = f"{example['review_title']}: {example['review_body']}"; # add title to review body
  return example # return the item
dataset = dataset.map(addTitle_and_changeLables) # map the function to all.
dataset = dataset.remove_columns(['review_title']) # now we can also remove the title

# let's check that it worked.
print(dataset)
print(dataset['train'][3]) 

Map:   0%|          | 0/200000 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['label', 'review_body'],
        num_rows: 200000
    })
    validation: Dataset({
        features: ['label', 'review_body'],
        num_rows: 5000
    })
    test: Dataset({
        features: ['label', 'review_body'],
        num_rows: 5000
    })
})
{'label': 0, 'review_body': "Sometimes hard to type through: Didn't stay on , wouldn't buy again"}


# Tokenization and padding

In [35]:
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased') # get the basic AutoTokenizer 
data_collator = DataCollatorWithPadding(tokenizer=tokenizer) # get the data collator for the padding and set the tokenizer as ours.

In [36]:
def preprocess_function(examples):
    return tokenizer(examples['review_body'], truncation=True) # tokenizes one example

In [37]:
dset_tokenized = dataset.map(preprocess_function, batched=True) # tokenize the whole dataset with map
print(dset_tokenized['train'][0]) # lets check that it worked

Map:   0%|          | 0/200000 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

{'label': 1, 'review_body': 'HUGE: These clickers are HUGE. I can only wear the smallest one, the other two look ridiculous.', 'input_ids': [101, 4121, 1024, 2122, 11562, 2545, 2024, 4121, 1012, 1045, 2064, 2069, 4929, 1996, 10479, 2028, 1010, 1996, 2060, 2048, 2298, 9951, 1012, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [38]:
model = AutoModelForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=5) # load the bert model with weights

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_projector.weight', 'vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias', 'pre_classifier

# Fine tuning the BERT model for our classification

In [39]:
# evaluation
accuracy = evaluate.load('accuracy');
def compute_accuracy(outputs_and_labels):
    outputs, labels = outputs_and_labels;
    predictions = np.argmax(outputs, axis=-1); #pick the index of the "winning" label
    return accuracy.compute(predictions=predictions, references=labels); # calc accuracy

In [None]:


# Training params. We optimize these later
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy = 'steps',
    logging_strategy = 'steps',
    eval_steps = 500,
    logging_steps = 500,
    learning_rate=1e-4,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    max_steps = 20000,
    num_train_epochs=5,
    weight_decay=0.01,
  )

early_stopping = transformers.EarlyStoppingCallback(5); # stop training if the eval loss is not getting better.

# Set the trainer
trainer = Trainer(
    model = model,
    args = training_args,
    train_dataset = dset_tokenized['train'],
    eval_dataset = dset_tokenized['test'],
    tokenizer = tokenizer,
    data_collator = data_collator,
    compute_metrics = compute_accuracy,
)

# train the model
trainer.train()

# Hyperparam optimization

This happens over a couple of days so you won't see all the results...

In [40]:
# Used optuna for optimization

def objective(trial):
    # Define the search space for hyperparameters
    learning_rate = trial.suggest_float("learning_rate", 1e-7, 1e-3, log=True)
    batch_size = trial.suggest_categorical("batch_size", [8, 16, 64, 128])
    epochs=trial.suggest_int('num_train_epochs', low = 2,high = 6),

    # params
    trainer_args = transformers.TrainingArguments(
        "mlp_checkpoints",
        evaluation_strategy = "steps",
        logging_strategy = "steps",
        eval_steps = 500,
        logging_steps = 500,
        learning_rate = learning_rate,
        max_steps = 30000,
        load_best_model_at_end = True,
        per_device_train_batch_size = batch_size,
        per_device_eval_batch_size = batch_size,
        num_train_epochs = epochs
    )

    # the model
    mlp = model
    early_stopping = transformers.EarlyStoppingCallback(5); # stop training if the eval loss is not getting better.

    # train a model
    trainer = transformers.Trainer(
        model = mlp,
        args = trainer_args,
        train_dataset = dset_tokenized["train"],
        eval_dataset = dset_tokenized["validation"],
        compute_metrics = compute_accuracy,
        data_collator = data_collator,
        callbacks = [early_stopping]
    )

    trainer.train()
    eval_results = trainer.evaluate()
    return eval_results["eval_accuracy"] # return the best result.

study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=20)

[32m[I 2023-04-24 15:33:28,746][0m A new study created in memory with name: no-name-07897296-f2fc-45a1-9864-9344d5d35c7f[0m
You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,Accuracy
500,1.5072,1.3316,0.3848
1000,1.3625,1.376773,0.3692
1500,1.5437,1.618834,0.2
2000,1.6127,1.615216,0.2
2500,1.6112,1.609785,0.2
3000,1.6104,1.610209,0.2


[32m[I 2023-04-24 15:41:07,138][0m Trial 0 finished with value: 0.3848 and parameters: {'learning_rate': 0.00021164622620120868, 'batch_size': 8, 'num_train_epochs': 5}. Best is trial 0 with value: 0.3848.[0m


Step,Training Loss,Validation Loss,Accuracy
500,1.199,1.146001,0.4962
1000,1.1182,1.082682,0.531
1500,1.0753,1.057752,0.553
2000,1.0502,1.023192,0.575
2500,1.0227,1.040394,0.5506
3000,1.0165,1.004728,0.5902
3500,0.9939,1.018587,0.592
4000,0.9993,0.980134,0.5948
4500,1.0044,0.968593,0.6012
5000,0.9833,0.975104,0.594


[33m[W 2023-04-24 16:47:18,506][0m Trial 1 failed with parameters: {'learning_rate': 4.142055909050997e-05, 'batch_size': 16, 'num_train_epochs': 3} because of the following error: KeyboardInterrupt().[0m
Traceback (most recent call last):
  File "/usr/local/lib/python3.9/dist-packages/optuna/study/_optimize.py", line 200, in _run_trial
    value_or_values = func(trial)
  File "<ipython-input-40-3f32e04b177e>", line 39, in objective
    trainer.train()
  File "/usr/local/lib/python3.9/dist-packages/transformers/trainer.py", line 1662, in train
    return inner_training_loop(
  File "/usr/local/lib/python3.9/dist-packages/transformers/trainer.py", line 2004, in _inner_training_loop
    self.control = self.callback_handler.on_step_end(args, self.state, self.control)
  File "/usr/local/lib/python3.9/dist-packages/transformers/trainer_callback.py", line 375, in on_step_end
    return self.call_event("on_step_end", args, state, control)
  File "/usr/local/lib/python3.9/dist-packages/tran

KeyboardInterrupt: ignored