<a href="https://colab.research.google.com/github/Iispar/review-summary-API/blob/main/BERT-finetuned-model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip3 install -q transformers datasets evaluate
!pip install optuna
import datasets
import numpy as np
import transformers
from transformers import AutoTokenizer
from transformers import DataCollatorWithPadding
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
import evaluate
import optuna

In [None]:
# Names handed to `from_pretrained` (checkpoint) and `load_dataset` (dataset).
# NOTE: `model` is rebound to the instantiated classifier further down the notebook.
model = "distilbert-base-cased"
dset = "mteb/amazon_reviews_multi"

# Preprocessing

The dataset includes reviews in multiple languages, so we only import the English ones. It also contains a lot of data that is useless for us: we only need the reviews and their ratings, so let's strip everything else out.

In [None]:
# Load the 'en' configuration of the dataset and print it as a sanity check.
dataset = datasets.load_dataset(path=dset, name="en")
print(dataset)

In [None]:
# English-only copy of the dataset that the rest of the notebook works on.
engDataset = datasets.load_dataset(path=dset, name="en")
# Print the loaded object to confirm the import worked.
print(engDataset)

# FOR TESTING make the dataset smaller
# engDataset["train"] = engDataset["train"].select(range(100000))

In [None]:
# Shuffle first, then drop the 'id' and 'label_text' columns, which are not needed here.
# NOTE: no seed is passed to shuffle(), so the resulting order is not pinned by this cell.
engDataset = engDataset.shuffle().remove_columns(['id', 'label_text'])

# Tokenization and padding

In [None]:
# Load the DistilBERT tokenizer that matches the checkpoint named by `model`.
# `DistilBertTokenizer` is not imported by name in this notebook, so it is reached through
# the already-imported `transformers` package (the bare name raised a NameError).
# NOTE: DistilBERT does not use token type ids; the title/body pair built during
# tokenization is told apart only by the [SEP] token between the two segments.
tokenizer = transformers.DistilBertTokenizer.from_pretrained(model)

In [None]:
# tokenizes one example
def tokenize_example(example):
    """Tokenize one review as a (title, body) sequence pair of fixed length.

    Args:
        example: mapping with a ``'text'`` entry; the title and the body are
            expected to be separated by a blank line (``'\\n\\n'``).

    Returns:
        Whatever ``tokenizer.encode_plus`` returns for the pair, padded and
        truncated to 512 tokens.
    """
    text = example['text']

    # Split on the FIRST blank line only, so blank lines inside the body do not
    # cause the rest of the review to be dropped.
    title, separator, body = text.partition('\n\n')
    if not separator:
        # No title/body separator at all: treat the whole text as the body, so the
        # 'only_second' truncation below can still shorten an over-long review.
        title, body = '', text

    return tokenizer.encode_plus(
        title,
        body,  # input title and body separately
        truncation='only_second',  # only cut the second segment, the actual body of the review
        add_special_tokens=True,  # add CLS and SEP
        max_length=512,  # same maximum length as BERT
        padding='max_length',  # pad every example to max_length
    )

In [None]:
# map the whole dset
# Run tokenize_example over every example of the dataset.
eng_tokenized = engDataset.map(function=tokenize_example)

In [None]:
# Inspect one tokenized training example: first the raw fields, then the decoded token ids.
sample = eng_tokenized['train'][1]
print(sample)
print(tokenizer.decode(sample['input_ids']))

# looks good to me.

# Fine tuning the BERT model for our classification

In [None]:
# config
import torch
import torch.nn as nn

# Create the bert class
class Bert(nn.Module):
    """DistilBERT encoder followed by a small feed-forward classification head.

    ``forward`` returns a tuple: ``(loss, logits)`` when labels are supplied and
    ``(logits,)`` otherwise.
    """

    def __init__(self, checkpoint=None, hidden_size=25, num_labels=5):
        """Load the pretrained encoder and build the classification head.

        Args:
            checkpoint: name or path passed to ``DistilBertModel.from_pretrained``.
                Defaults to the notebook-level ``model`` variable.
            hidden_size: width of the classifier's hidden layer.
            num_labels: number of output classes (5 in this notebook).

        Raises:
            TypeError: if the resolved checkpoint is a model instance. This happens
                when ``model`` has already been rebound to a ``Bert`` object and no
                explicit ``checkpoint`` is given.
        """
        super().__init__()

        if checkpoint is None:
            checkpoint = model
        if isinstance(checkpoint, nn.Module):
            raise TypeError(
                "checkpoint must be a checkpoint name or path, not a model instance; "
                "pass it explicitly, e.g. Bert(checkpoint='distilbert-base-cased')"
            )

        # `DistilBertModel` is not imported by name in this notebook, so reach it
        # through the `transformers` package.
        self.bert = transformers.DistilBertModel.from_pretrained(checkpoint)

        # Read the encoder width from the loaded config instead of assuming 768.
        encoder_dim = self.bert.config.dim

        # basic one layer feed forward network that outputs the labels.
        self.classifier = nn.Sequential(
            nn.Linear(encoder_dim, hidden_size),  # encoder output in
            nn.ReLU(),
            # nn.Dropout(0.5),  # dropout if needed.
            nn.Linear(hidden_size, num_labels),  # to output labels
        )

    def forward(self, input_ids, attention_mask, labels=None):
        """Classify a batch of tokenized reviews.

        Args:
            input_ids: token ids passed to the encoder.
            attention_mask: attention mask passed to the encoder.
            labels: optional target class indices; when given, the cross-entropy
                loss is computed as well.

        Returns:
            ``(loss, logits)`` if ``labels`` is given, else ``(logits,)``.
        """
        # run the encoder
        outputs = self.bert(input_ids=input_ids,
                            attention_mask=attention_mask)

        # Take the vector at sequence position 0 of the encoder's first output.
        first_token_state = outputs[0][:, 0, :]

        # Feed it into the classifier. This outputs one score per label.
        logits = self.classifier(first_token_state)

        # no labels: just return the logits
        if labels is None:
            return (logits,)

        # labels present (training / evaluation): also compute the loss
        loss_fn = torch.nn.CrossEntropyLoss()
        return (loss_fn(logits, labels), logits)

In [None]:
# Accuracy metric, loaded once and reused for every evaluation.
accuracy = evaluate.load('accuracy')


def compute_accuracy(outputs_and_labels):
    """Return the accuracy of the arg-max predictions against the reference labels.

    ``outputs_and_labels`` unpacks into ``(outputs, labels)``; the prediction for
    each row is the index of its largest value along the last axis.
    """
    scores, references = outputs_and_labels
    # index of the "winning" label for every example
    predicted = np.argmax(scores, axis=-1)
    return accuracy.compute(predictions=predicted, references=references)

In [None]:
# Instantiate the classifier.
# NOTE: this rebinds `model`, which until now held the checkpoint name string,
# to the Bert instance.
model = Bert() # init the model

In [None]:
# The encoder is large, so its weights are frozen: every parameter whose name starts
# with 'bert' stops receiving gradients. This keeps training fast enough to run
# inside Colab.
for param_name, parameter in model.named_parameters():
    if not param_name.startswith('bert'):
        continue
    parameter.requires_grad = False

In [None]:
# Training params. We optimize these later.
# NOTE: `max_steps` takes precedence over `num_train_epochs`, so this run stops after
# 20000 training steps regardless of the epoch count given below.
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy='steps',
    logging_strategy='steps',
    save_strategy='steps',  # must match the evaluation strategy for load_best_model_at_end
    eval_steps=500,
    logging_steps=500,
    save_steps=500,
    learning_rate=1e-4,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    max_steps=20000,
    num_train_epochs=5,
    weight_decay=0.01,
    load_best_model_at_end=True,  # required by EarlyStoppingCallback
)

data_collator = DataCollatorWithPadding(tokenizer=tokenizer) # get the data collator with our tokenizer.

# Stop training once the tracked metric (eval loss by default when
# load_best_model_at_end is set) has not improved for 3 consecutive evaluations.
early_stopping = transformers.EarlyStoppingCallback(early_stopping_patience=3)

# Set the trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=eng_tokenized['train'],
    eval_dataset=eng_tokenized['test'],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_accuracy,
    callbacks=[early_stopping],  # the callback was created but never registered before
)

# train the model
trainer.train()

# Hyperparam optimization

This happens over a couple of days so you won't see all the results...

In [None]:
# Used optuna for optimization

def objective(trial):
    """Train one Optuna trial and return its evaluation accuracy.

    Args:
        trial: the Optuna trial used to sample learning rate, batch size and
            number of epochs.

    Returns:
        The ``eval_accuracy`` reported by ``trainer.evaluate()`` after training.
    """
    import copy  # local import so this cell does not depend on the import cell

    # Define the search space for hyperparameters
    learning_rate = trial.suggest_float("learning_rate", 1e-7, 1e-3, log=True)
    batch_size = trial.suggest_categorical("batch_size", [8, 4, 16])
    # No trailing comma here: it would turn the sampled int into a 1-tuple.
    epochs = trial.suggest_int('num_train_epochs', low=2, high=6)

    # params
    # NOTE(review): `max_steps` takes precedence over `num_train_epochs`, so the sampled
    # epoch count has no effect while the 20000-step cap is in place. Confirm which of
    # the two is meant to bound a trial.
    trainer_args = transformers.TrainingArguments(
        "mlp_checkpoints",
        evaluation_strategy="steps",
        logging_strategy="steps",
        eval_steps=500,
        logging_steps=500,
        learning_rate=learning_rate,
        max_steps=20000,
        load_best_model_at_end=True,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=epochs,
    )

    # Train a private copy of the model so every trial starts from the same weights.
    # Reusing `model` directly would let each trial continue from the previous one.
    mlp = copy.deepcopy(model)

    # stop training if the eval loss is not getting better.
    early_stopping = transformers.EarlyStoppingCallback(early_stopping_patience=3)

    # train a model
    # NOTE(review): hyperparameters are selected on the 'test' split; consider a
    # separate validation split if the dataset provides one.
    trainer = transformers.Trainer(
        model=mlp,
        args=trainer_args,
        train_dataset=eng_tokenized['train'],
        eval_dataset=eng_tokenized['test'],
        compute_metrics=compute_accuracy,
        data_collator=data_collator,
        callbacks=[early_stopping],
    )

    trainer.train()
    eval_results = trainer.evaluate()
    # accuracy of the model left in the trainer (the best checkpoint is reloaded at the end)
    return eval_results["eval_accuracy"]

# Run the search: 20 trials, looking for the highest value returned by `objective`.
study = optuna.create_study(direction="maximize")
study.optimize(func=objective, n_trials=20)