<a href="https://colab.research.google.com/github/Iispar/review-summary-API/blob/main/BERT-finetuned-model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip3 install -q transformers datasets evaluate
!pip install optuna
!pip install accelerate -U
import datasets
import numpy as np
import transformers
from transformers import AutoTokenizer
from transformers import DataCollatorWithPadding
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
from transformers import BertModel
import evaluate
import optuna



In [2]:
model = 'bert-base-cased'
dset = 'mteb/amazon_reviews_multi'

# Preprocessing

The dataset includes reviews from multiple languages so we only import the english ones. The dataset also includes alot of useless data for us, we only need the reviews and their ratings so lets process everything else out.

In [3]:
dataset = datasets.load_dataset(dset, name='en'); # imports the dataset.
# check it works
print(dataset);

DatasetDict({
    train: Dataset({
        features: ['id', 'text', 'label', 'label_text'],
        num_rows: 200000
    })
    validation: Dataset({
        features: ['id', 'text', 'label', 'label_text'],
        num_rows: 5000
    })
    test: Dataset({
        features: ['id', 'text', 'label', 'label_text'],
        num_rows: 5000
    })
})


In [4]:
engDataset = datasets.load_dataset(dset, name='en'); # imports the dataset.
# check it works
print(engDataset);

# FOR TESTING make the dataset smaller
# engDataset["train"] = engDataset["train"].select(range(100000))

DatasetDict({
    train: Dataset({
        features: ['id', 'text', 'label', 'label_text'],
        num_rows: 200000
    })
    validation: Dataset({
        features: ['id', 'text', 'label', 'label_text'],
        num_rows: 5000
    })
    test: Dataset({
        features: ['id', 'text', 'label', 'label_text'],
        num_rows: 5000
    })
})


In [5]:
engDataset = engDataset.shuffle() # shuffle the dataset for safety.
engDataset = engDataset.remove_columns(['id', 'label_text']) # removes everything that we don't need

# Tokenization and padding

In [6]:
tokenizer = AutoTokenizer.from_pretrained(model) # get the basic AutoTokenizer

In [7]:
# tokenizes one example
def tokenize_example(example):
    split = example['text'].split('\n\n'); # splits the sentace and title.
    return tokenizer.encode_plus(split[0], split[1],
             truncation='only_second',
             add_special_tokens=True,
             return_attention_mask=True,
             return_overflowing_tokens=False,
             return_special_tokens_mask=False,
             max_length=512,
             pad_to_max_length=False)


In [8]:
# map the whole dset
eng_tokenized = engDataset.map(tokenize_example)

Map:   0%|          | 0/200000 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

In [9]:
print(eng_tokenized['train'][1])
print(tokenizer.decode(eng_tokenized['train'][1]['input_ids']))

# looks good to me.

{'text': 'Will do the job with the right considerations\n\nHandsome looking hooks. Unlike others I managed to get them nearly level and still have the gun level. The finish is mediocre but still acceptable and looks nice as it is. The hooks are padded as well.', 'label': 4, 'input_ids': [101, 3100, 1202, 1103, 2261, 1114, 1103, 1268, 19069, 102, 13719, 6758, 1702, 21717, 119, 5472, 1639, 146, 2374, 1106, 1243, 1172, 2212, 1634, 1105, 1253, 1138, 1103, 2560, 1634, 119, 1109, 3146, 1110, 1143, 13447, 13782, 1133, 1253, 12095, 1105, 2736, 3505, 1112, 1122, 1110, 119, 1109, 21717, 1132, 22862, 1112, 1218, 119, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
[CLS] Will

# Fine tuning the BERT model for our classification

In [10]:
# config
import torch
import torch.nn as nn

# Create the bert class
class Bert(nn.Module):
    def __init__(self):
        super(Bert, self).__init__()
        # hidden size of BERT (always 768), hidden size of our classifier, and number of labels (in this case 5)
        H_in, H, labels = 768, 25, 5

        # bert is our preloaded distilbert
        self.bert = BertModel.from_pretrained(model)

        # basic one layer feed forward network that outputs the labels.
        self.classifier = nn.Sequential(
            nn.Linear(H_in, H), # bert in.
            nn.ReLU(), # ReLU
            #nn.Dropout(0.5), #dropout if needed.
            nn.Linear(H, labels) # to output labels
        )
    def forward(self, input_ids, attention_mask, labels=None):

        # run the BERT
        outputs = self.bert(input_ids=input_ids,
                            attention_mask=attention_mask)

        # Extract the last hidden state of the token for classification
        last_hidden_state = outputs[0][:, 0, :]

        # Feed tha last hidden state into the classifier. This outputs the labels.
        logits = self.classifier(last_hidden_state)

        # if there is labels so training
        if labels is not None:
          # calculates the loss.
          loss = torch.nn.CrossEntropyLoss();
          return (loss(logits,labels),logits);
        else:
          # if no labels, just return the logits
          return (logits,);

In [11]:
#calculates the accuracy
accuracy = evaluate.load('accuracy');
def compute_accuracy(outputs_and_labels):
    outputs, labels = outputs_and_labels;
    predictions = np.argmax(outputs, axis=-1); #pick the index of the "winning" label
    return accuracy.compute(predictions=predictions, references=labels); # calc accuracy

In [12]:
model = Bert() # init the model

In [13]:
# Training params. We optimize these later.
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy = 'steps',
    logging_strategy = 'steps',
    eval_steps = 500,
    logging_steps = 500,
    learning_rate=1.2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    max_steps = 20000,
    num_train_epochs=5,
    weight_decay=0.01,
  )

data_collator = DataCollatorWithPadding(tokenizer=tokenizer) # get the data collator with our tokenizer.
early_stopping = transformers.EarlyStoppingCallback(3); # stop training if the eval loss is not getting better.

# Set the trainer
trainer = Trainer(
    model = model,
    args = training_args,
    train_dataset = eng_tokenized['train'],
    eval_dataset = eng_tokenized['test'],
    tokenizer = tokenizer,
    data_collator = data_collator,
    compute_metrics = compute_accuracy,
)

# train the model
trainer.train()

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,Accuracy
500,1.2244,1.015069,0.5648
1000,1.0126,0.92498,0.6112
1500,0.9163,0.908519,0.6244
2000,0.9159,0.90178,0.6256
2500,0.8904,0.879647,0.632
3000,0.8896,0.892748,0.625
3500,0.8721,0.909754,0.6094
4000,0.8607,0.863687,0.6362


KeyboardInterrupt: ignored

# Hyperparam optimization

This happens over a couple of days so you won't see all the results...

In [None]:
# Used optuna for optimization

def objective(trial):
    # Define the search space for hyperparameters
    learning_rate = trial.suggest_float("learning_rate", 1e-5, 1e-3, log=True)
    batch_size = trial.suggest_categorical("batch_size", [8, 16])
    epochs=trial.suggest_int('num_train_epochs', low = 2,high = 6),

    # params
    trainer_args = transformers.TrainingArguments(
        "mlp_checkpoints",
        evaluation_strategy = "steps",
        logging_strategy = "steps",
        eval_steps = 500,
        logging_steps = 500,
        learning_rate = learning_rate,
        max_steps = 20000,
        load_best_model_at_end = True,
        per_device_train_batch_size = batch_size,
        per_device_eval_batch_size = batch_size,
        num_train_epochs = epochs
    )

    # the model
    mlp = model
    early_stopping = transformers.EarlyStoppingCallback(3); # stop training if the eval loss is not getting better.

    # train a model
    trainer = transformers.Trainer(
        model = mlp,
        args = trainer_args,
        train_dataset = eng_tokenized['train'],
        eval_dataset = eng_tokenized['test'],
        compute_metrics = compute_accuracy,
        data_collator = data_collator,
        callbacks = [early_stopping]
    )

    trainer.train()
    eval_results = trainer.evaluate()
    return eval_results["eval_accuracy"] # return the best result.

study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=20)

[I 2023-10-24 15:03:59,777] A new study created in memory with name: no-name-c8391e5e-9002-4a23-9d05-9a3ae8be2cc9


Step,Training Loss,Validation Loss
