# RoBERTa-large

In [None]:
!pip install transformers datasets evaluate

In [None]:
from datasets import load_dataset
# Read the preprocessed CSV files into one dataset object with
# 'train', 'val' and 'test' splits.
dataset = load_dataset(
    'csv',
    data_files=dict(train='df_train.csv', val='df_valid.csv', test='df_test.csv'),
)

Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-2cec45cfe0e5b614/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1...


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating val split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-2cec45cfe0e5b614/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
# Tokenization helper applied to the dataset splits.
def preprocess_function(examples):
    """Tokenize the ``'text'`` field of ``examples`` with the notebook-level ``tokenizer``.

    Sequences are truncated and padded to ``max_length=128`` and the encoding
    is requested as PyTorch tensors (``return_tensors="pt"``).
    """
    encode_options = dict(
        padding='max_length',
        max_length=128,
        truncation=True,
        return_tensors="pt",
    )
    return tokenizer(examples['text'], **encode_options)

In [None]:
# Label maps: class id -> label name, and the inverse label name -> class id.
# The inverse is derived from the forward map so the two cannot drift apart.

id2label = dict(enumerate(("phrase", "passage", "multi")))
label2id = {label_name: class_id for class_id, label_name in id2label.items()}

In [None]:
# Fetch the pretrained tokenizer that belongs to the roberta-large checkpoint.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path="roberta-large")

Downloading (…)lve/main/config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [None]:
# Build a 3-class sequence-classification model from the pretrained
# roberta-large weights, wire in the label maps, and move it to the GPU.
# NOTE(review): the output recorded for this cell mentions "roberta-base" and a
# 501M download — re-run to confirm the large checkpoint is what actually loads.

from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

model = AutoModelForSequenceClassification.from_pretrained(
    "roberta-large",
    num_labels=3,
    id2label=id2label,
    label2id=label2id,
)
model = model.to("cuda")

Downloading pytorch_model.bin:   0%|          | 0.00/501M [00:00<?, ?B/s]

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'lm_head.dense.bias', 'roberta.pooler.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classi

In [None]:
# Run the tokenization helper over the train, val and test splits in batched mode.

tokenized_data = dataset.map(function=preprocess_function, batched=True)

Map:   0%|          | 0/2560 [00:00<?, ? examples/s]

Map:   0%|          | 0/640 [00:00<?, ? examples/s]

Map:   0%|          | 0/800 [00:00<?, ? examples/s]

In [None]:
#import the evaluation metrics - accuracy and f1

import evaluate
import numpy as np

# Load the two `evaluate` metrics used for scoring: accuracy first, then F1.
accuracy_metric, f1_metric = (evaluate.load(metric_name) for metric_name in ("accuracy", "f1"))

def compute_metrics(eval_pred):
  """Score one evaluation pass.

  ``eval_pred`` unpacks into ``(logits, labels)``; the predicted class is the
  arg-max of the logits over the last axis.  Returns a dict holding the
  macro-averaged F1 under ``"f1"`` and the accuracy under ``"accuracy"``.
  """
  logits, labels = eval_pred
  predicted_ids = np.argmax(logits, axis=-1)

  scores = {}
  scores["f1"] = f1_metric.compute(
      predictions=predicted_ids, references=labels, average="macro"
  )["f1"]
  scores["accuracy"] = accuracy_metric.compute(
      predictions=predicted_ids, references=labels
  )["accuracy"]
  return scores

In [None]:
# Collator that pads the examples of a batch using the tokenizer loaded above in this notebook.
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer)

### RoBERTa-large training



In [None]:
import torch 
# Ask PyTorch's CUDA caching allocator to release memory it holds but is not
# using, so the training run below starts with as much free GPU memory as possible.
torch.cuda.empty_cache()

In [None]:
# Fine-tuning configuration for roberta-large: evaluate and checkpoint once per
# epoch, then reload the best checkpoint when training finishes.

training_args = TrainingArguments(
    output_dir="output3",
    # optimisation
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # evaluation / checkpointing cadence
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

# Wire the model, the train/val splits, the tokenizer, the collator and the
# metric callback into a single Trainer.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["val"],
)

# Start fine-tuning; as the cell's last expression, the returned value is displayed.
trainer.train()

You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,F1,Accuracy
1,No log,0.809285,0.618924,0.660937
2,No log,0.744941,0.659161,0.710938
3,No log,0.798207,0.662632,0.7
4,0.732700,0.9267,0.654664,0.695312
5,0.732700,1.02375,0.663179,0.692187


TrainOutput(global_step=800, training_loss=0.5566632461547851, metrics={'train_runtime': 1123.8617, 'train_samples_per_second': 11.389, 'train_steps_per_second': 0.712, 'total_flos': 2982190438809600.0, 'train_loss': 0.5566632461547851, 'epoch': 5.0})

### RoBERTa-large inference


In [None]:
# Load trained model
# Prefer the checkpoint that training selected as best (training ran with
# load_best_model_at_end=True). "output3/checkpoint-800" is only the *last*
# optimisation step of the recorded run and exists only for that exact
# batch-size / epoch configuration, so it is kept purely as a fallback for
# sessions in which the training cell was not executed.
_finished_trainer = globals().get("trainer")
model_path = (
    _finished_trainer.state.best_model_checkpoint if _finished_trainer is not None else None
) or "output3/checkpoint-800"
model = AutoModelForSequenceClassification.from_pretrained(
    model_path, num_labels=3, id2label=id2label, label2id=label2id
).to("cuda")

# Define test trainer (default arguments; used here only to run prediction)
test_trainer = Trainer(model)
# Make prediction: returns the raw logits, the gold labels and a metrics dict
raw_pred, labels, metrics = test_trainer.predict(tokenized_data["test"])
# Preprocess raw predictions: arg-max over the class axis, the same reduction
# compute_metrics applies during evaluation
y_pred = np.argmax(raw_pred, axis=-1)

In [None]:
# Macro-averaged F1 of the predictions on the test split.
_f1_report = f1_metric.compute(predictions=y_pred, references=labels, average="macro")
test_f1 = _f1_report["f1"]
test_f1

0.7302641134426479

In [None]:
# Accuracy of the predictions on the test split.
_accuracy_report = accuracy_metric.compute(predictions=y_pred, references=labels)
test_accuracy = _accuracy_report["accuracy"]
test_accuracy

0.73875