In [None]:
! pip install transformers datasets accelerate

In [2]:
import torch
import pandas as pd
from torch.utils.data import DataLoader
from transformers import RobertaForSequenceClassification, RobertaTokenizer
from transformers import BertForSequenceClassification, BertTokenizer
from transformers import RobertaTokenizer, RobertaForSequenceClassification
from transformers import Trainer, TrainingArguments, DataCollatorWithPadding
from datasets import Dataset
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
EPOCHS = 4
model_name = "roberta-base"
DATASET = 'sst2'

if 'roberta' in model_name:
  BASE_PATH = f"drive/MyDrive/DecompX/roberta/{DATASET}"
elif 'bert' in model_name:
  BASE_PATH = f"drive/MyDrive/DecompX/bert/{DATASET}"


MODEL_PATH = f"{BASE_PATH}/model/"

In [5]:
if 'roberta' in model_name:
  model = RobertaForSequenceClassification.from_pretrained(model_name, num_labels=2)
  tokenizer = RobertaTokenizer.from_pretrained(model_name)
elif 'bert' in model_name:
  model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2)
  tokenizer = BertTokenizer.from_pretrained(model_name)

model = model.to("cuda" if torch.cuda.is_available() else "cpu")

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.out_proj.bias', 'classifier.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

In [6]:
from datasets import load_dataset

train_dataset = load_dataset('glue', DATASET, split='train')
valid_dataset = load_dataset('glue', DATASET, split='validation')

Downloading builder script:   0%|          | 0.00/28.8k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/28.7k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/27.9k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/7.44M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/67349 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/872 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1821 [00:00<?, ? examples/s]

In [7]:
# Tokenize the datasets
train_tokenized = train_dataset.map(lambda x: tokenizer(x['sentence'], truncation=True), batched=True)
valid_tokenized = valid_dataset.map(lambda x: tokenizer(x['sentence'], truncation=True), batched=True)

# Remove unnecessary columns, rename label column, and set format to PyTorch
train_tokenized = train_tokenized.remove_columns(['idx', 'sentence'])
train_tokenized = train_tokenized.rename_column('label', 'labels')
train_tokenized = train_tokenized.with_format('pt')

valid_tokenized = valid_tokenized.remove_columns(['idx', 'sentence'])
valid_tokenized = valid_tokenized.rename_column('label', 'labels')
valid_tokenized = valid_tokenized.with_format('pt')

Map:   0%|          | 0/67349 [00:00<?, ? examples/s]

Map:   0%|          | 0/872 [00:00<?, ? examples/s]

In [8]:
per_device_train_batch_size = 16
steps_per_epoch = len(train_tokenized) // per_device_train_batch_size
print(f"{steps_per_epoch = }")

training_args = TrainingArguments(
  output_dir=MODEL_PATH,
  num_train_epochs=EPOCHS,
  per_device_train_batch_size=per_device_train_batch_size,
  evaluation_strategy="steps",
  eval_steps=700,
  logging_dir=".",
  save_steps=steps_per_epoch,
  save_total_limit=4,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tokenized,
    eval_dataset=valid_tokenized,
    data_collator=data_collator,
    compute_metrics=lambda pred: {"accuracy": accuracy_score(pred.label_ids, pred.predictions.argmax(axis=1))},
)

trainer.train()

steps_per_epoch = 4209


Step,Training Loss,Validation Loss,Accuracy
700,0.3864,0.343691,0.881881
1400,0.3334,0.254635,0.917431
2100,0.3065,0.271784,0.902523
2800,0.2794,0.309945,0.912844
3500,0.2751,0.250366,0.911697
4200,0.2495,0.326126,0.904817
4900,0.2508,0.370717,0.91055
5600,0.2023,0.294638,0.915138
6300,0.1941,0.331438,0.908257
7000,0.2018,0.345009,0.902523


TrainOutput(global_step=16840, training_loss=0.18496347006029867, metrics={'train_runtime': 2507.1178, 'train_samples_per_second': 107.452, 'train_steps_per_second': 6.717, 'total_flos': 4963318726090560.0, 'train_loss': 0.18496347006029867, 'epoch': 4.0})

In [9]:
results = trainer.evaluate(valid_tokenized)
print(results)

{'eval_loss': 0.29034125804901123, 'eval_accuracy': 0.930045871559633, 'eval_runtime': 3.2267, 'eval_samples_per_second': 270.245, 'eval_steps_per_second': 33.781, 'epoch': 4.0}
