### Preprocess the Dataset

In [1]:
# import modules
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding

# Checkpoint name used to load the tokenizer here; later cells reuse it for the model.
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# Raw (untokenized) "mrpc" configuration of the "glue" dataset.
raw_datasets = load_dataset("glue", "mrpc")

def tokenize_function(example):
    """Tokenize the two sentences of ``example`` together as a pair.

    Args:
        example: Mapping that provides ``"sentence1"`` and ``"sentence2"``.

    Returns:
        Whatever the module-level ``tokenizer`` returns when called on the
        two sentences with ``truncation=True``.
    """
    first_sentence = example["sentence1"]
    second_sentence = example["sentence2"]
    return tokenizer(first_sentence, second_sentence, truncation=True)

# Build the training-ready dataset in one chain:
#   1. tokenize every split in batches,
#   2. drop the raw-text and index columns, which are no longer needed,
#   3. rename "label" to "labels".
tokenized_datasets = (
    raw_datasets.map(tokenize_function, batched=True)
    .remove_columns(["sentence1", "sentence2", "idx"])
    .rename_column("label", "labels")
)

# Have the dataset return torch tensors.
tokenized_datasets.set_format("torch")

# Collator handed to the DataLoaders as their collate_fn.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)




In [2]:
# Display the tokenized DatasetDict to inspect its splits, columns and row counts.

tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 408
    })
    test: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1725
    })
})

In [3]:
#define the dataloader

from torch.utils.data import DataLoader

# Both loaders use batches of 8 and collate through data_collator;
# only the training loader shuffles.
train_dataloader = DataLoader(
    dataset=tokenized_datasets["train"],
    batch_size=8,
    shuffle=True,
    collate_fn=data_collator,
)
eval_dataloader = DataLoader(
    dataset=tokenized_datasets["validation"],
    batch_size=8,
    collate_fn=data_collator,
)

### Train the Model

In [4]:
import torch
# AdamW comes from torch.optim: the copy that transformers used to export was
# deprecated and is no longer importable from recent transformers releases.
from torch.optim import AdamW
from transformers import AutoModelForSequenceClassification, get_scheduler

# Load the checkpoint with a 2-label sequence-classification head.
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

# Last expression of the cell: shows which device was selected.
device

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


device(type='cuda')

In [5]:
# from accelerate import Accelerator
# AdamW comes from torch.optim: the copy that transformers used to export was
# deprecated and is no longer importable from recent transformers releases.
from torch.optim import AdamW
from tqdm.auto import tqdm
from transformers import AutoModelForSequenceClassification, get_scheduler

# instantiates an Accelerator object to look at the environment and initialise the proper distributed setup
# accelerator = Accelerator()

# Seed torch's global RNG before the model and the first shuffled pass over the
# data, so that re-running this cell starts from the same random state.
SEED = 42
torch.manual_seed(SEED)

# Start from a fresh copy of the checkpoint so re-running this cell restarts
# training instead of continuing from an already fine-tuned model.
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

# Optimizer and learning rate. eps and weight_decay are pinned to the defaults of
# the former transformers.AdamW (torch's own defaults differ), so the update
# rule stays the one this notebook was written for.
optimizer = AdamW(model.parameters(), lr=3e-5, eps=1e-6, weight_decay=0.0)

# Use the GPU when one is available and move the model onto it.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

# send the training dataloader, eval dataloader, model and optimizer to the accelerate container
# train_dl, eval_dl, model, optimizer = accelerator.prepare(
#    train_dataloader, eval_dataloader, model, optimizer
# )

# One scheduler step per batch: total steps = epochs * batches per epoch.
num_epochs = 3
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

# Progress bar with one tick per optimisation step.
progress_bar = tqdm(range(num_training_steps))

# Training loop.
model.train()
for epoch in range(num_epochs):
    for batch in train_dataloader:
        # Move every tensor of the batch to the same device as the model.
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        # accelerator.backward(loss)

        optimizer.step()
        lr_scheduler.step()
        # Clear gradients so they do not accumulate into the next step.
        optimizer.zero_grad()
        progress_bar.update(1)

# Release the bar once training is finished so it renders its final state.
progress_bar.close()


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/1377 [00:00<?, ?it/s]

In [6]:
import evaluate

# Score the model on the validation batches with the "glue"/"mrpc" metric.
metric = evaluate.load("glue", "mrpc")
model.eval()

# Gradients are not needed for evaluation, so the whole pass runs under no_grad.
with torch.no_grad():
    for batch in eval_dataloader:
        batch = {name: tensor.to(device) for name, tensor in batch.items()}
        outputs = model(**batch)
        logits = outputs.logits
        # Predicted class = index of the largest logit along the last axis.
        predictions = logits.argmax(dim=-1)
        metric.add_batch(predictions=predictions, references=batch["labels"])

# Last expression of the cell: the aggregated metric values.
metric.compute()

{'accuracy': 0.8774509803921569, 'f1': 0.9140893470790378}