In [121]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader

import transformers

import datasets
import evaluate

In [122]:
# Task : Classification
# Data : glue/mrpc
# pretrained-model checkpoint = "bert-base-uncased"

# Pick the compute device in order of preference: CUDA, then Apple MPS, then CPU.
# Every branch yields a torch.device, so `device` always has the same type.
# The MPS check lives under `torch.backends` (plural); `torch.backend` does not
# exist and raised AttributeError on any machine where CUDA was unavailable.
if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")
device

device(type='cuda')

In [123]:
# Name of the pretrained checkpoint passed to the tokenizer and model `from_pretrained` calls.
checkpoint = "bert-base-uncased"
# Directory handed to `datasets.load_dataset` as `cache_dir`.
dataset_cache = "../../data/external/"


In [124]:
# Load the GLUE/MRPC dataset, then the tokenizer, the padding collator and the
# sequence-classification model for `checkpoint`.
raw_dataset = datasets.load_dataset(
    "glue",
    "mrpc",
    cache_dir=dataset_cache,
)
tokenizer = transformers.AutoTokenizer.from_pretrained(checkpoint)
data_collator = transformers.DataCollatorWithPadding(tokenizer=tokenizer)
model = transformers.AutoModelForSequenceClassification.from_pretrained(checkpoint)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [125]:
# Show the column names of every split in the raw dataset.
raw_dataset.column_names

{'train': ['sentence1', 'sentence2', 'label', 'idx'],
 'validation': ['sentence1', 'sentence2', 'label', 'idx'],
 'test': ['sentence1', 'sentence2', 'label', 'idx']}

In [126]:
## Tokenization helper applied to the dataset via `map`
def tokenizer_func(example):
    """Tokenize the paired `sentence1` / `sentence2` fields of `example`.

    Both fields are handed to the module-level `tokenizer` as a text pair
    with `truncation=True`; the tokenizer's result is returned unchanged.
    """
    first = example['sentence1']
    second = example['sentence2']
    return tokenizer(first, second, truncation=True)

In [127]:
# Tokenize every split, feeding `tokenizer_func` batches of 50 examples.
tokenized_data = raw_dataset.map(
    tokenizer_func,
    batched=True,
    batch_size=50,
)

In [128]:
# Drop the raw text / index columns and rename `label` to `labels`; batches are
# later unpacked straight into the model call as keyword arguments.
tokenized_data = (
    tokenized_data
    .remove_columns(['sentence1', 'sentence2', 'idx'])
    .rename_column('label', "labels")
)
# Have the dataset return torch tensors.
tokenized_data.set_format("torch")
tokenized_data["train"].column_names

['labels', 'input_ids', 'token_type_ids', 'attention_mask']

In [129]:
# Build one DataLoader per split from the *tokenized* dataset. The loaders were
# previously built from `raw_data`, a name that is never defined in this
# notebook, so the cell failed on a fresh kernel (Restart & Run All).
# `data_collator` is used as `collate_fn` for every loader.
# NOTE(review): 25 worker processes per loader is a lot for a dataset this
# small - consider lowering `num_workers`.
train_dataloader = DataLoader(
    dataset=tokenized_data['train'],
    batch_size=50,
    shuffle=True,
    collate_fn=data_collator,
    num_workers=25,
    pin_memory=True,
)
# Evaluation loaders are not shuffled: the metric is aggregated over the whole
# split, so ordering does not matter and a fixed order keeps runs comparable.
val_dataloader = DataLoader(
    dataset=tokenized_data['validation'],
    batch_size=10,
    shuffle=False,
    collate_fn=data_collator,
    num_workers=25,
    pin_memory=True,
)
test_dataloader = DataLoader(
    dataset=tokenized_data['test'],
    batch_size=50,
    shuffle=False,
    collate_fn=data_collator,
    num_workers=25,
    pin_memory=True,
)

In [130]:
# Sanity check: pull one training batch and run it through the model once.
import os

# Suppress the tokenizers parallelism warning.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Only the first batch is needed.
batch = next(iter(train_dataloader))
print("Singelbatch :", {name: tensor.shape for name, tensor in batch.items()})

# The batch includes `labels`, so the model output carries a loss as well as logits.
output = model(**batch)
print(
    "MOdel training output on single batch -> ",
    "LOss : ",
    output.loss,
    output.logits.shape,
)

Singelbatch : {'input_ids': torch.Size([50, 96]), 'token_type_ids': torch.Size([50, 96]), 'attention_mask': torch.Size([50, 96]), 'labels': torch.Size([50])}
MOdel training output on single batch ->  LOss :  tensor(0.6433, grad_fn=<NllLossBackward0>) torch.Size([50, 2])


In [131]:
# AdamW over all model parameters with a learning rate of 5e-5.
optim = torch.optim.AdamW(
    params=model.parameters(),
    lr=5e-5,
)


In [132]:
from transformers import get_scheduler

# Total optimizer steps = batches per epoch * number of epochs; the scheduler
# needs this up front.
num_epochs = 3
num_training_steps = len(train_dataloader) * num_epochs

# "linear" schedule with no warmup steps, driving the learning rate of `optim`.
lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optim,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)


In [133]:
# Move the model onto the selected device; the returned module is the cell's displayed output.
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [134]:
%%time
# Train

from tqdm.auto import tqdm

progress_bar = tqdm(range(num_training_steps))



model.train()
for epoch in range(num_epochs):
    for batch in train_dataloader:
        batch = {k:v.to(device) for k,v in batch.items()}
        out = model(**batch)
        loss = out.loss
        loss.backward()
    
        optim.step()
        lr_scheduler.step()
        optim.zero_grad()
        progress_bar.update(1)

  0%|          | 0/222 [00:00<?, ?it/s]

CPU times: user 28.6 s, sys: 10.2 s, total: 38.8 s
Wall time: 40.9 s


In [135]:
# Score the fine-tuned model on the validation loader with the GLUE/MRPC metric.
# `evaluate` is already imported in the notebook's import cell, so the redundant
# re-import that used to open this cell has been dropped.
metric = evaluate.load("glue", "mrpc")

model.eval()
for batch in val_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    # Inference only: skip gradient tracking for the forward pass.
    with torch.no_grad():
        outputs = model(**batch)

    # Predicted class = index of the largest logit along the last dimension.
    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    metric.add_batch(predictions=predictions, references=batch["labels"])

# Aggregate over every batch added above; displayed as the cell output.
metric.compute()

{'accuracy': 0.8431372549019608, 'f1': 0.891156462585034}