In [63]:
# !pip install accelerate
# To setup Distributed training with accelerate

In [64]:
from accelerate import Accelerator

accelerartor = Accelerator()

In [65]:
accelerartor

<accelerate.accelerator.Accelerator at 0x169644e10>

In [66]:
# setup the data
from datasets import load_dataset

dataset = load_dataset("Yelp/yelp_review_full")

In [67]:
dataset["train"], dataset["test"]

(Dataset({
     features: ['label', 'text'],
     num_rows: 650000
 }),
 Dataset({
     features: ['label', 'text'],
     num_rows: 50000
 }))

In [68]:
# u need tokeinizer to preoprocess the dataset
from transformers import AutoTokenizer

check_point = "google-bert/bert-base-cased"

tokenizer = AutoTokenizer.from_pretrained(check_point)

In [69]:
def tokenize_func(example):
    return tokenizer(example["text"], padding=True, truncation=True)


tokenized_dataset = dataset.map(tokenize_func, batched=True)

In [70]:
tokenized_dataset["train"],  dataset["train"]

(Dataset({
     features: ['label', 'text', 'input_ids', 'token_type_ids', 'attention_mask'],
     num_rows: 650000
 }),
 Dataset({
     features: ['label', 'text'],
     num_rows: 650000
 }))

In [71]:
#  If you like, you can create a smaller subset of the full dataset to fine-tune on to reduce the time it takes:
small_train_dataset = tokenized_dataset["train"].shuffle(seed=42).select(range(1000))
small_eval_dataset = tokenized_dataset["test"].shuffle(seed=42).select(range(1000))

In [72]:
small_train_dataset 

Dataset({
    features: ['label', 'text', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 1000
})

In [73]:
# Remove the text column because the model does not accept raw text as an input:
tokenized_dataset = tokenized_dataset.remove_columns(["text"])
tokenized_dataset = tokenized_dataset.rename_column("label", "labels")


In [74]:
tokenized_dataset.set_format("torch")

In [75]:
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 650000
    })
    test: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 50000
    })
})

In [76]:
small_train_dataset = tokenized_dataset["train"].shuffle(seed=42).select(range(10000))
small_eval_dataset = tokenized_dataset["test"].shuffle(seed=42).select(range(1000))

In [77]:
small_train_dataset

Dataset({
    features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 10000
})

In [78]:
# create batchs of datatset using dataloader
from torch.utils.data import DataLoader

train_data_loader = DataLoader(dataset=small_train_dataset,
                               batch_size=8,
                               shuffle=True)

eval_data_loader = DataLoader(dataset=small_eval_dataset,
                              batch_size=8,
                              shuffle=False)

In [79]:
len(train_data_loader)

1250

In [80]:
# load the model
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(check_point)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [81]:
# setup the loss and optimizer
from torch.optim import AdamW

optimizer = AdamW(model.parameters(), lr=5e-5)

# lr scheduler
from transformers import get_scheduler

epochs = 4
num_of_training_steps = epochs * len(train_data_loader)
num_of_training_steps

lr_scheduler = get_scheduler(optimizer=optimizer,
                             num_training_steps=num_of_training_steps,
                             name="linear",
                             num_warmup_steps=0
                             )

In [82]:
# device agonisticxe code
from accelerate.test_utils.testing import get_backend
import torch
device, _, _ = get_backend() 

model.to(device)


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [83]:
# prepar eto accelereate

train_data_loader, eval_data_loader, model, optimizer  = accelerartor.prepare(train_data_loader,
                                                                              eval_data_loader,
                                                                              model,
                                                                              optimizer)

In [84]:
from tqdm.auto import tqdm

progress_bar = tqdm(range(num_of_training_steps))


model.train()
for epoch in range(epochs):
    for batch in train_data_loader:
        outputs = model(**batch)
        loss = outputs.loss
        accelerartor.backward(loss)

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)


  0%|          | 20/5000 [00:51<3:34:40,  2.59s/it]
  0%|          | 1/5000 [00:02<3:50:55,  2.77s/it]

KeyboardInterrupt: 

In [None]:
import evaluate

metric = evaluate.load("accuracy")
model.eval()
for batch in eval_data_loader:
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)

    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    metric.add_batch(predictions=predictions, references=batch["labels"])

metric.compute()

{'accuracy': 0.318}

In [None]:
from pprint import pprint
model.eval()

for batch in eval_data_loader:
    pprint(batch)
    break

{'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='mps:0'),
 'input_ids': tensor([[  101, 14812, 16442,  ...,     0,     0,     0],
        [  101, 19383,  1303,  ...,     0,     0,     0],
        [  101, 12008, 27788,  ...,     0,     0,     0],
        ...,
        [  101,  3930, 13991,  ...,     0,     0,     0],
        [  101,  1284,  3523,  ...,     0,     0,     0],
        [  101,  6682,  3537,  ...,     0,     0,     0]], device='mps:0'),
 'labels': tensor([2, 4, 1, 4, 3, 4, 2, 3], device='mps:0'),
 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]], device='mps:0')}


In [30]:
small_eval_dataset[0]

{'labels': tensor(2),
 'input_ids': tensor([  101, 14812, 16442,  1186,  1110,  1240,  1576,   118,  1104,   118,
          1103,   118,  6159,  1983,  1457, 23783,  3255,   119, 14380,  2930,
          1114, 13628,  1116,   188,  1979,  1158, 23982, 21315,  1213,  1103,
         17022,  7659,  1877,  1176,  1122,   112,   188,   170,  5953,  4974,
          3974,  1437,   119, 10672,   170,   185,  5765, 16426,  1104, 12375,
          1105, 21102,  1116,  1280,  1213,  1103,  1372,   119,   165,   183,
           165,   183,  1942, 27516,  1282,  1110,  1632,  1111,  5953,   119,
           109,   130,  1105,  1476,  1904,  1105,  1128,   112,  1231,  1149,
          1103,  1442,   119, 11205, 23830,   119,   119,   119,  1409,   146,
           112,   182,   172,  1611,  3970,   170, 19359,  1114,   176,  7728,
         11597,   117,  1134,   146,  1579,  1821,   117,   113,  1128,  1202,
          1315,   119,  5890,  1122,   114, 15688,  7738,   117, 26704,   117,
         23982,  

In [39]:
small_eval_dataset[0]

{'labels': tensor(2),
 'input_ids': tensor([  101, 14812, 16442,  1186,  1110,  1240,  1576,   118,  1104,   118,
          1103,   118,  6159,  1983,  1457, 23783,  3255,   119, 14380,  2930,
          1114, 13628,  1116,   188,  1979,  1158, 23982, 21315,  1213,  1103,
         17022,  7659,  1877,  1176,  1122,   112,   188,   170,  5953,  4974,
          3974,  1437,   119, 10672,   170,   185,  5765, 16426,  1104, 12375,
          1105, 21102,  1116,  1280,  1213,  1103,  1372,   119,   165,   183,
           165,   183,  1942, 27516,  1282,  1110,  1632,  1111,  5953,   119,
           109,   130,  1105,  1476,  1904,  1105,  1128,   112,  1231,  1149,
          1103,  1442,   119, 11205, 23830,   119,   119,   119,  1409,   146,
           112,   182,   172,  1611,  3970,   170, 19359,  1114,   176,  7728,
         11597,   117,  1134,   146,  1579,  1821,   117,   113,  1128,  1202,
          1315,   119,  5890,  1122,   114, 15688,  7738,   117, 26704,   117,
         23982,  

In [40]:
# 1. Set the model to evaluation mode
model.eval()

# 2. Get a single example from your dataset
single_example = small_eval_dataset[1]

# 3. Create a batch with just this single example
# Each key needs to be unsqueezed to add a batch dimension
batch = {
    'input_ids': single_example['input_ids'].unsqueeze(0),
    'attention_mask': single_example['attention_mask'].unsqueeze(0),
    'token_type_ids': single_example['token_type_ids'].unsqueeze(0),
    'labels': single_example['labels'].unsqueeze(0)
}

# 4. Move to the appropriate device
batch = {k: v.to(device) for k, v in batch.items()}

# 5. Perform inference without gradient calculation
with torch.no_grad():
    outputs = model(**batch)
    
    # Get the loss if needed
    loss = outputs.loss
    
    # Get the logits (raw predictions)
    logits = outputs.logits
    
    # Get the predicted class
    predictions = torch.argmax(logits, dim=-1)
    
    # Calculate probabilities if needed
    probs = torch.nn.functional.softmax(logits, dim=-1)
    
    print(f"True label: {batch['labels'].item()}")
    print(f"Predicted label: {predictions.item()}")
    print(f"Loss: {loss.item():.4f}")
    print(f"Confidence: {probs[0][predictions.item()].item():.4f}")

True label: 4
Predicted label: 1
Loss: -0.0000
Confidence: 0.8151
