Load the data and modify it as needed

In [5]:
import pandas as pd
data = pd.read_csv('training_data/training_data/ED/train.csv')
dev = pd.read_csv('training_data/training_data/ED/dev.csv')

#BERT takes one input sentence, so we need to concatenate the claim and the evidence
data['text'] = data['Claim'] + " [SEP] " + data['Evidence']
dev['text'] = dev['Claim'] + " [SEP] " + dev['Evidence']


Tokenization and labelling

In [6]:
from transformers import BertTokenizer
from torch.utils.data import DataLoader, TensorDataset, RandomSampler
import torch

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize training data
train_inputs = tokenizer(list(data['text']), padding="max_length", truncation=True, max_length=512, return_tensors="pt")
train_labels = torch.tensor(data['label'].values)

# Create a TensorDataset for the training data
train_dataset = TensorDataset(train_inputs['input_ids'], train_inputs['attention_mask'], train_labels)
train_sampler = RandomSampler(train_dataset)  # Randomly sample for training
train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=8)

# Tokenize validation data
dev_inputs = tokenizer(list(dev['text']), padding="max_length", truncation=True, max_length=512, return_tensors="pt")
dev_labels = torch.tensor(dev['label'].values)

# Create a TensorDataset for the validation data
dev_dataset = TensorDataset(dev_inputs['input_ids'], dev_inputs['attention_mask'], dev_labels)
dev_dataloader = DataLoader(dev_dataset, batch_size=8)  # No need for a sampler here

Load the Pre-trained BERT Model

In [7]:
from transformers import BertForSequenceClassification

model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Setting up training enviornment

In [4]:
from torch.optim import AdamW
from transformers import get_scheduler

#Adam weighted optimizer ,model.parameters() gets weights and biases, lr == learning rate
optimizer = AdamW(model.parameters(), lr=5e-5)

num_epochs = 3
num_training_steps = num_epochs * len(inputs['input_ids'])
lr_scheduler = get_scheduler(
    name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
)


  return torch._C._cuda_getDeviceCount() if nvml_count < 0 else nvml_count


NameError: name 'inputs' is not defined

In [8]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

Training loop


In [9]:
import numpy as np

def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

for epoch in range(num_epochs):
    model.train()
    total_loss = 0

    for batch in train_dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        
        model.zero_grad()
        
        outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        
        total_loss += loss.item()

    avg_train_loss = total_loss / len(train_dataloader)
    print(f"Epoch {epoch + 1}/{num_epochs} - Loss: {avg_train_loss:.4f}")

    # Validation
    model.eval()
    eval_loss, eval_accuracy, nb_eval_steps = 0, 0, 0

    for batch in dev_dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        
        with torch.no_grad():
            outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
        
        logits = outputs.logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        
        tmp_eval_accuracy = flat_accuracy(logits, label_ids)
        eval_accuracy += tmp_eval_accuracy
        nb_eval_steps += 1

    print(f"Validation Accuracy: {eval_accuracy / nb_eval_steps:.4f}")

NameError: name 'lr_scheduler' is not defined