In [60]:
from transformers import TrainingArguments
import evaluate
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer
from transformers import AutoTokenizer
import numpy as np
import torch
from torch.utils.data import DataLoader
from transformers.tokenization_utils_base import BatchEncoding

# Training hyper-parameters: BATCH_SIZE sizes every DataLoader batch and
# NUM_EPOCHS caps the number of epochs the Trainer runs.
BATCH_SIZE, NUM_EPOCHS = 20, 25

# Hugging Face model id of the pretrained encoder; the tokenizer is loaded from
# the same id so the vocabulary matches the model weights.
# Alternative checkpoint (currently disabled): "distilbert-base-uncased-finetuned-sst-2-english"
checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [2]:
from datasets import load_dataset

# Each JSON file is loaded on its own; `load_dataset` exposes a file list
# without split names under the single split "train".
val_files = ["/data1/malto/shroom/val.model-agnostic.json"]
trial_files = ["/data1/malto/shroom/trial-v1.json"]

ds = load_dataset("json", data_files=val_files)
ds2 = load_dataset("json", data_files=trial_files)

# The model-agnostic validation file is used as the training split and the
# trial file is attached as the held-out "test" split.
ds["test"] = ds2["train"]
ds

DatasetDict({
    train: Dataset({
        features: ['tgt', 'label', 'task', 'hyp', 'p(Hallucination)', 'model', 'ref', 'labels', 'src'],
        num_rows: 499
    })
    test: Dataset({
        features: ['tgt', 'label', 'task', 'hyp', 'p(Hallucination)', 'model', 'ref', 'labels', 'src'],
        num_rows: 80
    })
})

In [3]:
def preprocess_function(examples):
    """Tokenize a batch of hypothesis/target pairs and attach binary labels.

    Args:
        examples: A batch (dict of columns) with at least the keys ``"hyp"``,
            ``"tgt"`` and ``"label"``, each holding one entry per example.

    Returns:
        The tokenizer output for the formatted prompts, extended with a
        ``"label"`` entry that is 1 where the source label equals
        ``"Hallucination"`` and 0 otherwise.
    """
    inputs = [
        f"Hyp: < {hyp} > Tgt: < {tgt} >"
        for hyp, tgt in zip(examples["hyp"], examples["tgt"])
    ]
    # Truncate to the tokenizer's maximum length so an unusually long pair
    # cannot produce a sequence longer than the model accepts.
    model_inputs = tokenizer(inputs, truncation=True)
    model_inputs["label"] = [
        1 if raw_label == "Hallucination" else 0 for raw_label in examples["label"]
    ]
    return model_inputs

In [4]:
# Columns that are no longer needed once the prompts are tokenized and the
# binary "label" column has been written by preprocess_function.
unused_columns = ['hyp', 'ref', 'task', 'p(Hallucination)', 'labels', 'tgt', 'model', 'src']

# Tokenize every split in batches, then drop the raw columns.
ds = ds.map(preprocess_function, batched=True).remove_columns(unused_columns)

In [5]:
# Inspect the processed training split to check which columns remain after
# tokenization and column removal.
ds['train']

Dataset({
    features: ['label', 'input_ids', 'attention_mask'],
    num_rows: 499
})

In [6]:
from transformers import DataCollatorWithPadding
# Collator passed as `collate_fn` to every DataLoader below; it pads the
# tokenized examples of a batch using this tokenizer.
# NOTE(review): preprocess_function stores targets under "label" while
# LitModel.training_step reads batch['labels']; this relies on the collator
# renaming the key — confirm for the installed transformers version.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [62]:
from typing import Any, Optional
import pytorch_lightning as pl
from pytorch_lightning.utilities.types import STEP_OUTPUT, OptimizerLRScheduler
from transformers import AutoModelForSequenceClassification
import evaluate


def compute_metrics(predictions, labels):
    """Score ``predictions`` against ``labels`` with the "accuracy" metric.

    The metric is loaded through ``evaluate.load`` on every call and the
    result of its ``compute`` method is returned unchanged.
    """
    metric = evaluate.load("accuracy")
    scores = metric.compute(references=labels, predictions=predictions)
    return scores


class LitModel(pl.LightningModule):
    """Lightning wrapper around a two-class sequence-classification model.

    The wrapped model is loaded with ``AutoModelForSequenceClassification`` and
    labelled "Not Hallucination" (0) / "Hallucination" (1). Training minimises
    cross-entropy on the raw logits; validation and test steps log the loss and
    accuracy under the keys ``val_loss``/``val_accuracy`` and
    ``test_loss``/``test_accuracy``.

    Batches are expected to be mappings with the keys ``input_ids``,
    ``attention_mask`` and ``labels``.
    """

    def __init__(self, checkpoint: str, lr: float = 0.001) -> None:
        """Load the pretrained classifier.

        Args:
            checkpoint: Model id or path passed to ``from_pretrained``.
            lr: Learning rate handed to Adam.
                NOTE(review): the default is kept at the original 0.001, which
                is far above the 1e-5..5e-5 range commonly used when
                fine-tuning transformer encoders — consider lowering it.
        """
        super().__init__()

        self.lr = lr
        id2label = {0: "Not Hallucination", 1: "Hallucination"}
        label2id = {"Not Hallucination": 0, "Hallucination": 1}

        self.model = AutoModelForSequenceClassification.from_pretrained(
            checkpoint, num_labels=2, id2label=id2label, label2id=label2id
        )
        self.loss_fn = torch.nn.CrossEntropyLoss()

    def configure_optimizers(self) -> OptimizerLRScheduler:
        """Return an Adam optimizer over all parameters using ``self.lr``."""
        return torch.optim.Adam(self.parameters(), lr=self.lr)

    def forward(self, data):
        """Run the wrapped model.

        ``data`` may be a batch mapping (``dict`` or ``BatchEncoding``), in
        which case its ``input_ids`` and optional ``attention_mask`` are
        forwarded by keyword, or any other object, which is passed to the
        model positionally exactly as before.
        """
        if isinstance(data, (dict, BatchEncoding)):
            return self.model(
                input_ids=data["input_ids"],
                attention_mask=data.get("attention_mask"),
            )
        return self.model(data)

    def loss_function(self, outputs, labels):
        """Cross-entropy between ``outputs`` (logits) and ``labels``."""
        return self.loss_fn(outputs, labels)

    def training_step(self, batch, batch_idx):
        """Compute and log the training loss for one batch.

        Returns:
            A dict with the key ``"loss"`` holding the differentiable loss.
        """
        labels = batch['labels']
        outputs = self.inference_step(batch)

        # The loss must be taken on the logits themselves: arg-maxing them
        # first (as the previous version did) detaches the result from the
        # graph, so no gradient would ever reach the model.
        loss = self.loss_function(outputs.logits, labels)

        self.log("train_loss", loss.detach(), prog_bar=True, batch_size=labels.size(0))
        return {"loss": loss}

    def inference_step(self, batch):
        """Forward ``input_ids`` and ``attention_mask`` of ``batch`` through the model."""
        return self.model(
            input_ids=batch['input_ids'],
            attention_mask=batch['attention_mask'],
        )

    def _evaluate_batch(self, batch, stage):
        """Log ``{stage}_loss`` and ``{stage}_accuracy`` for one evaluation batch."""
        labels = batch['labels']
        logits = self.inference_step(batch).logits

        loss = self.loss_function(logits, labels)
        accuracy = (logits.argmax(dim=1) == labels).float().mean()

        # batch_size is given explicitly so the logged values are weighted by
        # the real number of examples (the last batch may be smaller).
        batch_size = labels.size(0)
        self.log(f"{stage}_loss", loss, prog_bar=True, batch_size=batch_size)
        self.log(f"{stage}_accuracy", accuracy, prog_bar=True, batch_size=batch_size)

    def validation_step(self, batch, batch_idx):
        """Log validation loss and accuracy; nothing is returned, so no per-batch outputs are retained."""
        self._evaluate_batch(batch, "val")

    def test_step(self, batch, batch_idx):
        """Log test loss and accuracy; nothing is returned, so no per-batch outputs are retained."""
        self._evaluate_batch(batch, "test")

    def inference_epoch_end(self, outputs, inference_batch):
        """Log the accuracy of ``outputs`` against ``inference_batch['labels']``.

        This method is not invoked by any code in this class; it is kept for
        callers that invoke it explicitly. Per-batch metrics are logged by
        ``validation_step`` and ``test_step``.
        """
        labels = inference_batch['labels']
        predictions = outputs.logits.detach().argmax(dim=1)
        accuracy = (predictions == labels.to(predictions.device)).float().mean()

        self.log("accuracy", accuracy, prog_bar=True)

In [63]:
# Build DataLoaders from the DatasetDict splits. Only the training loader is
# shuffled; the evaluation loader keeps a fixed order so runs are repeatable.
train_dataloader = DataLoader(ds['train'], batch_size=BATCH_SIZE, collate_fn=data_collator, shuffle=True)
test_dataloader = DataLoader(ds['test'], batch_size=BATCH_SIZE, collate_fn=data_collator, shuffle=False)

# There is no dedicated validation split, so the test loader doubles as the
# validation loader. Validation numbers are therefore not an independent
# estimate of test performance.
val_dataloader = test_dataloader

model = LitModel(checkpoint)

trainer = pl.Trainer(
    accelerator='gpu',
    max_epochs=NUM_EPOCHS,
)

# Baseline evaluation of the freshly initialised classification head.
trainer.validate(model=model, dataloaders=val_dataloader)

trainer.fit(
    model=model,
    train_dataloaders=train_dataloader,
    val_dataloaders=val_dataloader,
)

# `dataloaders` is the keyword already used for `trainer.validate` above.
trainer.test(model=model, dataloaders=test_dataloader)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


OutOfMemoryError: CUDA out of memory. Tried to allocate 20.00 MiB (GPU 0; 23.70 GiB total capacity; 22.21 GiB already allocated; 13.75 MiB free; 22.57 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF