In [120]:
from transformers import TrainingArguments
import evaluate
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer
from transformers import AutoTokenizer
import numpy as np
import torch
from torch.utils.data import DataLoader
from transformers.tokenization_utils_base import BatchEncoding
from typing import List, Dict

BATCH_SIZE = 20
NUM_EPOCHS = 10

checkpoint = "distilbert-base-uncased"
#checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [77]:
from datasets import load_dataset

ds = load_dataset("json", data_files=["/data1/malto/shroom/val.model-agnostic.json"])
ds2 = load_dataset("json", data_files=["/data1/malto/shroom/trial-v1.json"])
#ds = ds['train'].train_test_split(train_size=0.8)
ds['test'] = ds2['train']
ds

DatasetDict({
    train: Dataset({
        features: ['model', 'task', 'hyp', 'tgt', 'p(Hallucination)', 'labels', 'label', 'src', 'ref'],
        num_rows: 499
    })
    test: Dataset({
        features: ['model', 'task', 'hyp', 'tgt', 'p(Hallucination)', 'labels', 'label', 'src', 'ref'],
        num_rows: 80
    })
})

In [78]:
def preprocess_function(examples):
    #inputs = [f"Hyp: < {hyp} > Tgt: < {tgt} >" for hyp, tgt in zip(examples["hyp"], examples['tgt'])]
    inputs_hyp = tokenizer(examples["hyp"])
    inputs_tgt = tokenizer(examples["tgt"])
    model_inputs = {}
    model_inputs['hyp_input_ids'] = inputs_hyp['input_ids']
    model_inputs['hyp_attention_mask'] = inputs_hyp['attention_mask']
    model_inputs['tgt_input_ids'] = inputs_tgt['input_ids']
    model_inputs['tgt_attention_mask'] = inputs_tgt['attention_mask']
    model_inputs["label"] = [1 if t == "Hallucination" else 0 for t in examples['label']]
    return model_inputs

In [79]:
ds['train'][0].keys()

dict_keys(['model', 'task', 'hyp', 'tgt', 'p(Hallucination)', 'labels', 'label', 'src', 'ref'])

In [80]:
ds = ds.map(preprocess_function, batched=True)
ds = ds.remove_columns(['hyp', 'ref', 'task', 'p(Hallucination)', 'labels', 'tgt', 'model', 'src'])

In [85]:
ds['test'][0].keys()

dict_keys(['label', 'hyp_input_ids', 'hyp_attention_mask', 'tgt_input_ids', 'tgt_attention_mask'])

In [82]:
#from transformers import DataCollatorWithPadding
#data_collator = DataCollatorWithPadding(tokenizer=tokenizer)



In [221]:
def custom_collate(features: List[Dict[str, Any]]) -> Dict[str, Any]:
    batch = {}
    if 'label' in features[0].keys():
        batch['labels'] = torch.tensor([el['label'] for el in features])
        for feature in features:
            del feature['label']
    max_len = {}
    keys = features[0].keys()
    
    for key in keys:
        maximum = max([len(el[key]) for el in features])
        for el in features:
            el[key].extend(max(0, maximum - len(el[key]))*[0])
    for key in keys:
        feature = torch.tensor([el[key] for el in features])
        batch[key] = feature
    return batch

In [255]:
from typing import Any, Optional
import pytorch_lightning as pl
from pytorch_lightning.utilities.types import STEP_OUTPUT, OptimizerLRScheduler
from transformers import AutoModelForSequenceClassification, BertModel
import evaluate
from transformers import BatchEncoding

def compute_metrics(predictions, labels):
    accuracy = evaluate.load("accuracy")
    return accuracy.compute(predictions=predictions, references=labels)


class LitModel(pl.LightningModule):

    def __init__(self, checkpoint: str="bert-base-uncased") -> None:
        super().__init__()

        self.lr = 0.001
        id2label = {0: "Not Hallucination", 1: "Hallucination"}
        label2id = {"Not Hallucination": 0, "Hallucination": 1}

        self.encoder = BertModel.from_pretrained(checkpoint)
        self.decoder = torch.nn.Sequential(*[
            torch.nn.Linear(in_features=2*768, out_features=2*768, bias=True),
            torch.nn.LeakyReLU(),
            torch.nn.Linear(in_features=2*768, out_features=2*768, bias=True),
            torch.nn.LeakyReLU(),
            torch.nn.Dropout(p=0.2, inplace=False),
            torch.nn.Linear(in_features=2*768, out_features=2, bias=True),
        ])
        self.loss_fn = torch.nn.CrossEntropyLoss()

    def configure_optimizers(self) -> OptimizerLRScheduler:
        return torch.optim.Adam(self.parameters(), lr = self.lr)

    def forward(self, batch):
        encoding_hyp = self.encoder(batch['hyp_input_ids'], batch['hyp_attention_mask'])
        encoding_tgt = self.encoder(batch['tgt_input_ids'], batch['tgt_attention_mask'])
        
        encoding = torch.concat((encoding_hyp.pooler_output, encoding_tgt.pooler_output), dim=1)
        outputs = self.decoder(encoding)
        return outputs

    def loss_function(self, outputs, labels):
        return self.loss_fn(outputs, labels)

    def training_step(self, batch, batch_idx):
        labels = torch.Tensor(batch['labels'])
        outputs = self.forward(batch)
        loss = self.loss_function(outputs, labels)
        self.log("train_loss", loss.item(), prog_bar=True)
        return {"loss": loss}

    def validation_step(self, batch, batch_idx):
        labels = torch.Tensor(batch['labels'])
        outputs = self.forward(batch)
        loss = self.loss_function(outputs, labels)
        self.log("val_loss", loss.item(), prog_bar=True)
        return {"loss": loss}

    def test_step(self, batch, batch_idx):
        return self.inference_step(batch)

    def inference_epoch_end(self, outputs, inference_batch):
        labels = inference_batch['labels']
        outputs_labels = np.argmax(outputs.logits.detach().cpu().numpy(), axis=1)

        self.log("accuracy", evaluate.load("accuracy").compute(predictions=outputs_labels, references=labels), prog_bar=True)

In [257]:
from pytorch_lightning.loggers import CSVLogger
import yaml
from pathlib import Path

with open('config.yaml', 'r') as file:
    config = yaml.safe_load(file)
BASE_DIR = Path(config['base_dir'])

# create a dataloader starting from DatasetDict

train_dataloader = DataLoader(ds['train'], batch_size=BATCH_SIZE, collate_fn=custom_collate, shuffle=True)
test_dataloader = DataLoader(ds['test'], batch_size=BATCH_SIZE, collate_fn=custom_collate, shuffle=True)
#val_dataloader = DataLoader(ds['validation'], batch_size=BATCH_SIZE, collate_fn=data_collator, shuffle=True)
val_dataloader = test_dataloader

model = LitModel()

trainer = pl.Trainer(accelerator='gpu',
                     max_epochs=5,
                     logger=CSVLogger(save_dir=BASE_DIR / "logs"),
                     log_every_n_steps=1
                     )

trainer.fit(model=model,
            train_dataloaders=train_dataloader,
            val_dataloaders=val_dataloader)


Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOC

Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

`Trainer.fit` stopped: `max_epochs=5` reached.


In [258]:
trainer.fit(model = model,
            train_dataloaders = train_dataloader,
            val_dataloaders = val_dataloader)

trainer.test(model = model,
             dataloaders=test_dataloader)

  rank_zero_warn(f"Checkpoint directory {dirpath} exists and is not empty.")
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name    | Type             | Params
---------------------------------------------
0 | encoder | BertModel        | 109 M 
1 | decoder | Sequential       | 4.7 M 
2 | loss_fn | CrossEntropyLoss | 0     
---------------------------------------------
114 M     Trainable params
0         Non-trainable params
114 M     Total params
456.828   Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

`Trainer.fit` stopped: `max_epochs=5` reached.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
  rank_zero_warn(
  rank_zero_warn(


Testing: 0it [00:00, ?it/s]

AttributeError: 'LitModel' object has no attribute 'inference_step'