In [1]:
import os
# Redirect the Hugging Face cache directory to the shared data disk.
# This runs before any Hugging Face import in the notebook -- presumably so the
# libraries pick the value up when they are first imported (TODO confirm).
os.environ['HF_HOME'] = '/data1/malto/cache'

In [2]:
from transformers import Trainer
import torch.nn as nn

class CustomTrainer(Trainer):
    """Trainer that fits a soft hallucination target with a weighted BCE loss.

    Besides the regular model inputs, every batch must carry two extra entries:

    * ``"p(Hallucination)"`` -- the target passed to the binary cross-entropy;
    * ``"C-W"`` -- a per-example weight multiplied into each example's loss.

    Only the first output logit (``logits[:, 0]``) is scored against the target.
    NOTE(review): the label maps elsewhere in the notebook put "Hallucination"
    at index 1, while the loss trains index 0 -- confirm this is intended.
    """

    # Batch entries consumed by the loss and never forwarded to the model.
    _LOSS_ONLY_KEYS = ("p(Hallucination)", "C-W")

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the weighted binary cross-entropy loss for one batch.

        Args:
            model: The model to run; it is called as ``model(**model_inputs)``
                and its output must expose a ``"logits"`` entry.
            inputs: Mapping holding the model inputs plus the two extra
                entries described on the class. It is not modified.
            return_outputs: When true, return ``(loss, outputs)`` instead of
                just the loss.
            num_items_in_batch: Accepted (and ignored) so the override stays
                callable from training loops that pass this keyword; the loss
                is always a plain mean over the batch.

        Returns:
            The scalar loss, or ``(loss, outputs)`` if ``return_outputs`` is true.

        Raises:
            KeyError: If one of the two extra entries is missing from ``inputs``.
        """
        p_hall = inputs["p(Hallucination)"]
        cond_weights = inputs["C-W"]
        # Build a filtered copy instead of popping so the caller's batch is
        # left intact (no need to restore the keys after the call).
        model_inputs = {
            key: value for key, value in inputs.items() if key not in self._LOSS_ONLY_KEYS
        }

        # forward pass
        outputs = model(**model_inputs)
        logits = outputs["logits"][:, 0]

        # Align dtypes with the logits so integer/double columns or
        # reduced-precision logits do not break the loss computation.
        p_hall = p_hall.to(logits.dtype)
        cond_weights = cond_weights.to(logits.dtype)

        loss_fn = nn.BCEWithLogitsLoss(reduction='none')
        loss = (cond_weights * loss_fn(logits, p_hall)).mean()
        return (loss, outputs) if return_outputs else loss

    # Disabled experiment kept for reference (previously parked in a string
    # literal inside the class body):
    #
    # def prediction_step(self, model, inputs, prediction_loss_only, ignore_keys):
    #     phall = inputs.pop("p(Hallucination)")
    #     cw = inputs.pop("C-W")
    #     loss, logits, labels = super().prediction_step(model, inputs, prediction_loss_only, ignore_keys)
    #     inputs['p(Hallucination)'] = phall
    #     inputs['C-W'] = cw
    #     loss = self.compute_loss(model, inputs)
    #     return loss, logits, labels

In [3]:
from transformers.trainer_pt_utils import is_sagemaker_mp_enabled

# Environment probe: show whether transformers considers SageMaker model
# parallelism enabled in this kernel (the recorded output is False).
is_sagemaker_mp_enabled()

False

In [4]:
import evaluate
from transformers import AutoModelForSequenceClassification, AutoTokenizer, TrainingArguments, Trainer, DataCollatorWithPadding
import numpy as np
from pathlib import Path

# Environment switches, set before the tokenizer and trainer are created.
# NOTE(review): presumably these silence the tokenizers fork/parallelism warning
# and turn off Weights & Biases logging -- confirm against the library docs.
os.environ['TOKENIZERS_PARALLELISM'] = "false"
os.environ["WANDB_DISABLED"] = "true"

# Run-level constants used by the cells below.
BATCH_SIZE = 4  # per-device batch size for both training and evaluation
NUM_EPOCHS = 2  # number of training epochs
BASE_DIR = Path("/data1/malto/shroom/")  # directory holding the dataset JSON files

# Pretrained checkpoint to fine-tune; the commented line is an alternative
# checkpoint that can be swapped in.
checkpoint = "microsoft/deberta-xlarge-mnli"
#checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
# Tokenizer matching the checkpoint, and a collator built on that tokenizer
# to pad each batch.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [5]:
def preprocess_function(examples): # not batched
    """Tokenize one dataset row as a (hypothesis, reference) text pair.

    The reference text is ``examples['src']`` when ``examples['ref']`` equals
    ``'src'`` and ``examples['tgt']`` for every other value of ``ref``.

    Args:
        examples: A single row (the function is mapped without batching) with
            the keys ``hyp``, ``tgt``, ``src``, ``ref`` and ``labels``.

    Returns:
        The tokenizer output, extended with a ``"labels"`` list that holds 1
        for every ``"Hallucination"`` entry of ``examples['labels']`` and 0
        for anything else.
    """
    hypothesis = examples['hyp']
    if examples['ref'] != 'src':
        reference = examples['tgt']
    else:
        reference = examples['src']

    encoded = tokenizer(hypothesis, reference)
    encoded["labels"] = [int(vote == "Hallucination") for vote in examples['labels']]
    return encoded

def compute_metrics(eval_pred):
    """Compute accuracy from raw per-class scores.

    Args:
        eval_pred: A ``(predictions, labels)`` pair. ``predictions`` holds
            per-class scores and is reduced with an argmax over axis 1.

    Returns:
        Whatever the ``evaluate`` "accuracy" metric's ``compute`` returns for
        the predicted class ids and the reference labels.
    """
    metric = evaluate.load("accuracy")
    scores, references = eval_pred
    predicted_ids = np.argmax(scores, axis=1)
    return metric.compute(predictions=predicted_ids, references=references)

# Label vocabulary of the two-way classification head; label2id is the exact
# inverse of id2label.
id2label = {0: "Not Hallucination", 1: "Hallucination"}
label2id = {name: index for index, name in id2label.items()}

# Load the checkpoint with a two-label head. ignore_mismatched_sizes=True lets
# loading proceed even though the checkpoint's classifier has a different
# shape (see the size-mismatch log recorded below this cell).
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
    ignore_mismatched_sizes=True,
)

  return self.fget.__get__(instance, owner)()
Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-xlarge-mnli and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([2, 1024]) in the model instantiated
- classifier.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([2]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
from datasets import load_dataset, concatenate_datasets
# Dataset loading: read the three labelled training files (MT, DM and PG) and
# the validation file from BASE_DIR. Each call returns a mapping whose rows are
# read from the "train" split by the cells below.
# NOTE(review): "SOLAR" in the file names presumably identifies how the
# training labels were produced -- confirm with the data owners.
ds_mt = load_dataset("json", data_files=[str(BASE_DIR / "train_labeled_MT_SOLAR.model-agnostic.json")])
ds_dm = load_dataset("json", data_files=[str(BASE_DIR / "train_labeled_DM_SOLAR.model-agnostic.json")])
ds_pg = load_dataset("json", data_files=[str(BASE_DIR / "train_labeled_PG_SOLAR.model-agnostic.json")])
ds_val = load_dataset("json", data_files=[str(BASE_DIR / "val.model-agnostic.json")])

In [7]:
def _drop_columns_missing_from_val(dataset_dict):
    """Return ``dataset_dict`` without the columns the validation data lacks."""
    shared_columns = ds_val['train'].column_names
    extra_columns = [
        name for name in dataset_dict['train'].column_names if name not in shared_columns
    ]
    return dataset_dict.remove_columns(extra_columns)


# Give the three training sets the same columns as the validation set.
ds_mt = _drop_columns_missing_from_val(ds_mt)
ds_dm = _drop_columns_missing_from_val(ds_dm)
ds_pg = _drop_columns_missing_from_val(ds_pg)

In [8]:
# Pool all four sources, hold out 20% as the test split (fixed seed), tokenize
# row by row, then drop the raw text/label columns the tokenized data no
# longer needs.
ds = (
    concatenate_datasets([ds_mt['train'], ds_dm['train'], ds_pg['train'], ds_val['train']])
    .train_test_split(test_size=0.2, seed=1337)
    .map(preprocess_function)
    .remove_columns(['hyp', 'src', 'task', 'ref', 'tgt', 'model', 'labels', 'label'])
)

In [9]:
# Training configuration: evaluate and log once per epoch, never save
# checkpoints, and do not report to any external tracker.
training_args = TrainingArguments(
    output_dir="/data1/malto/shroom/checkpoint/local_model",
    learning_rate=1e-5,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=NUM_EPOCHS,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    logging_strategy="epoch",
    save_strategy="no",
    logging_steps=1,  # NOTE(review): likely has no effect while logging_strategy="epoch" -- confirm
    report_to="none",
    # Keep the extra "p(Hallucination)" and "C-W" columns in each batch;
    # CustomTrainer.compute_loss reads both.
    remove_unused_columns=False,
)

# Only the first 1,000 rows of each split are used for this run.
trainer = CustomTrainer(
        model=model,
        args=training_args,
        train_dataset=ds["train"].select(range(1_000)),
        eval_dataset=ds["test"].select(range(1_000)),
        tokenizer=tokenizer,
        data_collator=data_collator,
        #compute_metrics=compute_metrics,
    )

# The dataset has no "labels" column at this point (it was dropped after
# tokenization), so the trainer is told there are no label columns.
# NOTE(review): together with can_return_loss=True this appears intended to make
# evaluation still go through compute_loss and report eval_loss -- confirm
# against Trainer.prediction_step in the installed transformers version.
trainer.label_names = []
trainer.can_return_loss = True

In [10]:
# Evaluate on the held-out subset before fine-tuning to get a baseline eval_loss.
trainer.evaluate()

You're using a DebertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'eval_loss': 0.0799412652850151,
 'eval_runtime': 9.7998,
 'eval_samples_per_second': 102.043,
 'eval_steps_per_second': 25.511}

In [11]:
# Fine-tune for NUM_EPOCHS epochs; training and validation loss are reported
# once per epoch (see the table recorded below).
trainer.train()

Epoch,Training Loss,Validation Loss
1,0.0556,0.04715
2,0.0357,0.04945


TrainOutput(global_step=500, training_loss=0.04564853477478027, metrics={'train_runtime': 124.0079, 'train_samples_per_second': 16.128, 'train_steps_per_second': 4.032, 'total_flos': 265079329444848.0, 'train_loss': 0.04564853477478027, 'epoch': 2.0})