In [1]:
import os
os.environ['HF_HOME'] = '/data1/malto/cache'

# Baseline (Text Classification)

In [2]:
from transformers import TrainingArguments
import evaluate
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer
from transformers import AutoTokenizer
import numpy as np

os.environ['TOKENIZERS_PARALLELISM'] = "false"
os.environ["WANDB_DISABLED"] = "true"
#os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:512"

BATCH_SIZE = 4
NUM_EPOCHS = 5
MULTI_STAGES = False
FREEZE = False
FROZEN_LAYERS = 15

In [3]:
#checkpoint = "microsoft/deberta-v2-xxlarge-mnli" # too big cannot train all of it and freezing stuff is suboptimal
#checkpoint = "bert-base-uncased"
checkpoint = "microsoft/deberta-xlarge-mnli"
#checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [4]:
from datasets import load_dataset

ds = load_dataset("json", data_files=["/data1/malto/shroom/val.model-agnostic.json"]).shuffle()
ds2 = load_dataset("json", data_files=["/data1/malto/shroom/trial-v1.json"])
ds = ds['train'].train_test_split(train_size=0.8) # more representative, apparently trial is easier or something
#ds['test'] = ds2['train']
ds

DatasetDict({
    train: Dataset({
        features: ['labels', 'label', 'model', 'ref', 'hyp', 'task', 'tgt', 'p(Hallucination)', 'src'],
        num_rows: 399
    })
    test: Dataset({
        features: ['labels', 'label', 'model', 'ref', 'hyp', 'task', 'tgt', 'p(Hallucination)', 'src'],
        num_rows: 100
    })
})

In [5]:
from datasets import DatasetDict
ds_task = DatasetDict()
ds_not_task = DatasetDict()
ds_task['train'] = ds['train'].filter(lambda x: x['task'] == "MT")
ds_task['test'] = ds['test'].filter(lambda x: x['task'] == "MT")

ds_not_task['train'] = ds['train'].filter(lambda x: x['task'] != "MT")
ds_not_task['test'] = ds['test'].filter(lambda x: x['task'] != "MT")

ds_task, ds_not_task

Filter:   0%|          | 0/399 [00:00<?, ? examples/s]

Filter:   0%|          | 0/100 [00:00<?, ? examples/s]

Filter:   0%|          | 0/399 [00:00<?, ? examples/s]

Filter:   0%|          | 0/100 [00:00<?, ? examples/s]

(DatasetDict({
     train: Dataset({
         features: ['labels', 'label', 'model', 'ref', 'hyp', 'task', 'tgt', 'p(Hallucination)', 'src'],
         num_rows: 152
     })
     test: Dataset({
         features: ['labels', 'label', 'model', 'ref', 'hyp', 'task', 'tgt', 'p(Hallucination)', 'src'],
         num_rows: 35
     })
 }),
 DatasetDict({
     train: Dataset({
         features: ['labels', 'label', 'model', 'ref', 'hyp', 'task', 'tgt', 'p(Hallucination)', 'src'],
         num_rows: 247
     })
     test: Dataset({
         features: ['labels', 'label', 'model', 'ref', 'hyp', 'task', 'tgt', 'p(Hallucination)', 'src'],
         num_rows: 65
     })
 }))

In [6]:
def preprocess_function(examples):
    texts = []
    for hyp, tgt, task in zip(examples["hyp"], examples['tgt'], examples['task']):
        texts.append(f"{hyp} {tokenizer.sep_token} {task} {tokenizer.sep_token} {tgt}")
    model_inputs = tokenizer(texts)
    model_inputs["label"] = [1 if t == "Hallucination" else 0 for t in examples['label']]
    return model_inputs

In [7]:
ds = ds.map(preprocess_function, batched=True)
ds = ds.remove_columns(['hyp', 'ref', 'task', 'p(Hallucination)', 'labels', 'tgt', 'model', 'src'])

if ds_task is not None:
    ds_task = ds_task.map(preprocess_function, batched=True)
    ds_task = ds_task.remove_columns(['hyp', 'ref', 'task', 'p(Hallucination)', 'labels', 'tgt', 'model', 'src'])

    ds_not_task = ds_not_task.map(preprocess_function, batched=True)
    ds_not_task = ds_not_task.remove_columns(['hyp', 'ref', 'task', 'p(Hallucination)', 'labels', 'tgt', 'model', 'src'])

Map:   0%|          | 0/399 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/152 [00:00<?, ? examples/s]

Map:   0%|          | 0/35 [00:00<?, ? examples/s]

Map:   0%|          | 0/247 [00:00<?, ? examples/s]

Map:   0%|          | 0/65 [00:00<?, ? examples/s]

In [8]:
from transformers import DataCollatorWithPadding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [9]:
import evaluate
import numpy as np

def compute_metrics(eval_pred):
    accuracy = evaluate.load("accuracy")
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

In [10]:
id2label = {0: "Not Hallucination", 1: "Hallucination"}
label2id = {"Not Hallucination": 0, "Hallucination": 1}

In [11]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint, num_labels=2, id2label=id2label, label2id=label2id, ignore_mismatched_sizes=True
)

Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-xlarge-mnli and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([2, 1024]) in the model instantiated
- classifier.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([2]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
if FREEZE == True and checkpoint.startswith("microsoft"):
    print("freezing...")
    for param in model.deberta.embeddings.parameters():
        param.requires_grad = False
    for param in model.deberta.encoder.layer[:FROZEN_LAYERS].parameters():
        param.requires_grad = False

In [13]:
training_args = TrainingArguments(
    output_dir="/data1/malto/shroom/checkpoint/local_model",
    learning_rate=1e-5,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=NUM_EPOCHS,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    logging_strategy="epoch",
    save_strategy="no",
    logging_steps=1,
)

Using the `WAND_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [14]:
def train_with_dataset(ds):
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=ds["train"],
        eval_dataset=ds["test"],
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

    trainer.train()

In [15]:
if MULTI_STAGES == False:
    train_with_dataset(ds)
else:
    train_with_dataset(ds_not_task)
    train_with_dataset(ds_task)

***** Running training *****
  Num examples = 399
  Num Epochs = 5
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 500


Epoch,Training Loss,Validation Loss,Accuracy
1,0.5591,0.508947,0.79
2,0.3291,0.944425,0.79
3,0.0895,1.318534,0.75
4,0.0538,1.39008,0.78
5,0.025,1.475476,0.75


***** Running Evaluation *****
  Num examples = 100
  Batch size = 4
***** Running Evaluation *****
  Num examples = 100
  Batch size = 4
***** Running Evaluation *****
  Num examples = 100
  Batch size = 4
***** Running Evaluation *****
  Num examples = 100
  Batch size = 4
***** Running Evaluation *****
  Num examples = 100
  Batch size = 4


Training completed. Do not forget to share your model on huggingface.co/models =)


