In [1]:
from transformers import TrainingArguments
import evaluate
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer
from transformers import AutoTokenizer
import numpy as np

checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from datasets import load_dataset

ds = load_dataset("json", data_files=["dataset/trial-v1.json", "dataset/trial-v1.json"])
ds = ds['train'].train_test_split(train_size=0.8)
ds

DatasetDict({
    train: Dataset({
        features: ['src', 'label', 'hyp', 'p(Hallucination)', 'model', 'labels', 'tgt', 'ref', 'task'],
        num_rows: 128
    })
    test: Dataset({
        features: ['src', 'label', 'hyp', 'p(Hallucination)', 'model', 'labels', 'tgt', 'ref', 'task'],
        num_rows: 32
    })
})

In [3]:
ds['train'][0]

{'src': 'He climbed over the sedge and eely oarweeds and sat on a stool of rock , resting his ashplant in a <define> grike </define> .',
 'label': 'Hallucination',
 'hyp': '(Australia, New Zealand, colloquial) A tree.',
 'p(Hallucination)': 1.0,
 'model': '',
 'labels': ['Hallucination', 'Hallucination', 'Hallucination'],
 'tgt': '(chiefly, British) A deep cleft formed in limestone surfaces due to water erosion; providing a unique habitat for plants.',
 'ref': 'tgt',
 'task': 'DM'}

In [4]:
def preprocess_function(examples):
    #inputs = [f"Hyp: < {hyp} > Tgt: < {tgt} >" for hyp, tgt in zip(examples["hyp"], examples['tgt'])]
    model_inputs = tokenizer(examples["tgt"], examples["hyp"], max_length=1024, truncation=True)
    model_inputs["label"] = [1 if t == "Hallucination" else 0 for t in examples['label']]
    return model_inputs

In [5]:
ds = ds.map(preprocess_function, batched=True)
ds = ds.remove_columns(['hyp', 'ref', 'task', 'p(Hallucination)', 'labels', 'tgt', 'model', 'src'])

Map: 100%|██████████| 128/128 [00:00<00:00, 3458.51 examples/s]
Map: 100%|██████████| 32/32 [00:00<00:00, 1184.36 examples/s]


In [6]:
from transformers import DataCollatorWithPadding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [7]:
import evaluate
import numpy as np

accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

Downloading builder script: 100%|██████████| 4.20k/4.20k [00:00<00:00, 4.21MB/s]


In [8]:
id2label = {0: "Not Hallucination", 1: "Hallucination"}
label2id = {"Not Hallucination": 0, "Hallucination": 1}

In [9]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint, num_labels=2, id2label=id2label, label2id=label2id
)

Downloading model.safetensors: 100%|██████████| 268M/268M [25:00<00:00, 179kB/s] 
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
training_args = TrainingArguments(
    output_dir="local_model",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=15,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_steps=1,
    load_best_model_at_end=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=ds["train"],
    eval_dataset=ds["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

ImportError: Using the `Trainer` with `PyTorch` requires `accelerate>=0.20.1`: Please run `pip install transformers[torch]` or `pip install accelerate -U`