# Baseline (Text Classification)

In [1]:
from transformers import TrainingArguments
import evaluate
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer
from transformers import AutoTokenizer
import numpy as np
import os

os.environ['TOKENIZERS_PARALLELISM'] = "false"
os.environ["WANDB_DISABLED"] = "true"
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:512"

BATCH_SIZE = 10
NUM_EPOCHS = 10

checkpoint = "bert-base-uncased"
#checkpoint = "microsoft/deberta-xlarge-mnli"
#checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [2]:
from datasets import load_dataset

ds = load_dataset("json", data_files=["/data1/malto/shroom/val.model-agnostic.json"])
ds2 = load_dataset("json", data_files=["/data1/malto/shroom/trial-v1.json"])
#ds = ds['train'].train_test_split(train_size=0.8)
ds['test'] = ds2['train']
ds

DatasetDict({
    train: Dataset({
        features: ['model', 'task', 'hyp', 'tgt', 'p(Hallucination)', 'labels', 'label', 'src', 'ref'],
        num_rows: 499
    })
    test: Dataset({
        features: ['model', 'task', 'hyp', 'tgt', 'p(Hallucination)', 'labels', 'label', 'src', 'ref'],
        num_rows: 80
    })
})

In [3]:
def preprocess_function(examples):
    model_inputs = tokenizer(examples["hyp"], examples['tgt'])
    model_inputs["label"] = [1 if t == "Hallucination" else 0 for t in examples['label']]
    return model_inputs

In [4]:
ds = ds.map(preprocess_function, batched=True)
ds = ds.remove_columns(['hyp', 'ref', 'task', 'p(Hallucination)', 'labels', 'tgt', 'model', 'src'])

In [5]:
from transformers import DataCollatorWithPadding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [6]:
import evaluate
import numpy as np

def compute_metrics(eval_pred):
    accuracy = evaluate.load("accuracy")
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

In [7]:
id2label = {0: "Not Hallucination", 1: "Hallucination"}
label2id = {"Not Hallucination": 0, "Hallucination": 1}

In [8]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint, num_labels=2, id2label=id2label, label2id=label2id, ignore_mismatched_sizes=True
)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [9]:
for param in model.base_model.parameters():
    param.requires_grad = False

In [10]:
training_args = TrainingArguments(
    output_dir="/data1/malto/shroom/checkpoint/local_model",
    learning_rate=1e-3,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=NUM_EPOCHS,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    logging_strategy="epoch",
    save_strategy="no",
    logging_steps=1,
    per_gpu_train_batch_size=4
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=ds["train"],
    eval_dataset=ds["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

Using the `WAND_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Using deprecated `--per_gpu_train_batch_size` argument which will be removed in a future version. Using `--per_device_train_batch_size` is preferred.
Using deprecated `--per_gpu_train_batch_size` argument which will be removed in a future version. Using `--per_device_train_batch_size` is preferred.
***** Running training *****
  Num examples = 499
  Num Epochs = 10
  Instantaneous batch size per device = 10
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 1250
Using deprecated `--per_gpu_train_batch_size` argument which will be removed in a future version. Using `--per_device_train_batch_size` is preferred.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.7343,0.649313,0.55
2,0.847,0.673654,0.525
3,0.7611,0.672352,0.575
4,0.6948,0.654183,0.5875
5,0.6982,0.624871,0.6125
6,0.6673,0.625629,0.5875
7,0.6665,0.621882,0.625
8,0.6576,0.636726,0.5875
9,0.6571,0.621338,0.6375
10,0.6598,0.620416,0.6375


***** Running Evaluation *****
  Num examples = 80
  Batch size = 10
***** Running Evaluation *****
  Num examples = 80
  Batch size = 10
***** Running Evaluation *****
  Num examples = 80
  Batch size = 10
***** Running Evaluation *****
  Num examples = 80
  Batch size = 10
***** Running Evaluation *****
  Num examples = 80
  Batch size = 10
***** Running Evaluation *****
  Num examples = 80
  Batch size = 10
***** Running Evaluation *****
  Num examples = 80
  Batch size = 10
***** Running Evaluation *****
  Num examples = 80
  Batch size = 10
***** Running Evaluation *****
  Num examples = 80
  Batch size = 10
***** Running Evaluation *****
  Num examples = 80
  Batch size = 10


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=1250, training_loss=0.7043776916503907, metrics={'train_runtime': 20.733, 'train_samples_per_second': 240.679, 'train_steps_per_second': 60.29, 'total_flos': 85094843080200.0, 'train_loss': 0.7043776916503907, 'epoch': 10.0})

### Overfitting ?

In [11]:
trainer.evaluate(ds['test'])

***** Running Evaluation *****
  Num examples = 80
  Batch size = 10


{'eval_loss': 0.6204155087471008,
 'eval_accuracy': 0.6375,
 'eval_runtime': 1.1909,
 'eval_samples_per_second': 67.176,
 'eval_steps_per_second': 6.718,
 'epoch': 10.0}

In [12]:
altro = load_dataset("json", data_files=["/data1/malto/shroom/val.model-aware.json"])
#altro = altro['train'].train_test_split(train_size=0.8)
altro = altro.map(preprocess_function, batched=True)
altro = altro.remove_columns(['hyp', 'ref', 'task', 'p(Hallucination)', 'labels', 'tgt', 'model', 'src'])
trainer.evaluate(altro['train'])

***** Running Evaluation *****
  Num examples = 501
  Batch size = 10


{'eval_loss': 0.7934403419494629,
 'eval_accuracy': 0.5069860279441117,
 'eval_runtime': 1.9396,
 'eval_samples_per_second': 258.296,
 'eval_steps_per_second': 26.294,
 'epoch': 10.0}