# Baseline (Text Classification)

In [13]:
from transformers import TrainingArguments
import evaluate
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer
from transformers import AutoTokenizer
import numpy as np
import os

os.environ['TOKENIZERS_PARALLELISM'] = "false"
os.environ["WANDB_DISABLED"] = "true"
#os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:512"

BATCH_SIZE = 30
NUM_EPOCHS = 7
FREEZE = True

#checkpoint = "microsoft/deberta-v2-xxlarge-mnli" # too big cannot train all of it and freezing stuff is suboptimal
#checkpoint = "bert-base-uncased"
checkpoint = "microsoft/deberta-xlarge-mnli"
#checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

loading configuration file https://huggingface.co/microsoft/deberta-v2-xxlarge-mnli/resolve/main/config.json from cache at /home/fborra/.cache/huggingface/transformers/0377df7d05ddc8e629ee37470a404b65edf531e9912dd70715ebd8358f8b2b28.543c5fdf83b4099c4885a4668dee5dddf66b77cd9a7c1c0bcbe7130cd926e467
Model config DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v2-xxlarge-mnli",
  "architectures": [
    "DebertaV2ForSequenceClassification"
  ],
  "attention_head_size": 64,
  "attention_probs_dropout_prob": 0.1,
  "conv_act": "gelu",
  "conv_kernel_size": 3,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1536,
  "id2label": {
    "0": "CONTRADICTION",
    "1": "NEUTRAL",
    "2": "ENTAILMENT"
  },
  "initializer_range": 0.02,
  "intermediate_size": 6144,
  "label2id": {
    "CONTRADICTION": 0,
    "ENTAILMENT": 2,
    "NEUTRAL": 1
  },
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "

In [14]:
from datasets import load_dataset

ds = load_dataset("json", data_files=["/data1/malto/shroom/val.model-agnostic.json"]).shuffle()
ds2 = load_dataset("json", data_files=["/data1/malto/shroom/trial-v1.json"])
#ds = ds['train'].train_test_split(train_size=0.8)
ds['test'] = ds2['train']
ds

DatasetDict({
    train: Dataset({
        features: ['hyp', 'labels', 'model', 'label', 'tgt', 'ref', 'task', 'p(Hallucination)', 'src'],
        num_rows: 499
    })
    test: Dataset({
        features: ['hyp', 'labels', 'model', 'label', 'tgt', 'ref', 'task', 'p(Hallucination)', 'src'],
        num_rows: 80
    })
})

In [15]:
def preprocess_function(examples):
    model_inputs = tokenizer(examples["hyp"], examples['tgt'])
    model_inputs["label"] = [1 if t == "Hallucination" else 0 for t in examples['label']]
    return model_inputs

In [16]:
ds = ds.map(preprocess_function, batched=True)
ds = ds.remove_columns(['hyp', 'ref', 'task', 'p(Hallucination)', 'labels', 'tgt', 'model', 'src'])

In [17]:
from transformers import DataCollatorWithPadding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [18]:
import evaluate
import numpy as np

def compute_metrics(eval_pred):
    accuracy = evaluate.load("accuracy")
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

In [19]:
id2label = {0: "Not Hallucination", 1: "Hallucination"}
label2id = {"Not Hallucination": 0, "Hallucination": 1}

In [20]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint, num_labels=2, id2label=id2label, label2id=label2id, ignore_mismatched_sizes=True
)

loading configuration file https://huggingface.co/microsoft/deberta-v2-xxlarge-mnli/resolve/main/config.json from cache at /home/fborra/.cache/huggingface/transformers/0377df7d05ddc8e629ee37470a404b65edf531e9912dd70715ebd8358f8b2b28.543c5fdf83b4099c4885a4668dee5dddf66b77cd9a7c1c0bcbe7130cd926e467
Model config DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v2-xxlarge-mnli",
  "architectures": [
    "DebertaV2ForSequenceClassification"
  ],
  "attention_head_size": 64,
  "attention_probs_dropout_prob": 0.1,
  "conv_act": "gelu",
  "conv_kernel_size": 3,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1536,
  "id2label": {
    "0": "Not Hallucination",
    "1": "Hallucination"
  },
  "initializer_range": 0.02,
  "intermediate_size": 6144,
  "label2id": {
    "Hallucination": 1,
    "Not Hallucination": 0
  },
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_no

In [21]:
if FREEZE == True and checkpoint == "microsoft/deberta-v2-xxlarge-mnli":
    for param in model.deberta.embeddings.parameters():
        param.requires_grad = False
    for param in model.deberta.encoder.layer[:20].parameters():
        param.requires_grad = False

In [25]:
training_args = TrainingArguments(
    output_dir="/data1/malto/shroom/checkpoint/local_model",
    learning_rate=1e-6,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=NUM_EPOCHS,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    logging_strategy="epoch",
    save_strategy="no",
    logging_steps=1,
    per_gpu_train_batch_size=4
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=ds["train"],
    eval_dataset=ds["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Using the `WAND_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
Using deprecated `--per_gpu_train_batch_size` argument which will be removed in a future version. Using `--per_device_train_batch_size` is preferred.
Using deprecated `--per_gpu_train_batch_size` argument which will be removed in a future version. Using `--per_device_train_batch_size` is preferred.
***** Running training *****
  Num examples = 499
  Num Epochs = 7
  Instantaneous batch size per device = 30
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation s

Epoch,Training Loss,Validation Loss,Accuracy
1,0.3362,0.372853,0.9
2,0.2785,0.535496,0.8625
3,0.2437,0.522879,0.875
4,0.234,0.538651,0.8875
5,0.2247,0.526445,0.8875
6,0.2114,0.534487,0.8875
7,0.2155,0.538656,0.8875


***** Running Evaluation *****
  Num examples = 80
  Batch size = 30
***** Running Evaluation *****
  Num examples = 80
  Batch size = 30
***** Running Evaluation *****
  Num examples = 80
  Batch size = 30
***** Running Evaluation *****
  Num examples = 80
  Batch size = 30
***** Running Evaluation *****
  Num examples = 80
  Batch size = 30
***** Running Evaluation *****
  Num examples = 80
  Batch size = 30
***** Running Evaluation *****
  Num examples = 80
  Batch size = 30


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=875, training_loss=0.24912222072056361, metrics={'train_runtime': 262.5077, 'train_samples_per_second': 13.306, 'train_steps_per_second': 3.333, 'total_flos': 919926053275740.0, 'train_loss': 0.24912222072056361, 'epoch': 7.0})

### Overfitting ?

In [23]:
trainer.evaluate(ds['test'])

***** Running Evaluation *****
  Num examples = 80
  Batch size = 30


{'eval_loss': 0.35356172919273376,
 'eval_accuracy': 0.8875,
 'eval_runtime': 2.0619,
 'eval_samples_per_second': 38.799,
 'eval_steps_per_second': 1.455,
 'epoch': 7.0}

In [24]:
altro = load_dataset("json", data_files=["/data1/malto/shroom/val.model-aware.json"])
#altro = altro['train'].train_test_split(train_size=0.8)
altro = altro.map(preprocess_function, batched=True)
altro = altro.remove_columns(['hyp', 'ref', 'task', 'p(Hallucination)', 'labels', 'tgt', 'model', 'src'])
trainer.evaluate(altro['train'])

***** Running Evaluation *****
  Num examples = 501
  Batch size = 30


{'eval_loss': 1.2731963396072388,
 'eval_accuracy': 0.590818363273453,
 'eval_runtime': 7.7557,
 'eval_samples_per_second': 64.598,
 'eval_steps_per_second': 2.192,
 'epoch': 7.0}