In [1]:
import os
os.environ["USE_TF"] = "0"

from datasets import load_dataset, DatasetDict

# Load the SNLI dataset; the result is indexed by split name further down.
raw = load_dataset(path="snli")
def to_binary(example):
    """Collapse an example's label into a binary target.

    Args:
        example: mapping with at least a ``"label"`` entry.

    Returns:
        ``None`` when the label is ``-1``; otherwise a one-key dict whose
        ``"label"`` is 1 if the original label was 0 and 0 for any other value.
    """
    label = example["label"]
    if label == -1:
        return None
    return {"label": int(label == 0)}
# Drop the rows that `to_binary` rejects (label == -1) *before* mapping.
# `filter` is handed the example dicts themselves, never the mapper's return
# value, so a post-hoc `x is not None` test is always true and removes nothing;
# returning None from a `map` function does not delete the row either.
# Filtering first also guarantees `to_binary` always returns a dict here.
labeled = raw.filter(lambda example: example["label"] != -1)
binary = labeled.map(to_binary)

# wrap into a DatasetDict holding exactly the three splits used below
data = DatasetDict({
    split: binary[split]
    for split in ("train", "validation", "test")
})
print(data)


DatasetDict({
    train: Dataset({
        features: ['premise', 'hypothesis', 'label'],
        num_rows: 549367
    })
    validation: Dataset({
        features: ['premise', 'hypothesis', 'label'],
        num_rows: 9842
    })
    test: Dataset({
        features: ['premise', 'hypothesis', 'label'],
        num_rows: 9824
    })
})


In [2]:
from transformers import AutoTokenizer

# Tokenizer for the prajjwal1/bert-tiny checkpoint.
tokenizer_checkpoint = "prajjwal1/bert-tiny"
tokenizer = AutoTokenizer.from_pretrained(tokenizer_checkpoint)

def tokenize_fn(ex):
    """Tokenize premise/hypothesis pairs with the module-level ``tokenizer``.

    Args:
        ex: a batch of examples (this function is applied with
            ``batched=True``) exposing ``"premise"`` and ``"hypothesis"``.

    Returns:
        Whatever ``tokenizer`` returns for the paired inputs, truncated to at
        most 128 tokens. No padding is requested here.
    """
    premises, hypotheses = ex["premise"], ex["hypothesis"]
    return tokenizer(premises, hypotheses, truncation=True, max_length=128)

# Tokenize every split in batches and drop the raw text columns in the same
# pass, so only the token fields and the label remain.
tokenized = data.map(
    tokenize_fn,
    batched=True,
    remove_columns=["premise", "hypothesis"],
)
# Have the datasets return torch tensors when indexed.
tokenized.set_format(type="torch")

Map:   0%|          | 0/549367 [00:00<?, ? examples/s]

Map:   0%|          | 0/9842 [00:00<?, ? examples/s]

Map:   0%|          | 0/9824 [00:00<?, ? examples/s]

In [3]:
from transformers import AutoModelForSequenceClassification, DataCollatorWithPadding, TrainingArguments, Trainer
import evaluate
import numpy as np

from transformers import set_seed

# Seed Python, NumPy and PyTorch *before* building the model. The classifier
# head is not part of the checkpoint and is newly (randomly) initialised, so
# without a seed at this point every fresh kernel starts from different
# weights and the reported metrics are not reproducible.
SEED = 42
set_seed(SEED)

# model: bert-tiny encoder with a fresh 2-way classification head
model = AutoModelForSequenceClassification.from_pretrained(
    "prajjwal1/bert-tiny",
    num_labels=2
)
# Padding is applied per batch by the collator (tokenize_fn does not pad).
data_collator = DataCollatorWithPadding(tokenizer)

# Metric objects used by compute_metrics.
accuracy = evaluate.load("accuracy")
f1       = evaluate.load("f1")

def compute_metrics(eval_pred):
    """Score a set of predictions with the module-level accuracy and F1 metrics.

    Args:
        eval_pred: an object that unpacks into ``(logits, labels)``.

    Returns:
        Dict with keys ``"accuracy"`` and ``"f1"``, each computed from the
        argmax of ``logits`` over the last axis against ``labels``.
    """
    logits, labels = eval_pred
    predicted = np.argmax(logits, axis=-1)
    metrics = {"accuracy": accuracy, "f1": f1}
    return {
        name: metric.compute(predictions=predicted, references=labels)[name]
        for name, metric in metrics.items()
    }


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-tiny and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
# Training configuration.
# `eval_strategy` is the current name of the deprecated `evaluation_strategy`
# argument (requires transformers >= 4.41).
training_args = TrainingArguments(
    output_dir="./snli-paraphrase-finetuned",
    eval_strategy="epoch",          # evaluate on the validation split every epoch
    save_strategy="epoch",          # must match eval_strategy for load_best_model_at_end
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    num_train_epochs=3,
    weight_decay=0.01,
    load_best_model_at_end=True,    # restore the best checkpoint after training
    metric_for_best_model="f1"      # "best" = highest validation F1 from compute_metrics
)

# Wire model, data, collator and metrics into a Trainer.
# The tokenizer is passed as `processing_class`: the `tokenizer=` keyword is
# deprecated on Trainer and emits a FutureWarning (requires transformers >= 4.46).
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["validation"],
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)


  trainer = Trainer(


In [5]:
# Fine-tune, then score the held-out test split with the same metrics.
trainer.train()
test_split = tokenized["test"]
results = trainer.evaluate(eval_dataset=test_split)
print("Results", results)

  attn_output = torch.nn.functional.scaled_dot_product_attention(


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.3369,0.269354,0.895143,0.839302
2,0.3081,0.252958,0.900935,0.851981
3,0.2887,0.249968,0.901748,0.852073


Results {'eval_loss': 0.26040199398994446, 'eval_accuracy': 0.8984120521172638, 'eval_f1': 0.8494266747133373, 'eval_runtime': 3.8366, 'eval_samples_per_second': 2560.572, 'eval_steps_per_second': 80.018, 'epoch': 3.0}
