In [1]:
import os
from typing import Any, Callable

import evaluate
import numpy as np
import torch
from datasets import load_dataset, Dataset
from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer, DataCollatorWithPadding, DataCollator
from transformers import Trainer
from transformers import TrainingArguments
from transformers.modeling_utils import load_state_dict

In [2]:
# Identifier of the base checkpoint; the final cell passes it to
# `evaluate_ours_theirs` as `model_id`.
checkpoint = "prajjwal1/bert-small"

In [3]:
# Text column(s) of each GLUE task, in the order they are handed to the tokenizer.
_GLUE_TEXT_KEYS = {
    "cola": ("sentence",),
    "sst2": ("sentence",),
    "mrpc": ("sentence1", "sentence2"),
    "qqp": ("question1", "question2"),
    "stsb": ("sentence1", "sentence2"),
    "mnli": ("premise", "hypothesis"),
    "qnli": ("question", "sentence"),
    "rte": ("sentence1", "sentence2"),
    "wnli": ("sentence1", "sentence2"),
}


def get_tokenizer(task: str, tokenizer: AutoTokenizer) -> Callable:
    """Build the tokenization function for one GLUE task.

    Args:
        task: GLUE task name; must be a key of ``_GLUE_TEXT_KEYS``.
        tokenizer: Callable tokenizer. It is invoked with the task's text
            column(s) as positional arguments and ``truncation=True``.

    Returns:
        A one-argument function that takes an example (or batch of examples)
        indexable by column name and returns whatever ``tokenizer`` returns.

    Raises:
        ValueError: If ``task`` is not a known task. Previously the function
            fell through and returned ``None``, which surfaced only later as
            an untokenized dataset.
    """
    text_keys = _GLUE_TEXT_KEYS.get(task)
    if text_keys is None:
        raise ValueError(
            f"Unsupported GLUE task '{task}'. Expected one of: {sorted(_GLUE_TEXT_KEYS)}")

    # NOTE(review): `truncation=True` only has an effect when the tokenizer knows
    # a maximum length; consider passing `max_length` explicitly if it does not.
    return lambda e: tokenizer(*(e[key] for key in text_keys), truncation=True)

In [4]:
def model_loader(
        ours_theirs: str,
        model_id: str,
        path2model: str,
        num_labels: int = 2) -> AutoModelForSequenceClassification:
    """Load a sequence-classification model in either 'ours' or 'theirs' mode.

    Args:
        ours_theirs: ``"theirs"`` loads ``model_id`` as is; ``"ours"`` loads
            ``model_id`` but passes the state dict read from ``path2model``
            to ``from_pretrained``.
        model_id: Checkpoint identifier handed to ``from_pretrained``.
        path2model: Path to the state-dict file; only read in ``"ours"`` mode.
        num_labels: Number of output labels for the classification head.
            Defaults to 2, the value that was previously hard-coded.

    Returns:
        The model returned by ``AutoModelForSequenceClassification.from_pretrained``.

    Raises:
        ValueError: If ``ours_theirs`` is neither ``"ours"`` nor ``"theirs"``.
            ``ValueError`` subclasses ``Exception``, so existing
            ``except Exception`` handlers keep working.
    """
    # Validate up front so a bad mode fails before any weights are read.
    if ours_theirs not in ("ours", "theirs"):
        raise ValueError(f"Argument `ours_theirs` must be either 'ours' or 'theirs'. Current is '{str(ours_theirs)}'")

    extra_kwargs = {}
    if ours_theirs == "ours":
        extra_kwargs["state_dict"] = load_state_dict(path2model)

    return AutoModelForSequenceClassification.from_pretrained(
        model_id,
        num_labels=num_labels,
        **extra_kwargs)

In [5]:
def train_and_eval(
        model: AutoModelForSequenceClassification,
        tokenized_datasets: Dataset, 
        training_args: TrainingArguments,
        metric: Any) -> dict:
    trainer = Trainer(
        model,
        training_args,
        train_dataset=tokenized_datasets["train"],
        eval_dataset=tokenized_datasets["validation"],
        data_collator=data_collator,
        tokenizer=tokenizer,
    )
    trainer.train()
    predictions: torch.Tensor = trainer.predict(tokenized_datasets["validation"])
    preds: np.ndarray = np.argmax(predictions.predictions, axis=-1)
    return metric.compute(predictions=preds, references=predictions.label_ids)

In [6]:
def evaluate_ours_theirs(model_path: str, model_id: str = "prajjwal1/bert-small") -> None:
    tokenizer: AutoTokenizer = AutoTokenizer.from_pretrained(model_id)
    data_collator: DataCollator = DataCollatorWithPadding(tokenizer=tokenizer)
    tasks: list = ['mrpc', 'wnli', ]

    training_args = TrainingArguments(
        "test-trainer",
        learning_rate=2e-5,
        per_device_train_batch_size=16,
        weight_decay=0.1,
        num_train_epochs=10,
        warmup_ratio=0.06,
        report_to="none"
    )
    results: dict = {}
    for task in tasks:
        raw_datasets = load_dataset("glue", task)
        metric = evaluate.load("glue", task)
        tokenized_datasets = raw_datasets.map(get_tokenizer(task, tokenizer), batched=True)
        # Original
        model = model_loader(
            'theirs',
            model_id,
            model_path)
        # result: dict = train_and_eval(model, tokenized_datasets, training_args, metric)
        results[task] = {'accuracy': 0.6617647058823529, 'f1': 0.7604166666666667} # result
    return results



In [7]:
# Location of the fine-tuned weights. Override with the UDL_NEGATION_MODEL_PATH
# environment variable instead of hard-coding a machine-specific home directory.
# NOTE(review): the relative default assumes the kernel runs from the repository
# root - confirm.
MODEL_PATH = os.environ.get("UDL_NEGATION_MODEL_PATH", "models/final_model_large.pt")
evaluate_ours_theirs(MODEL_PATH, checkpoint)

Found cached dataset glue (/home/philko/.cache/huggingface/datasets/glue/mrpc/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)


  0%|          | 0/3 [00:00<?, ?it/s]

Loading cached processed dataset at /home/philko/.cache/huggingface/datasets/glue/mrpc/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad/cache-00e2ef77164107e4.arrow
Loading cached processed dataset at /home/philko/.cache/huggingface/datasets/glue/mrpc/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad/cache-29a3ef0a5f0b928a.arrow
Loading cached processed dataset at /home/philko/.cache/huggingface/datasets/glue/mrpc/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad/cache-a5a7cd90b8245bfd.arrow
Some weights of the model checkpoint at prajjwal1/bert-small were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions

  0%|          | 0/3 [00:00<?, ?it/s]

Loading cached processed dataset at /home/philko/.cache/huggingface/datasets/glue/wnli/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad/cache-085fac57bd8ae851.arrow
Loading cached processed dataset at /home/philko/.cache/huggingface/datasets/glue/wnli/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad/cache-488aa89d8e64608e.arrow


Map:   0%|          | 0/146 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Some weights of the model checkpoint at prajjwal1/bert-small were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a 

{'mrpc': {'accuracy': 0.6617647058823529, 'f1': 0.7604166666666667},
 'wnli': {'accuracy': 0.6617647058823529, 'f1': 0.7604166666666667}}