In [4]:
%%capture
!pip install transformers
!pip install torch
!pip install json

In [23]:
from transformers import AutoTokenizer, AutoModelForMaskedLM, pipeline
import torch
import json
import os

In [41]:
def load_model_and_tokenizer(model_name):
    """
    Lade Modell.

    Args:
    model_name (str): Der Name des Modells, das geladen werden soll.

    Returns:
    tokenizer: Der geladene Tokenizer.
    model: Das geladene Modell.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForMaskedLM.from_pretrained(model_name)
    return tokenizer, model

def load_data(file_path):
    """
    Laden von JSON Lines Daten aus der gegebenen Datei.

    Args:
    file_path (str): Der Pfad zur Datei, die geladen werden soll.

    Returns:
    data: Die geladenen Daten.
    """
    data = []
    with open(file_path, "r") as f:
        for line in f:
            data.append(json.loads(line))
    return data


def predict_masked_token(model, tokenizer, sentence, options):
    """
    Berechne wahrscheinlichstes Token, dass die Mask passt

    Args:
    model: Das model, dass zur Berechnung verwendet wird
    tokenizer: Der Tokenizer der zum Berechnen wird
    sentence (str): der satz mit der Maske
    options (list): die Liste mit Einsetzungsoptionen

    Returns:
    predicted_option (str): die berechnete Option
    """
    # Encode the sentence and options
    inputs = tokenizer.encode_plus(sentence, return_tensors='pt', padding='max_length', max_length=20)
    option_ids = tokenizer.convert_tokens_to_ids(options)

    # Get the id of the masked token
    mask_id = tokenizer.convert_tokens_to_ids(tokenizer.mask_token)

    # Feed the input to the model
    outputs = model(**inputs)
    logits = outputs.logits
    mask_index = inputs['input_ids'][0].tolist().index(mask_id)
    mask_logits = logits[0, mask_index, :]

    # Find the most probable option
    probs = torch.nn.functional.softmax(mask_logits, dim=0)
    option_probs = probs[option_ids]
    max_prob_index = torch.argmax(option_probs).item()
    predicted_option = options[max_prob_index]

    return predicted_option

def evaluate_model(data, tokenizer, model):
    """
    Evaluieren Sie das Modell auf den gegebenen Daten.

    Args:
    data (list): Die Daten, auf denen das Modell evaluiert werden soll.
    model_name (str): Der Name des Modells, das evaluiert werden soll.

    Returns:
    results (list): Eine Liste von Worterbüchern, die die Originalsätze, die vorhergesagten Token und die korrekten Antworten enthalten.
    """

    results = []
    for item in data:
        # Extract the question stem and choices from the item
        stem = item['question']['stem'].replace('[MASK]', tokenizer.mask_token)
        choices = [choice['text'] for choice in item['question']['choices']]

        # Predict the masked token
        predicted_token = predict_masked_token(model, tokenizer, stem, choices)

        # Find the correct answer based on the 'answerKey'
        correct_answer = [choice['text'] for choice in item['question']['choices'] if choice['label'] == item['answerKey']][0]

        # Add the results to the results list
        results.append({
            'id': item['id'],
            'original_sentence': item['question']['stem'],
            'predicted_token': predicted_token,
            'correct_answer': correct_answer
        })
    return results



def interpret_results(results):
    """
    Interpretieren von Ergebnissen der Modellbewertung.

    Args:
    results (list): Die Ergebnisse der Modellbewertung.

    Returns:
    None
    """
    for result in results:
        print(f"Original sentence: {result['original_sentence']}")
        print(f"Predicted token: {result['predicted_token']}")
        print(f"Correct answer: {result['correct_answer']}")
        print("Correct!" if result['predicted_token'] == result['correct_answer'] else "Incorrect!")
        print("------------------------")

def scores_results(results):
    """
    Gibt Accuracy aus.

    Args:
    results (list): Eine Liste von Wörterbüchern, die die Originalsätze, die vorhergesagten Token und die korrekten Antworten enthalten.

    Returns:
    None
    """
    correct = 0
    total = len(results)

    for result in results:
        if result['predicted_token'] == result['correct_answer']:
            correct += 1

    print(f"Accuracy: {correct / total * 100:.2f}%")


In [7]:
# Laden Modelle
tokenizer_roberta, model_roberta = load_model_and_tokenizer("roberta-base")
tokenizer_bert, model_bert = load_model_and_tokenizer("bert-base-cased")

Downloading (…)lve/main/config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [9]:
# Laden der Daten
always_never_data = load_data("data/challenge_coffee_cats_quantifiers_coffee_cats_quantifiers_dev.jsonl")
age_comparison_data = load_data("data/challenge_number_comparison_number_comparison_age_compare_masked_train.jsonl")

In [46]:
# Evaluieren und Interpretieren von Ergebnissen für das Roberta-Modell
print("Testing roberta-base on always_never data")
results_1 = evaluate_model(always_never_data,tokenizer_roberta,model_roberta)
interpret_results(results_1)

print("Testing roberta-base on age_comparison data")
results_2 = evaluate_model(age_comparison_data, tokenizer_roberta,model_roberta)
interpret_results(results_2)


# Evaluieren und Interpretieren von Ergebnisse für das BERT-Modell
print("Testing bert-base-cased on always_never data")
results_3 = evaluate_model(always_never_data, tokenizer_bert, model_bert)
interpret_results(results_3)

print("Testing bert-base-cased on age_comparison data")
results_4 = evaluate_model(age_comparison_data, tokenizer_bert, model_bert)
interpret_results(results_4)

Testing roberta-base on always_never data
Original sentence: A robin [MASK] has a horn .
Predicted token: always
Correct answer: never
Incorrect!
------------------------
Original sentence: A canine [MASK] has a jaw .
Predicted token: always
Correct answer: always
Correct!
------------------------
Original sentence: A ape [MASK] has a neck .
Predicted token: always
Correct answer: always
Correct!
------------------------
Original sentence: A dish with cheese [MASK] contains honey .
Predicted token: often
Correct answer: rarely
Incorrect!
------------------------
Original sentence: A badger [MASK] has a snout .
Predicted token: always
Correct answer: always
Correct!
------------------------
Original sentence: hammock is [MASK] placed in the corner .
Predicted token: always
Correct answer: sometimes
Incorrect!
------------------------
Original sentence: A ostrich [MASK] has a horn .
Predicted token: always
Correct answer: never
Incorrect!
------------------------
Original sentence: A ins

In [47]:
print(f"for always_never data using roberta-base {scores_results(results_1)}\n")

print(f"for age_comparison data using roberta-base {scores_results(results_2)}\n")

print(f"for always_never data using bert-base-cased {scores_results(results_3)}\n")

print(f"for age_comparison data using bert-base-cased {scores_results(results_4)}\n")


Accuracy: 23.21%
for always_never data using roberta-base None

Accuracy: 50.00%
for age_comparison data using roberta-base None

Accuracy: 22.14%
for always_never data using bert-base-cased None

Accuracy: 50.17%
for age_comparison data using bert-base-cased None



Like already stated in the oLMpics Test Paper, in the MC-MLM Test Setting, bert-base and robert-base Zero-Shot Encodings compute about the same Accuracy in the Test