In [1]:
from google.colab import drive
# Mount Google Drive at /content/drive; the project sources added to sys.path
# later in this notebook live under that mount point.
# NOTE(review): force_remount=True presumably refreshes an already-existing
# mount - confirm against the Colab documentation.
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [2]:
from tqdm import tqdm

In [3]:
import sys
# Make the project root and its src/ folder importable (order preserved).
sys.path.extend([
    '/content/drive/MyDrive/nlp_ss24/multilingual-lexical-simplification',
    '/content/drive/MyDrive/nlp_ss24/multilingual-lexical-simplification/src',
])

In [7]:
from transformers import AutoModelForMaskedLM, AutoTokenizer

# Tokenizer and masked-LM model are both loaded from the same checkpoint.
checkpoint_name = "dbmdz/bert-base-german-cased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint_name)
model = AutoModelForMaskedLM.from_pretrained(checkpoint_name)

# Specify the pattern to use for this lexical simplifier. The two placeholders
# name the original sentence and the sentence with the complex word masked.
bert_pattern = (
    '{original_sentence} The simpler version of the previous sentence is: '
    '{sentence_with_complex_word_masked}'
)

tokenizer_config.json:   0%|          | 0.00/59.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/456 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/240k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/442M [00:00<?, ?B/s]

Some weights of the model checkpoint at dbmdz/bert-base-german-cased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [8]:
from simple_bert_lexical_simplifier import SimpleBertLexicalSimplifier
# Build the simplifier from the loaded model, tokenizer and prompt pattern.
# The trailing None presumably stands for the exemplars argument: with it the
# constructor reports that it is running in zero-shot mode.
bert_ls = SimpleBertLexicalSimplifier(model, tokenizer, bert_pattern, None)

No exemplars provided, using zero-shot mode.


In [9]:
from benchmark_suite import BenchmarkSuite
from language import Language
# One prompt pattern per benchmarked language, keyed by the Language enum.
patterns_by_language = {
    Language.EN: '{original_sentence} The simpler version of the previous sentence is: {sentence_with_complex_word_masked}',
    Language.DE: '{original_sentence} Die vereinfachte Version des vorigen Satzes ist: {sentence_with_complex_word_masked}',
}
suite = BenchmarkSuite(bert_ls, patterns_by_language)
suite.run()

Benchmarking model on EN ...
Benchmarking model on BenchLSDataProvider...


Benchmarking: 100%|██████████| 929/929 [00:13<00:00, 66.92it/s]


Benchmarking model on DE ...
Benchmarking model on GermanEvalDataProvider...
['Dies hat beispielsweise eine Ruhigstellung des Magen-Darmtrakts (Hemmung der Peristaltik) und eine Erweiterung der Bronchien zur Erleichterung der Atmung als Folge'
 'Der Maschinenbau hat in Europa durch die Bildung der EU eine starke Erleichterung erhalten'
 'In der Legislaturperiode 1998–2002 wurden unter anderem die Ökosteuer (allerdings in einer gegenüber grünen Vorstellungen reduzierten Form), einige Reformen des Staatsbürgerschaftsrechts bezüglich der Erleichterung von Einwanderung, die Möglichkeit eingetragener Lebenspartnerschaften und der langfristige Ausstieg aus der Atomenergie verabschiedet'
 ...
 'Übereinstimmung über das zu verwendende Vokabular erzielen, hält Popper für grundfalsch'
 'Auch wenn die Formgebung dieser Bauwerke teilweise wenig Ähnlichkeit mit dem Pariser Turm aufwies, genügte oft die Übereinstimmung von vier Turmfüßen und der konstruktionsbedingten Notwendigkeit einer Verjüngung 

Benchmarking: 100%|██████████| 1040/1040 [00:11<00:00, 86.67it/s]


In [None]:
from utils.bench_ls_data_provider import BenchLSDataProvider
# Location of the BenchLS dataset file on the mounted drive.
bench_ls_path = '/content/drive/MyDrive/nlp_ss24/multilingual-lexical-simplification/data/BenchLS/BenchLS.txt'
bench_ls_data_provider = BenchLSDataProvider(bench_ls_path)
# Evaluation samples as returned by the provider's numpy-array accessor.
eval_data = bench_ls_data_provider.provide_data_as_numpy_array()

Test evaluation on the first data sample

In [None]:
# The metrics are inspired by the BenchLS paper; they do not take the ranks of the predictions into account.
# Potential: proportion of instances in which at least one of the generated candidates is in the gold standard.
# Precision: proportion of generated substitutions that are in the gold standard.
# Recall: proportion of gold-standard substitutions that are among the generated substitutions.
# F1: harmonic mean of precision and recall.

sample = eval_data[0]
print(f"This is the sample: {sample}")

# NOTE(review): the meaning of sample[0] and sample[1] is not visible here; the
# argument order must match generate_substitutions_for's signature - confirm.
predicted_tokens = bert_ls.generate_substitutions_for(sample[1], sample[0])
print(f"These are the predicted tokens: {predicted_tokens}")

def calculate_single_example_metric(sample, predicted_tokens):
    """Score the predictions for one sample against its gold-standard substitutions.

    Ranks are ignored: a prediction either appears in the gold standard or it
    does not.

    Args:
        sample: Indexable record whose element at index 3 is a dict whose
            values are lists of gold-standard substitutions.
        predicted_tokens: Sized iterable of predicted substitutions.

    Returns:
        Tuple ``(potential, precision, recall, f1)`` where ``potential`` is a
        bool (at least one prediction is in the gold standard) and the other
        three are numbers in [0, 1]. Precision is 0 when there are no
        predictions, recall is 0 when there are no gold substitutions, and F1
        is 0 when both precision and recall are 0.
    """
    # Flatten the dict of gold-standard substitutions into a single list.
    gold_standard_substitutions = [word for sublist in sample[3].values() for word in sublist]

    # Potential & precision: count the predictions found in the gold standard.
    matching_predictions = sum(
        1 for prediction in predicted_tokens if prediction in gold_standard_substitutions
    )
    sample_potential = matching_predictions > 0
    # len() instead of truthiness so array-like inputs work as well; an empty
    # prediction list yields precision 0 rather than a ZeroDivisionError.
    prediction_count = len(predicted_tokens)
    sample_precision = matching_predictions / prediction_count if prediction_count > 0 else 0

    # Recall: share of gold-standard substitutions that were predicted.
    true_positives = sum(1 for token in gold_standard_substitutions if token in predicted_tokens)
    sample_recall = true_positives / len(gold_standard_substitutions) if gold_standard_substitutions else 0

    # F1: harmonic mean, defined as 0 when precision and recall are both 0.
    sample_f1 = 0
    if sample_precision + sample_recall != 0:
        sample_f1 = 2 * (sample_precision * sample_recall) / (sample_precision + sample_recall)

    return sample_potential, sample_precision, sample_recall, sample_f1

sample_potential, sample_precision, sample_recall, sample_f1 = calculate_single_example_metric(
    sample, predicted_tokens
)

# Report each metric of the single sample on its own line.
for metric_label, metric_value in (
    ("Potential: ", sample_potential),
    ("Precision: ", sample_precision),
    ("Recall: ", sample_recall),
    ("F1: ", sample_f1),
):
    print(metric_label, metric_value)

In [None]:
def calculate_metrics(eval_data, simplifier=None):
    """Average the per-sample metrics over a whole evaluation set.

    Args:
        eval_data: Sized iterable of samples. Each sample is passed to
            ``calculate_single_example_metric`` and its elements at index 1
            and 0 are passed (in that order) to
            ``simplifier.generate_substitutions_for``.
        simplifier: Object providing ``generate_substitutions_for``. When
            omitted, the notebook-level ``bert_ls`` is used, so existing
            single-argument calls keep working.

    Returns:
        Tuple ``(potential, precision, recall, f1)``, each averaged over all
        samples. All four values are 0.0 when ``eval_data`` is empty.
    """
    # len() rather than truthiness: eval_data may be a numpy array, whose
    # truth value is ambiguous.
    sample_count = len(eval_data)
    if sample_count == 0:
        # Nothing to average; avoids a ZeroDivisionError below.
        return 0.0, 0.0, 0.0, 0.0

    if simplifier is None:
        simplifier = bert_ls

    potential = 0
    precision = 0
    recall = 0
    f1 = 0

    for sample in tqdm(eval_data, desc="Evaluating..."):
        predicted_tokens = simplifier.generate_substitutions_for(sample[1], sample[0])
        sample_potential, sample_precision, sample_recall, sample_f1 = calculate_single_example_metric(sample, predicted_tokens)
        if sample_potential:
            potential += 1
        precision += sample_precision
        recall += sample_recall
        f1 += sample_f1

    return (
        potential / sample_count,
        precision / sample_count,
        recall / sample_count,
        f1 / sample_count,
    )

potential, precision, recall, f1 = calculate_metrics(eval_data)

# A single print call; sep="\n" reproduces the blank lines and one metric per line.
print(
    "\n",
    f"Potential:    {potential:.3f}",
    f"Precision:    {precision:.3f}",
    f"Recall:       {recall:.3f}",
    f"F1:           {f1:.3f}",
    sep="\n",
)