In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; the project code and data used by the
# following cells are addressed through paths under /content/drive/MyDrive.
# force_remount=True is passed so that re-running this cell remounts the drive
# instead of relying on an existing mount.
drive.mount('/content/drive', force_remount=True)

In [None]:
import sys

# Root of the project checkout on the mounted Drive. Declared once so that the
# two import locations below cannot drift apart.
PROJECT_ROOT = '/content/drive/MyDrive/nlp_ss24/multilingual-lexical-simplification'

# Make the project root and its src/ directory importable. The membership check
# keeps the cell idempotent: re-running it does not pile up duplicate entries
# in sys.path.
for _import_dir in (PROJECT_ROOT, f'{PROJECT_ROOT}/src'):
    if _import_dir not in sys.path:
        sys.path.append(_import_dir)

In [None]:
from transformers import AutoModelForMaskedLM, AutoTokenizer

# Hugging Face checkpoint identifier. It is declared once so that the tokenizer
# and the masked-LM model are always loaded from the same checkpoint.
MODEL_NAME = "dbmdz/bert-base-german-cased"

# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForMaskedLM.from_pretrained(MODEL_NAME)

# Specify the pattern to use for this lexical simplifier. The two
# {placeholders} are left unformatted here; the string is handed to the
# simplifier as-is.
# NOTE(review): "der vorigen Satzes" is ungrammatical German (the genitive would
# be "des vorigen Satzes"). The literal is kept unchanged because altering the
# prompt would change the model's predictions — confirm whether it is intended.
bert_pattern = '{original_sentence} Die einfachere Version der vorigen Satzes ist: {sentence_with_complex_word_masked}'

In [None]:
from german_bert_lexical_simplifier import GermanBertLexicalSimplifier
# Build the lexical simplifier from the masked-LM model, its tokenizer and the
# prompt pattern.
# NOTE(review): the meaning of the fourth positional argument (None) is not
# visible in this notebook — confirm against the class constructor.
german_bert_ls = GermanBertLexicalSimplifier(model, tokenizer, bert_pattern, None)

In [None]:
from utils.process_BenchLS import process_BenchLS
# Load the evaluation samples from the BenchLS file stored on the mounted Drive.
# The result is indexed per sample further down (eval_data[0]); the exact
# structure of each sample is determined by process_BenchLS — TODO confirm.
eval_data = process_BenchLS('/content/drive/MyDrive/nlp_ss24/multilingual-lexical-simplification/data/BenchLS.txt')

Test evaluation on the first data sample

In [None]:
# Per-sample evaluation metrics, inspired by the BenchLS paper. None of them
# takes the rank of a prediction into account.
#   Potential: whether at least one generated candidate is in the gold standard.
#   Precision: proportion of generated substitutions that are in the gold standard.
#   Recall:    proportion of gold-standard substitutions that are among the
#              generated substitutions.
#   F1:        harmonic mean of precision and recall.


def compute_sample_metrics(predicted_tokens, gold_standard_substitutions):
    """Score the predictions for one sample against its gold standard.

    Entries are compared by equality and counted exactly as given, i.e.
    duplicates in either sequence are not collapsed.

    Args:
        predicted_tokens: Sequence of generated substitution candidates.
        gold_standard_substitutions: Sequence of accepted substitutions.

    Returns:
        A ``(potential, precision, recall, f1)`` tuple. ``potential`` is a
        bool; the other three are numbers in [0, 1]. Precision is 0 when there
        are no predictions and recall is 0 when the gold standard is empty, so
        an empty input never raises ``ZeroDivisionError``. F1 is 0 when
        precision and recall are both 0.
    """
    # Predictions that also appear in the gold standard.
    correct_predictions = sum(
        1 for prediction in predicted_tokens
        if prediction in gold_standard_substitutions
    )
    potential = correct_predictions > 0
    precision = (
        correct_predictions / len(predicted_tokens)
        if len(predicted_tokens) > 0 else 0
    )

    # Gold-standard entries that were recovered by the predictions.
    recovered_gold = sum(
        1 for token in gold_standard_substitutions
        if token in predicted_tokens
    )
    recall = (
        recovered_gold / len(gold_standard_substitutions)
        if gold_standard_substitutions else 0
    )

    f1 = 0
    if precision + recall != 0:
        f1 = 2 * (precision * recall) / (precision + recall)

    return potential, precision, recall, f1


sample = eval_data[0]
print(f"This is the sample: {sample}")

# NOTE(review): sample[1] and sample[0] are presumably the sentence and the
# complex word — confirm the order against process_BenchLS and
# generate_substitutions_for.
predicted_tokens = german_bert_ls.generate_substitutions_for(sample[1], sample[0])
print(f"These are the predicted tokens: {predicted_tokens}")

# Flatten the dict of gold standard substitutions (sample[3] maps keys to
# iterables of words; only the words are needed here).
gold_standard_substitutions = [word for sublist in sample[3].values() for word in sublist]

sample_potential, sample_precision, sample_recall, sample_f1 = compute_sample_metrics(
    predicted_tokens, gold_standard_substitutions
)

print("Potential: ", sample_potential)
print("Precision: ", sample_precision)
print("Recall: ", sample_recall)
print("F1: ", sample_f1)