In [1]:
from google.colab import drive

# Directory inside the Colab VM under which Google Drive is made available.
DRIVE_MOUNT_POINT = '/content/drive'

# force_remount=True is passed so that re-running this cell requests a fresh mount.
drive.mount(DRIVE_MOUNT_POINT, force_remount=True)

Mounted at /content/drive


In [3]:
import sys

# Directories on the mounted Drive that hold the project checkout and its src/ folder;
# they are added to the import search path so that project modules can be imported below.
_PROJECT_IMPORT_DIRS = (
    '/content/drive/MyDrive/nlp_ss24/multilingual-lexical-simplification',
    '/content/drive/MyDrive/nlp_ss24/multilingual-lexical-simplification/src',
)
sys.path.extend(_PROJECT_IMPORT_DIRS)

In [9]:
from lexical_simplifier import LexicalSimplifier


class GermanBertLexicalSimplifier(LexicalSimplifier):
    """
    Lexical simplifier meant to be backed by a German BERT masked language model.

    The intended approach is to replace the complex word with [MASK], add the other BERT specific tokens and
    derive a list of candidate substitutions from the model predictions for the resulting prompt.
    Substitution generation is not implemented in this class yet.
    """

    def __init__(self, model, pattern, exemplars):
        """Forward model, pattern and exemplars unchanged to the LexicalSimplifier initializer."""
        super().__init__(model, pattern, exemplars)

    def generate_substitutions_for(self, complex_word):
        """
        Placeholder for generating substitutions for the given complex word.

        Raises:
            NotImplementedError: always, since no generation logic exists here yet.
        """
        not_implemented_message = "Please implement this method in the subclass."
        raise NotImplementedError(not_implemented_message)

In [13]:
from transformers import AutoModelForMaskedLM, AutoTokenizer
import torch

# Single checkpoint identifier used for both the tokenizer and the masked-LM model,
# so the two are always loaded from the same pretrained weights/vocabulary.
GERMAN_BERT_CHECKPOINT = "dbmdz/bert-base-german-cased"

tokenizer = AutoTokenizer.from_pretrained(GERMAN_BERT_CHECKPOINT)
model = AutoModelForMaskedLM.from_pretrained(GERMAN_BERT_CHECKPOINT)

Some weights of the model checkpoint at dbmdz/bert-base-german-cased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [19]:
# Build the prompt: the original sentence, a connecting phrase, then the same sentence
# with the complex word replaced by a [MASK] token.
# NOTE(review): "der vorigen Satzes" looks like a typo for "des vorigen Satzes"; it is left
# unchanged here because editing the prompt changes the model input - confirm before fixing.
bert_pattern = '{original_sentence} Die einfachere Version der vorigen Satzes ist: {sentence_with_complex_word_masked}'
text = 'Der Fluss wurde begradigt um mehr Baufläche zu schaffen.'
text_masked = 'Der Fluss wurde [MASK] um mehr Baufläche zu schaffen.'

# Both placeholders are filled by keyword; a bare positional argument after a keyword
# argument is a SyntaxError and would also leave the second placeholder unfilled.
input_text = bert_pattern.format(
    original_sentence=text,
    sentence_with_complex_word_masked=text_masked,
)

# Tokenize the full prompt (not just the original sentence), otherwise the input
# contains no [MASK] token and there is nothing to predict.
inputs = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True)

# Forward pass through the model; no gradients are needed for inference
with torch.no_grad():
    outputs = model(**inputs)

# Locate the (first) masked position in the single input sequence
input_ids = inputs["input_ids"][0].tolist()
if tokenizer.mask_token_id not in input_ids:
    # Can happen if the prompt lacks a mask token or truncation cut it off
    raise ValueError("No mask token found in the tokenized input; check the prompt and its length.")
masked_index = input_ids.index(tokenizer.mask_token_id)

# Probability distribution over the vocabulary at the masked position
probs = torch.nn.functional.softmax(outputs.logits[0, masked_index], dim=-1)

# Get the top predictions
top_k = 5
top_k_tokens = torch.topk(probs, k=top_k).indices.tolist()

# Convert token IDs back to tokens
predicted_tokens = [tokenizer.decode(token).strip() for token in top_k_tokens]

print("Predicted tokens:", predicted_tokens)

Predicted tokens: ['gebaut', 'ausgebaut', 'angelegt', 'verlegt', 'vergrößert']
