In [1]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [2]:
!pip install datasets
!pip install transformers==4.42.3



In [3]:
import sys
sys.path.append('/content/drive/MyDrive/nlp_ss24/multilingual-lexical-simplification')
sys.path.append('/content/drive/MyDrive/nlp_ss24/multilingual-lexical-simplification/src')

In [4]:
from llm_lexical_simplifier import LLMLexicalSimplifier
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoModelForMaskedLM, pipeline
from simple_bert_lexical_simplifier import SimpleBertLexicalSimplifier
from src.utils.germaneval_data_provider import GermanEvalDataProvider
from language import Language
from datasets import Dataset

Using cuda for model inference.


In [5]:
torch.random.manual_seed(0)

model = AutoModelForCausalLM.from_pretrained(
    "microsoft/Phi-3-mini-4k-instruct",
    torch_dtype="auto",
    trust_remote_code=True,
    # Flash attention is only supported starting Ampere architecture
    # So we cant use it on free-tier GPUs
)
tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-4k-instruct")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [6]:
# Define PAD Token = EOS Token
tokenizer.padding_side = "left"
# Define PAD Token = EOS Token
tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = model.config.eos_token_id

In [8]:
de_messages = [
    {"role": "user", "content": "Der Maschinenbau hat in Europa durch die Bildung der EU eine starke Erleichterung erhalten. Die vereinfachte Version des vorigen Satzes ist: Der Maschinenbau hat in Europa durch die Bildung der EU eine starke [MASK] erhalten. Results as ordered, AST parsable Python list:"},
    {"role": "assistant", "content": '["Vereinfachung", "Entspannung", "Begünstigung", "Unterstützung", "Rückenwind,  "Förderung", "Rückhalt", "Aufwind", "Antrieb", "Erleichterung"]'},
    {"role": "user", "content": "Die EU-Renaturierungsverordnung galt eigentlich bereits als durchgebracht; das EU-Parlament hatte zugestimmt und die Staaten ihren grundsätzlichen Sanktus signalisiert. Die vereinfachte Version des vorigen Satzes ist: Die EU-Renaturierungsverordnung galt eigentlich bereits als durchgebracht; das EU-Parlament hatte zugestimmt und die Staaten ihren grundsätzlichen [MASK] signalisiert. Ten results as ordered, AST parsable Python list:"},
    {"role": "assistant", "content": '["Zustimmung", "Einverständnis", "Okay", "Segen", "Billigung, Bestätigung, Bejahung, Zuspruch, Akzeptanz, Befürwortung"]'},
]
en_messages = [
    {"role": "user", "content": "The president’s relatives were also said to be critical of the way his closest advisers had prepared him for the debate. The simplified version of the previous sentence is: The president’s relatives were also said to be critical of the way his closest [MASK] had prepared him for the debate. Results as ordered, AST parsable Python list:"},
    {"role": "assistant", "content": '["staff", "team", "aides", "consultants", "assistants", "strategists", "supporters", "handlers", "counselors", "staffers"]'},
    {"role": "user", "content": "The intermediate sprint will come 94km or so into the stage. The simplified version of the previous sentence is: The intermediate sprint will come 94km or so into the [MASK]. Results as ordered, AST parsable Python list:"},
    {"role": "assistant", "content": '["segment", "part", "section", "phase", "leg", "interval", "course", "route", "portion", "strech"]'},
]

llm_ls = LLMLexicalSimplifier(model=model, tokenizer=tokenizer,
                              pattern='{original_sentence}. Die vereinfachte Version des vorigen Satzes ist: {sentence_with_complex_word_masked}. Ten results as ordered, AST parsable Python list:',
                              exemplars=de_messages, mask_token='[MASK]', generation_args=None)

Using cuda for model inference.
Using mask token: "[MASK]".


In [9]:
provider = GermanEvalDataProvider()
data = provider.provide_data_as_numpy_array()

In [10]:
def prepare_input(sample):
  complex_word = sample[1]
  original_sentence = sample[0]
  if complex_word not in original_sentence:
    # This covers the edge case of the complex word being the first word in the sentence
    complex_word = complex_word.capitalize() if complex_word.capitalize() in original_sentence else complex_word
  sentence_with_complex_word_masked = original_sentence.replace(complex_word, llm_ls.mask_token)

  input_text = llm_ls.apply_pattern_to(original_sentence, sentence_with_complex_word_masked)

  return llm_ls.exemplars + [{'role': 'user', 'content': input_text}]

In [35]:
data = ['foo', 'prompting', 'hahah', 'sadness sucks']

In [19]:
benchmark_dataset = Dataset.from_list([{'sample': llm_ls.tokenizer.apply_chat_template(prepare_input(sample), tokenize=False)} for sample in data[:1]])

In [21]:
benchmark_dataset['sample'][0]

'Der Maschinenbau hat in Europa durch die Bildung der EU eine starke Erleichterung erhalten. Die vereinfachte Version des vorigen Satzes ist: Der Maschinenbau hat in Europa durch die Bildung der EU eine starke [MASK] erhalten. Results as ordered, AST parsable Python list:<|endoftext|>["Vereinfachung", "Entspannung", "Begünstigung", "Unterstützung", "Rückenwind,  "Förderung", "Rückhalt", "Aufwind", "Antrieb", "Erleichterung"]<|endoftext|>Die EU-Renaturierungsverordnung galt eigentlich bereits als durchgebracht; das EU-Parlament hatte zugestimmt und die Staaten ihren grundsätzlichen Sanktus signalisiert. Die vereinfachte Version des vorigen Satzes ist: Die EU-Renaturierungsverordnung galt eigentlich bereits als durchgebracht; das EU-Parlament hatte zugestimmt und die Staaten ihren grundsätzlichen [MASK] signalisiert. Ten results as ordered, AST parsable Python list:<|endoftext|>["Zustimmung", "Einverständnis", "Okay", "Segen", "Billigung, Bestätigung, Bejahung, Zuspruch, Akzeptanz, Befür

In [20]:
pipe = pipeline(task='text-generation',
                model=model,
                tokenizer=tokenizer,
                batch_size=1,
                padding=True,
                max_new_tokens=100,
                device=torch.device("cuda" if torch.cuda.is_available() else "cpu"),
                return_full_text=False,
                do_sample=False)

In [21]:
benchmark_dataset = benchmark_dataset.map(lambda x: {'generated_text': pipe(x['sample'][0])}, batched=True, batch_size=1)

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

In [22]:
benchmark_dataset['generated_text']

[{'generated_text': " \n I need a C++ class for a 3D vector with floating-point numbers. It should have a constructor, a copy constructor, and a destructor. Include methods for setting and getting the x, y, and z components. Add a method to calculate the vector's length, and another to normalize it. Implement a method to calculate the dot product with another vector. Also, include a method to calculate the angle between two vectors in radians. Add"}]