In [6]:
import torch
from datasets import load_dataset, Dataset
from peft import LoraConfig, AutoPeftModelForCausalLM
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments
from trl import SFTTrainer
import os
import pandas as pd
import numpy as np

In [2]:
# --- Notebook configuration -------------------------------------------------
# Hugging Face dataset id (English/German sentence pairs).
dataset = "bentrevett/multi30k"

# Checkpoint to evaluate.
model_id = "google/gemma-2b"

# Name used for artifacts produced from this model/dataset combination.
output_model = "gemma-2b-multi30k-v1-en-ger"

In [22]:
from transformers import GenerationConfig
from time import perf_counter

def get_model_and_tokenizer(mode_id, tokenizer_id='philschmid/gemma-tokenizer-chatml'):
    """Load a 4-bit quantized causal LM together with a tokenizer.

    Args:
        mode_id: Hub id or local path of the model checkpoint, forwarded to
            ``AutoModelForCausalLM.from_pretrained``. (The parameter name is
            kept as-is so existing keyword callers keep working.)
        tokenizer_id: Hub id or local path of the tokenizer. Defaults to the
            Gemma ChatML tokenizer this notebook has always used, so existing
            calls behave exactly as before.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_id)
    # Reuse the end-of-sequence token for padding.
    tokenizer.pad_token = tokenizer.eos_token

    # bitsandbytes quantization: 4-bit NF4 weights, float16 compute,
    # double quantization enabled.
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype="float16",
        bnb_4bit_use_double_quant=True,
    )

    model = AutoModelForCausalLM.from_pretrained(
        mode_id, quantization_config=bnb_config, device_map="auto"
    )
    # NOTE(review): these two settings look like they were carried over from a
    # fine-tuning setup; confirm they are still wanted for pure inference.
    model.config.use_cache = False
    model.config.pretraining_tp = 1

    return model, tokenizer

def formatted_prompt(question) -> str:
    """Wrap ``question`` in the English-to-German translation prompt.

    The result is the instruction, then the text to translate, then a blank
    line and an ``assistant:`` marker on its own line.
    """
    instruction = "Translate from English to German: "
    reply_marker = "\n\nassistant:\n"
    return "{}{}{}".format(instruction, question, reply_marker)

def generate_response(user_input, max_new_tokens=12):
    """Generate a model completion for one English sentence.

    Uses the notebook-level ``model`` and ``tokenizer`` globals, so the cell
    that loads them must have been run first.

    Args:
        user_input: English source text; it is wrapped with
            ``formatted_prompt`` before being sent to the model.
        max_new_tokens: Upper bound on the number of generated tokens. The
            default of 12 is the value that used to be hard-coded; raise it
            when translations get cut off.

    Returns:
        The decoded first sequence returned by ``model.generate`` with
        special tokens skipped. The sample output in this notebook shows the
        prompt is still part of that text, so callers keep only what follows
        the ``assistant:`` marker.
    """
    prompt = formatted_prompt(user_input)

    # NOTE(review): penalty_alpha is a contrastive-search setting and is
    # probably ignored while do_sample=True; confirm which decoding mode is
    # intended. Values are kept unchanged here.
    generation_config = GenerationConfig(
        penalty_alpha=0.6,
        do_sample=True,
        top_k=5,
        temperature=0.5,
        repetition_penalty=1.2,
        max_new_tokens=max_new_tokens,
        pad_token_id=tokenizer.eos_token_id,
    )

    # Tokenize once and place the tensors on whatever device the model lives
    # on, instead of assuming 'cuda'.
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    outputs = model.generate(**inputs, generation_config=generation_config)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

In [4]:
# Load the quantized model and its tokenizer once; generate_response reads
# these two names as notebook-level globals.
model, tokenizer = get_model_and_tokenizer(model_id)

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [5]:
# Load only the "test" split of the configured dataset for evaluation.
data_test = load_dataset(dataset, split="test")

In [32]:
# Zero-shot evaluation over the test split: collect the English source, the
# model's output and the German reference for every row, then save as CSV.
input_en, output_de, gt_de = [], [], []

for r in data_test:
    input_en.append(r["en"])
    out = generate_response(user_input=r["en"])
    # Keep only what follows the last 'assistant:' marker (the whole text if
    # the marker is absent), trimmed of surrounding whitespace.
    out = str(out).rpartition('assistant:')[2].strip()
    output_de.append(out)
    gt_de.append(r["de"])

test_translation = dict(input_en=input_en, output_de=output_de, gt_de=gt_de)
test_translation_df = pd.DataFrame(test_translation)
test_translation_df.to_csv('gemma_zeroshot_multi30k_en_ger.csv')

In [31]:
# Spot-check one example: print the raw decoded output for the first test
# sentence. The prompt is still part of the text at this point; the evaluation
# loop keeps only what follows 'assistant:'.
out=generate_response(user_input=data_test[0]['en'])
print(out)

Translate from English to German: A man in an orange hat starring at something.

assistant:
a woman who is a red
