In [None]:
# !pip install -q bitsandbytes trl sacrebleu wandb

In [None]:
data_dir = "data/en-et.tmx"
source_lang = "English"
target_lang = "Estonian"

local_dir = "./EU_scientific_corpus"
cache_dir = "./models"
# model_name = "mistralai/Mistral-7B-v0.1"
model_name = "google/gemma-2-2b"
output_directory = "gemma-training"

**Download Dataset**

In [None]:
import os
import shutil
from huggingface_hub import snapshot_download

dataset_repo = "FrancophonIA/EU_scientific_corpus"

snapshot_path = snapshot_download(repo_id=dataset_repo, repo_type="dataset")
shutil.copytree(snapshot_path, local_dir, dirs_exist_ok=True)

Fetching 41 files:   0%|          | 0/41 [00:00<?, ?it/s]

'./EU_scientific_corpus'

In [None]:
import torch
from transformers import BitsAndBytesConfig

nf4_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

**Training**

In [3]:
import os
import xml.etree.ElementTree as ET


def parse_tmx(file_path):
    tree = ET.parse(file_path)
    root = tree.getroot()

    tu_list = []
    for tu in root.iter("tu"):
        segs = [tuv.find("seg").text for tuv in tu.findall("tuv")]
        tu_list.append(tuple(segs))
    return tu_list


data_path = os.path.join(local_dir, data_dir)
segments = parse_tmx(data_path)

source_sents = [seg[0] for seg in segments]
target_sents = [seg[1] for seg in segments]

In [None]:
test_cutoff = int(0.9 * len(source_sents))

source_test = source_sents[test_cutoff:]
target_test = target_sents[test_cutoff:]

source_sents = source_sents[:test_cutoff]
target_sents = target_sents[:test_cutoff]

In [5]:
def create_prompts(source_lang, target_lang, source_sents, target_sents):
    prompts = []
    for source, target in zip(source_sents, target_sents):
        source = source_lang + ": " + source
        target = target_lang + ": " + target
        prompt = source + "\n" + target
        prompts.append(prompt)
    return prompts

In [6]:
import random

prompts = create_prompts(source_lang, target_lang, source_sents, target_sents)
random.shuffle(prompts)
print("Num Prompts: ", len(prompts))
print(prompts[0], "\n")
print(prompts[-1])

Num Prompts:  2239
English: MSA registers and databases.
Estonian: SOM registrid ja andmekogud. 

English: Financing from the budget of activity 4.1.
Estonian: Rahastamine tagatakse tegevus 4.1. eelarvest.


In [7]:
from datasets import Dataset, DatasetDict

train_cutoff = int(0.8 * len(prompts))

dataset = DatasetDict(
    {
        "train": Dataset.from_dict({"text": prompts[:train_cutoff]}),
        "validation": Dataset.from_dict({"text": prompts[train_cutoff:]}),
    }
)
dataset

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 1791
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 448
    })
})

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    quantization_config=nf4_config,
    use_cache=False,
    cache_dir=cache_dir,
)

tokenizer = AutoTokenizer.from_pretrained(
    model_name, cache_dir=cache_dir, add_bos_token=True, add_eos_token=False
)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

peft_config = LoraConfig(
    lora_alpha=128, lora_dropout=0.05, r=32, bias="none", task_type="CAUSAL_LM"
)

model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, peft_config)

In [11]:
import wandb

wandb.init(
    project="llm-finetuning-translator",
    name="mistral-7b-finetuning",
    config={
        "learning_rate": 5e-5,
        "epochs": 5,
        "batch_size": 4,
        "model": "mistral-7b",
        "dataset_size": len(dataset),
    },
)

wandb: Currently logged in as: lucas-granucci (lucas-granucci-minnetonka-high-school) to https://api.wandb.ai. Use `wandb login --relogin` to force relogin


In [None]:
from trl import SFTTrainer, SFTConfig


max_seq_length = 512
model.gradient_checkpointing_enable()


training_args = SFTConfig(
    output_dir=output_directory,
    max_seq_length=max_seq_length,
    dataset_text_field="text",
    packing=True,
    # Training schedule
    num_train_epochs=15,
    # Batch sizes
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=4,  # maintain effective batch sizes
    # Learning rates
    learning_rate=1e-5,
    lr_scheduler_type="cosine",
    warmup_ratio=0.05,
    # Regularization
    weight_decay=0.001,
    max_grad_norm=0.5,
    # Evaluation and saving
    eval_strategy="steps",
    eval_steps=50,  # More frequent evaluation
    save_strategy="steps",
    save_steps=50,
    save_total_limit=3,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    # Logging
    report_to="wandb",
    logging_steps=10,
    disable_tqdm=False,
    # Mixed precision
    bf16=True,
    dataloader_pin_memory=False,  # May help with memory on mobile GPU
)


trainer = SFTTrainer(
    model=model,
    args=training_args,
    peft_config=peft_config,
    processing_class=tokenizer,
    train_dataset=dataset["train"],
    eval_dataset=dataset["validation"],
)

Converting train dataset to ChatML:   0%|          | 0/1791 [00:00<?, ? examples/s]

Adding EOS to train dataset:   0%|          | 0/1791 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/1791 [00:00<?, ? examples/s]

Packing train dataset:   0%|          | 0/1791 [00:00<?, ? examples/s]

Converting eval dataset to ChatML:   0%|          | 0/448 [00:00<?, ? examples/s]

Adding EOS to eval dataset:   0%|          | 0/448 [00:00<?, ? examples/s]

Tokenizing eval dataset:   0%|          | 0/448 [00:00<?, ? examples/s]

Packing eval dataset:   0%|          | 0/448 [00:00<?, ? examples/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [13]:
trainer.train()



Step,Training Loss,Validation Loss
50,2.3431,2.287731
100,2.0701,2.099401
150,1.973,2.039675
200,1.9244,2.010251
250,1.7984,2.000989
300,1.7481,1.994575
350,1.6897,1.994583
400,1.6146,2.00653
450,1.6516,2.00906
500,1.6429,2.01176


TrainOutput(global_step=585, training_loss=1.841666166191427, metrics={'train_runtime': 7644.9546, 'train_samples_per_second': 0.6, 'train_steps_per_second': 0.077, 'total_flos': 1.0064814151827456e+17, 'train_loss': 1.841666166191427})

In [14]:
import json

logs = trainer.state.log_history
logs_path = os.path.join(output_directory, "logs.json")

with open(logs_path, "w") as log:
    log.write(json.dumps(logs, indent=2))

**Inference**

In [None]:
import os
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer

peft_model_path = os.path.join(output_directory, "checkpoint-585")
model_base = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=nf4_config,
    device_map="auto",
    cache_dir=cache_dir,
)

tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    cache_dir=cache_dir,
    add_bos_token=True,
    add_eos_token=False,  # always false for inference
)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

new_model = PeftModel.from_pretrained(
    model_base, peft_model_path, is_trainable=True  # config=peft_config,
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
def generate_response(prompt, model):
    encoded_input = tokenizer(prompt, return_tensors="pt", add_special_tokens=True)
    model_inputs = encoded_input.to("cuda")

    model.eval()
    generated_ids = model.generate(
        **model_inputs,
        max_new_tokens=100,
        min_new_tokens=1,
        do_sample=False,
        pad_token_id=tokenizer.eos_token_id
    )

    decoded_output = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
    return decoded_output[0].replace(prompt, "")


def generate_batch_response(prompts, model):

    encoded_inputs = tokenizer(
        prompts,
        return_tensors="pt",
        padding=True,
        truncation=True,
        add_special_tokens=True,
    )
    model_inputs = encoded_inputs.to("cuda")

    model.eval()
    generated_ids = model.generate(
        **model_inputs,
        max_new_tokens=100,
        min_new_tokens=1,
        do_sample=False,
        pad_token_id=tokenizer.eos_token_id
    )

    decoded_outputs = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
    predictions = []
    for prompt, prediction in zip(prompts, decoded_outputs):
        predictions.append(prediction.replace(prompt, ""))
    return predictions

In [17]:
prompt = """English: She had never seen such a beautiful sunset before, with colors ranging from deep orange to soft lavender.
Estonian:"""

generate_response(prompt, new_model)

' Sellepärast ei olnud kui kaunisel aeguväljakul, mis on kõrge oranssi ja lõõpõhja.'

**Evaluation**

In [41]:
def create_eval_prompts(source_lang, target_lang, source_sents):
    prompts = []
    for source in source_sents:
        source = source_lang + ": " + source
        target = target_lang + ": "
        prompt = source + "\n" + target
        prompts.append(prompt)
    return prompts


eval_prompts = create_eval_prompts(source_lang, target_lang, source_test)
eval_prompts = eval_prompts[:5]
predictions = generate_batch_response(eval_prompts, new_model)

A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
