# [English-to-Hungarian translator LLM](https://github.com/MartinKondor/EnglishToHungarianLLM)

In [1]:
# @markdown # Settings
model_id = "openlm-research/open_llama_7b"
context_size = 2048

# Setup

In [2]:
!pip install -q transformers sentencepiece treelib trl trl[peft] bitsandbytes loralib accelerate
!pip install -q --upgrade git+https://github.com/trisongz/lazyops
!pip install -q git+https://github.com/MartinKondor/jsonl.git
!pip install -q --upgrade typing-extensions

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.3 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.1/1.3 MB[0m [31m1.9 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━[0m [32m1.0/1.3 MB[0m [31m13.9 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m15.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m133.9/133.9 kB[0m [31m18.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.6/92.6 MB[0m [31m19.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m261.4/261.4 kB[0m [31m28.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m521.2/521.2 kB[0m [31m48.0 MB/s[0m eta [36m0:00:00[0m
[2K 

In [3]:
import locale
import gc

import torch

gc.collect()
torch.cuda.empty_cache()

locale.getpreferredencoding = lambda: "UTF-8"
hf_token = "..."
!huggingface-cli login --token $hf_token

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /root/.cache/huggingface/token
Login successful


# Model

In [4]:
from transformers import LlamaTokenizer, LlamaForCausalLM, AutoTokenizer, AutoModelForCausalLM
from accelerate import Accelerator


ModelClass = LlamaForCausalLM if "llama" in model_id else AutoModelForCausalLM
TokenizerClass = LlamaTokenizer if "llama" in model_id else AutoTokenizer

model: ModelClass = ModelClass.from_pretrained(
    model_id,
    # torch_dtype=torch.float16,
    # device_map="auto"
    device_map={"": Accelerator().process_index},
    load_in_8bit=True,
)
tokenizer: TokenizerClass = TokenizerClass.from_pretrained(model_id)
tokenizer.pad_token = tokenizer.eos_token

config.json:   0%|          | 0.00/507 [00:00<?, ?B/s]

pytorch_model.bin.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

pytorch_model-00001-of-00002.bin:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

pytorch_model-00002-of-00002.bin:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/593 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/534k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/330 [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


# Dataset

In [5]:
from typing import List, Dict
from jsonl import jsonl
from lazyops.utils import logger


!git clone https://github.com/MartinKondor/EnglishToHungarianLLM.git
dataset: List[Dict[str, str]] = jsonl.load("EnglishToHungarianLLM/data/data.jsonl")
logger.info(f"{len(dataset)} samples are loaded")

Cloning into 'EnglishToHungarianLLM'...
remote: Enumerating objects: 27, done.[K
remote: Counting objects: 100% (27/27), done.[K
remote: Compressing objects: 100% (27/27), done.[K
remote: Total 27 (delta 7), reused 7 (delta 0), pack-reused 0[K
Receiving objects: 100% (27/27), 22.13 MiB | 11.44 MiB/s, done.
Resolving deltas: 100% (7/7), done.
[1mINFO    [0m [32m2023-11-21 10:39:46.123[0m: [36m__main__[0m:[36m<cell line: 8>[0m: [1m39 samples are loaded[0m


In [6]:
from tqdm import tqdm


inst_prompt_template = """
Below is an instruction that describes a task, paired with an input that provides further context.
Write a response that appropriately completes the request.

### Instruction:
You are an expert English-Hungarian translator. Translate the English sentences in the Input to Hungarian.

### Input:
{input}

### Response:
""".strip()
train_samples = []

for data in tqdm(dataset):
  input = inst_prompt_template.format(input=data["en"])
  output = data["hu"]
  train_samples.append({
      "input": input,
      "output": output
  })

100%|██████████| 39/39 [00:00<00:00, 146443.92it/s]


In [7]:
test_samples = [
    {
        "input": inst_prompt_template.format(input="Senki se tudta, hová lett, mit müvelt."),
        "output": "No one knew where he had gone or what he had done."
    },
    {
        "input": inst_prompt_template.format(input="Csak gyanitják a később történtekből."),
        "output": "They only suspected from what happened."
    },
    {
        "input": inst_prompt_template.format(input="Ezt onnan gyanitják, mert délután két óra tájban szörnyü tragikus eset történt ezen a helyen."),
        "output": "This is suspected because at about two o'clock in the afternoon a terrible tragedy occurred in this place."
    },
]

In [8]:
import json


with open("test.json", "w+") as file:
  json.dump(test_samples, file)

with open("train.json", "w+") as file:
  json.dump(train_samples, file)

# Training

In [9]:
# @markdown ## Test the model before training
output_ids: torch.Tensor = tokenizer(
    test_samples[0]["output"],
    return_tensors="pt"
).input_ids
max_new_tokens: int = 2 * output_ids.size(dim=1)

prompt_ids: torch.Tensor = tokenizer(
    test_samples[0]["input"],
    return_tensors="pt"
).input_ids

output_ids: torch.Tensor = model.generate(
    input_ids=prompt_ids.to("cuda"),
    max_new_tokens=max_new_tokens
)
output_text: str = tokenizer.decode(output_ids[0])
print(output_text)

<s>Below is an instruction that describes a task, paired with an input that provides further context.
Write a response that appropriately completes the request.

### Instruction:
You are an expert English-Hungarian translator. Translate the English sentences in the Input to Hungarian.

### Input:
Senki se tudta, hová lett, mit müvelt.

### Response:
Személyesen nem ismeri, hová lett, mit müvelt.

### Expl


In [10]:
# Determine max_seq_length
max_seq_length: int = 1024
for sample in train_samples + test_samples:
  text = sample["input"] + sample["output"]
  tokens = tokenizer(text, return_tensors="pt").input_ids
  s = tokens.size(dim=1)

  if s > max_seq_length:
    max_seq_length = s

logger.info(f"max_seq_length: {max_seq_length}")

[1mINFO    [0m [32m2023-11-21 10:40:00.356[0m: [36m__main__[0m:[36m<cell line: 11>[0m: [1mmax_seq_length: 1081[0m


In [11]:
from accelerate import Accelerator
from transformers import BitsAndBytesConfig, GenerationConfig, TrainingArguments
from trl import SFTTrainer
from peft import LoraConfig


target_modules = ["q_proj", "v_proj"]
# target_modules = ['q_proj','k_proj','v_proj','o_proj','gate_proj','down_proj','up_proj','lm_head']

output_dir = "./checkpoints"

lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=target_modules
)

training_args = TrainingArguments(
    output_dir=output_dir,
    dataloader_drop_last=True,
    evaluation_strategy="steps",
    max_steps=1500,
    eval_steps=500,
    save_steps=500,
    logging_steps=1,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    learning_rate=1e-4,
    lr_scheduler_type="cosine",
    warmup_steps=100,
    gradient_accumulation_steps=1,
    gradient_checkpointing=False,
    fp16=True,
    bf16=False,
    weight_decay=0.05,
    run_name="llm_en2hu",
    report_to=None,
    ddp_find_unused_parameters=False,
)



In [12]:
def formatting_prompts_func(example):
  return example["input"] + example["output"]


trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=train_samples,
    eval_dataset=test_samples,
    formatting_func=formatting_prompts_func,
    peft_config=lora_config,
    packing=True,
    tokenizer=tokenizer,
    max_seq_length=max_seq_length
)

logger.info("Training...")
trainer.train()

logger.info("Saving last checkpoint of the model")
trainer.model.save_pretrained(os.path.join(output_dir, "final_checkpoint/"))



[1mINFO    [0m [32m2023-11-21 10:40:01.205[0m: [36m__main__[0m:[36m<cell line: 17>[0m: [1mTraining...[0m




OutOfMemoryError: ignored

In [None]:
import os


model.eval()
validation_file_name: str = "validation.jsonl"
if os.path.isfile(validation_file_name):
  os.remove(validation_file_name)

for test_sample in tqdm(test_samples):
  model_input = tokenizer(test_sample["input"], return_tensors="pt").to("cuda")
  output = model.generate(
      model_input.input_ids,
      max_new_tokens=max_seq_length,
      do_sample=True,
      top_p=1.0,
      temperature=1.0,
      top_k=200,
  )
  predicted: str = tokenizer.decode(output[0], skip_special_tokens=True)

  jsonl.append({
      "input": test_sample["input"],
      "output": test_sample["output"],
      "predicted": predicted,
  }, )