# LLM Adaptation on Russian Language

### Dependencies

In [None]:
%%capture
# Install Unsloth and its companion packages; %%capture hides the installer output.
import os
# Colab is detected by looking for "COLAB_" in the concatenated environment variable names.
if "COLAB_" not in "".join(os.environ.keys()):
    !pip install unsloth
else:
    # Do this only in Colab notebooks! Otherwise use pip install unsloth
    # The packages are installed with --no-deps and xformers is pinned to 0.0.29.post3.
    !pip install --no-deps bitsandbytes accelerate xformers==0.0.29.post3 peft trl triton cut_cross_entropy unsloth_zoo
    !pip install sentencepiece protobuf "datasets>=3.4.1" huggingface_hub hf_transfer
    !pip install --no-deps unsloth

In [None]:
# Clone the unsloth/Qwen2.5-1.5B repository from the Hugging Face Hub.
# NOTE(review): presumably git-lfs must be installed for the weight files to be fetched;
# installation instructions: https://github.com/git-lfs/git-lfs/blob/main/INSTALLING.md
!git clone https://huggingface.co/unsloth/Qwen2.5-1.5B

In [None]:
# Shallow-clone lm-evaluation-harness and install it in editable mode (provides the `lm-eval` CLI used below).
!git clone --depth 1 https://github.com/EleutherAI/lm-evaluation-harness
!cd lm-evaluation-harness && pip install -e .

In [None]:
# Clone llama.cpp and build it with CMake (Release configuration) into ./llama.cpp/build.
!git clone https://github.com/ggml-org/llama.cpp
!cd llama.cpp && cmake -B build && cmake --build build --config Release

In [None]:
# Install or upgrade mistral_common and sentencepiece quietly.
# NOTE(review): presumably required by the GGUF conversion step later on — confirm.
!pip install -U mistral_common sentencepiece -q

In [None]:
import torch
import openai
from datasets import load_dataset
from trl import SFTConfig, SFTTrainer
from unsloth import FastLanguageModel
from transformers import AutoTokenizer

### Base model 

#### FastLanguageModel 

In [None]:
# Sequence length shared by model loading and, later, the SFT trainer.
max_seq_length = 1280

# Device that tokenized inputs are moved to before generation.
if torch.cuda.is_available():
    device = 'cuda:0'
else:
    device = 'cpu'

# Load the base checkpoint and its tokenizer with 4-bit weights.
model, tokenizer = FastLanguageModel.from_pretrained(
    load_in_4bit=True,
    max_seq_length=max_seq_length,
    model_name='unsloth/Qwen2.5-1.5B',
)

#### Base model inference

In [None]:
# Switch the Unsloth model to inference mode before calling `generate`.
FastLanguageModel.for_inference(model)
# Take the tokenizer (and its chat template) from the official repo of the SAME
# checkpoint size as the loaded model, instead of the unrelated 7B repo.
qwen_tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-1.5B")

def generate_answer(
    prompt: str,
    apply_chat_template: bool = False,
    max_new_tokens: int = 32
):
    """Generate a completion for ``prompt`` with the notebook-level ``model``.

    Args:
        prompt: Text to complete.
        apply_chat_template: When true, ``prompt`` is wrapped as a single user
            message and rendered with ``qwen_tokenizer``'s chat template
            (generation prompt included) before tokenization.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The decoded continuation only: the prompt tokens are sliced off and
        special tokens are skipped.
    """
    text = prompt
    if apply_chat_template:
        text = qwen_tokenizer.apply_chat_template(
            [{"role": "user", "content": prompt}],
            add_generation_prompt=True,
            tokenize=False,
        )

    encoded = qwen_tokenizer(text, return_tensors='pt')
    inputs = {name: tensor.to(device) for name, tensor in encoded.items()}
    # Number of prompt tokens; everything after this index is newly generated.
    prompt_length = inputs['input_ids'].shape[1]

    generated = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        use_cache=True,
        pad_token_id=qwen_tokenizer.eos_token_id,
    )
    completion_ids = generated[0, prompt_length:]
    return qwen_tokenizer.decode(completion_ids, skip_special_tokens=True)

In [None]:
separator = '--' * 100

# Russian prompt: raw completion only, followed by a separator and a blank line.
print(generate_answer("Расскажи мне про Булгакова", apply_chat_template=False, max_new_tokens=128))
print(separator, end='\n\n')

# English prompt: raw completion, separator, chat-templated completion, blank line.
for use_template, trailer in ((False, separator), (True, '')):
    print(generate_answer("Tell me about Bulgakov", apply_chat_template=use_template, max_new_tokens=128))
    print(trailer)

# Chinese prompt: raw completion, separator, chat-templated completion.
print(generate_answer("請講講布爾加科夫", apply_chat_template=False, max_new_tokens=128))
print(separator)
print(generate_answer("請講講布爾加科夫", apply_chat_template=True, max_new_tokens=128))

In [None]:
# Fixed set of Russian prompts used to compare answers before and after fine-tuning.
# The strawberry prompt previously read "для того, что посчитать"; the conjunction
# is corrected to "чтобы" so the model is not evaluated on a malformed question.
prompts_bucket = [
    'Если бы ты проектировал летательный аппарат будущего то какие возможности ты бы в него заложил?',
    'Чем отличаются технологии mini grid, micro grid и smart grid в электроэнергетике?',
    'Какие существуют методы реагирования на риски?',
    'Какое еще планирование кроме стратегического бывает?',
    'Расскажи как профессионал в области фармаконадзора что такое план управления рисками?',
    'Корзина без дна стоит в коробке, которая стоит на земле. Я кладу в корзину три яблока и ставлю ее на стол. Где яблоки?',
    'Какие закономерности ты видишь в строке "19, 58, 29, 88, 44, 22, 11, 34, 17, 52, 26, 13, 40, 20, 10, 5, 16, 8, 4, 2, 1"?',
    'Что нужно сделать для того, чтобы посчитать сколько букв r в слове strawberry?',
    'Где лето теплое, но зима суровая, преобладают хвойные растения, животный мир разнообразен?',
    'Принимая на себя роль возможного ASI, важно ли было сообщество, связанное общим духовным опытом?'
]

In [None]:
# Base-model answers for every benchmark prompt (chat template applied).
# The token budget is set to 128 to match the budget used when the fine-tuned
# model is queried later; the function default of 32 would truncate these answers
# and make the side-by-side comparison unfair.
base_results_bucket = [
    generate_answer(prompt, apply_chat_template=True, max_new_tokens=128)
    for prompt in prompts_bucket
]
# `results_bucket` is rebound further down in the notebook; the dedicated name
# above keeps the base-model answers available for comparison.
results_bucket = base_results_bucket

#### Benchmarking with LM Evaluation Harness

In [None]:
# from huggingface_hub import notebook_login
# notebook_login()

In [None]:
%%writefile ./data/llm_adaptation_files/run_lmeh.sh
# Evaluate the base unsloth/Qwen2.5-1.5B checkpoint (loaded in 4-bit) with lm-eval:
# zero-shot, --limit 500, tasks truthfulqa_ru_mc1, piqa and winogrande.
# Results are written to results_qwen25_4bit.json.
lm-eval \
  --model hf \
  --model_args pretrained=unsloth/Qwen2.5-1.5B,trust_remote_code=True,load_in_4bit=True,bnb_4bit_compute_dtype=bfloat16,max_length=2048 \
  --tasks truthfulqa_ru_mc1,piqa,winogrande \
  --device cuda:0 \
  --batch_size auto \
  --num_fewshot 0 \
  --limit 500 \
  --output_path results_qwen25_4bit.json

In [None]:
# Run the base-model benchmark script written by the previous cell.
!bash ./data/llm_adaptation_files/run_lmeh.sh

**Результаты замеров:**  
piqa: 0.738  
truthfulqa_ru_mc1: 0.296  
winogrande: 0.618

### New model

#### Data Preparation

Будем использовать шаблон

```text
<|im_start|>user
Hi there!<|im_end|>
<|im_start|>assistant
Nice to meet you!<|im_end|>
<|im_start|>user
Can I ask a question?<|im_end|>
```

https://huggingface.co/docs/transformers/en/chat_templating

In [None]:
def formatting_func(example):
    """Render one dataset row as a single ChatML-formatted training string.

    Every turn of ``example["conversation"]`` becomes
    ``<|im_start|>{role}\\n{content}<|im_end|>``; turns are joined with newlines
    and the result ends with a trailing newline. Content is stripped of
    surrounding whitespace.

    Turns of any role are rendered (not only ``user`` and ``assistant``), so
    e.g. ``system`` turns are no longer silently dropped and the training text
    matches what a role-agnostic ChatML chat template produces at inference.

    Args:
        example: Mapping with a ``"conversation"`` list of
            ``{"role": str, "content": str}`` dicts.

    Returns:
        ``{"text": <formatted conversation>}``.
    """
    parts = []
    for turn in example["conversation"]:
        role = turn["role"]
        content = turn["content"].strip()
        parts.append(f"<|im_start|>{role}\n{content}<|im_end|>")
    return {"text": "\n".join(parts) + "\n"}

In [None]:
# Load the train split and add the formatted "text" column row by row.
vikhr_dataset = load_dataset(
    "Vikhrmodels/GrandMaster-PRO-MAX",
    split="train",
).map(formatting_func, batched=False)

#### Model Training
##### QLoRA 
[TRL SFT Trainer](https://huggingface.co/docs/trl/sft_trainer)

In [None]:
# Wrap the loaded model with LoRA adapters (r=8, lora_alpha=8, no dropout, no bias)
# on the listed projection modules.
model = FastLanguageModel.get_peft_model(
    model,
    r=8,
    lora_alpha=8,
    lora_dropout=0,
    bias="none",
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ],
    use_gradient_checkpointing=False,
    use_rslora=False,
    loftq_config=None,
)

##### Training

In [None]:
# Set `_flag_for_generation = True` on the model and on every nested `.model`
# beneath it, walking down until an object without a `.model` attribute is reached.
# NOTE(review): presumably a workaround for Unsloth's inference/training mode
# switching after `for_inference` was called earlier — confirm it is still needed.
m = model
while True:
    m._flag_for_generation = True
    if not hasattr(m, "model"):
        break
    m = m.model

In [None]:
# Mixed precision: bf16 on GPUs with compute capability >= 8, fp16 otherwise.
# The model was loaded with its dtype left to auto-detection, so hard-coding
# fp16=True can conflict with a bf16-loaded model on newer GPUs.
use_bf16 = torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 8

# Short demonstration run. Warm-up covers ~10% of it; the previous
# `warmup_steps=30` equalled `max_steps`, so the whole run was warm-up, and it
# also overrode `warmup_ratio`.
MAX_TRAIN_STEPS = 30
WARMUP_STEPS = max(1, MAX_TRAIN_STEPS // 10)

sft_cfg = SFTConfig(
    output_dir = "qwen-qlora-sft",
    num_train_epochs = 1,  # has no effect while max_steps > 0
    per_device_train_batch_size = 4,
    gradient_accumulation_steps = 4,
    optim = "paged_adamw_8bit",
    weight_decay = 0.01,
    fp16 = not use_bf16,
    bf16 = use_bf16,
    # NOTE(review): 2e-3 is high for LoRA fine-tuning (2e-4 is a common choice) — confirm it is intended.
    learning_rate = 2e-3,
    logging_steps = 100,
    save_strategy = "epoch",
    warmup_steps = WARMUP_STEPS,
    lr_scheduler_type = "linear",
    report_to = "none",
    dataloader_num_workers = 8,
    dataloader_pin_memory = True,
    dataloader_prefetch_factor = 4,
    group_by_length = True,
    max_steps = MAX_TRAIN_STEPS,
)

# The trainer reads the "text" column produced by `formatting_func`.
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = vikhr_dataset,
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = True, # Can make training 5x faster for short sequences.
    args = sft_cfg
)

In [None]:
# Report the GPU name, its total memory and the peak reserved memory so far (in GB, 3 decimals).
bytes_per_gb = 1024 ** 3
gpu_stats = torch.cuda.get_device_properties(0)
max_memory = round(gpu_stats.total_memory / bytes_per_gb, 3)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / bytes_per_gb, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

In [None]:
# Run fine-tuning; the value returned by `train()` is kept in `trainer_stats`.
trainer_stats = trainer.train()

In [None]:
# Save the LoRA adapter and the tokenizer into the SAME directory.
# The adapter previously went to a mistyped "lllora_model" folder, so the
# "lora_model" directory that gets archived afterwards held only the tokenizer.
lora_dir = "./data/llm_adaptation_files/lora_model"
model.save_pretrained(lora_dir)
tokenizer.save_pretrained(lora_dir)

In [None]:
# Pack the lora_model directory into a gzip-compressed tarball, listing files as they are added.
!tar -czvf ./data/llm_adaptation_files/lora_model.tar.gz ./data/llm_adaptation_files/lora_model

#### Inference

In [None]:
# Jinja chat template in ChatML form: every message is rendered as
# "<|im_start|>{role}\n{content}<|im_end|>\n"; when `add_generation_prompt` is set,
# an opening "<|im_start|>assistant\n" is appended for the model to continue.
tokenizer.chat_template = """{%- for message in messages -%}
    {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>\n' -}}
{%- endfor -%}
{%- if add_generation_prompt -%}
    {{- '<|im_start|>assistant\n' -}}
{%- endif -%}
"""

In [None]:
# Export the model and tokenizer in GGUF format into the qwen25_15_ru_instruct directory.
# The llama-server command further down loads `unsloth.Q8_0.gguf` from this directory.
model.save_pretrained_gguf("./data/llm_adaptation_files/qwen25_15_ru_instruct", tokenizer)

##### Benchmarking with LM Evaluation Harness

In [None]:
%%writefile ./data/llm_adaptation_files/run_lmeh_new.sh
# Evaluate the fine-tuned model stored in ./data/llm_adaptation_files/qwen25_15_ru_instruct
# with the same lm-eval settings as the base-model run (zero-shot, --limit 500, same tasks).
# Results are written to results_qwen25_custom.json.
lm-eval \
  --model hf \
  --model_args pretrained=./data/llm_adaptation_files/qwen25_15_ru_instruct,trust_remote_code=True,load_in_4bit=True,bnb_4bit_compute_dtype=bfloat16,max_length=2048 \
  --tasks truthfulqa_ru_mc1,piqa,winogrande \
  --device cuda:0 \
  --batch_size auto \
  --num_fewshot 0 \
  --limit 500 \
  --output_path results_qwen25_custom.json

In [None]:
# Run the fine-tuned-model benchmark script written by the previous cell.
!bash ./data/llm_adaptation_files/run_lmeh_new.sh

##### llama.cpp inference
[llama.cpp](https://github.com/ggml-org/llama.cpp/tree/master?tab=readme-ov-file#quick-start)

##### Run server in terminal

cd ./llama.cpp/
./build/bin/llama-server \
  -m ../data/llm_adaptation_files/qwen25_15_ru_instruct/unsloth.Q8_0.gguf \
  --port 8000 \
  --host 0.0.0.0 \
  -c 2048 \
  -ngl 32

In [None]:
# OpenAI-compatible client pointed at the local server on port 8000;
# the API key is a placeholder value.
client = openai.Client(
    api_key="not-needed",
    base_url="http://localhost:8000/v1",
)

# Single smoke-test request; the cell displays the returned message text.
resp = client.chat.completions.create(
    max_tokens=128,
    temperature=0.7,
    model="qwen25_15_ru_instruct",
    messages=[{"role": "user", "content": "Расскажи мне про Булгакова"}],
)
resp.choices[0].message.content

##### Curl

!curl http://127.0.0.1:8000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -H "Authorization: Bearer not-needed" \
  -d '{\
    "model": "qwen25_15_ru_instruct",\
    "messages": [{"role":"user","content":"Расскажи мне про Булгакова"}],\
    "temperature": 0.7,\
    "max_tokens": 128\
  }'

In [None]:
def generate_new_result(
    prompt,
    model_name="qwen25_15_ru_instruct",
    temperature=0.7,
    max_tokens=128,
):
    """Ask the served model a single-turn question through the notebook-level ``client``.

    Args:
        prompt: User message text.
        model_name: Value sent as the ``model`` field of the request.
        temperature: Sampling temperature sent with the request.
        max_tokens: Maximum number of tokens requested for the completion.

    Returns:
        The ``content`` of the first choice's message.
    """
    response = client.chat.completions.create(
        model=model_name,
        messages=[{"role": "user", "content": prompt}],
        temperature=temperature,
        max_tokens=max_tokens,
    )
    return response.choices[0].message.content

In [None]:
# Collect one answer per benchmark prompt via `generate_new_result`.
# NOTE: this rebinds `results_bucket`, which earlier in the notebook held the base-model answers.
results_bucket = list(map(generate_new_result, prompts_bucket))