# LLM

In [1]:
import os
from dotenv import load_dotenv


# Read a local .env file (if any) into the process environment.
load_dotenv()

# Hugging Face access token; None when HF_TOKEN is not set.
token = os.environ.get("HF_TOKEN")

# Hub identifier of the base checkpoint that the later cells load.
model_name = "mistralai/Mistral-7B-Instruct-v0.3"

In [2]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import torch
from peft import prepare_model_for_kbit_training


def count_parameters(model):
    """Count the parameter elements of ``model``.

    Args:
        model: Object exposing ``parameters()`` whose items provide
            ``numel()`` and a ``requires_grad`` flag.

    Returns:
        tuple: ``(total, trainable)`` -- the summed ``numel()`` over every
        parameter, and over only those with ``requires_grad`` set.

    NOTE(review): for the 4-bit model the total printed further down in this
    notebook is ~3.76B for a 7B checkpoint, so ``numel()`` presumably reflects
    packed quantized storage -- confirm before quoting these numbers.
    """
    total = 0
    trainable = 0
    for param in model.parameters():
        size = param.numel()
        total += size
        if param.requires_grad:
            trainable += size
    return total, trainable


def load_model(model_name):
    """Load a 4-bit quantized causal LM prepared for k-bit training, and its tokenizer.

    The Hugging Face access token is read from the notebook-level ``token``
    variable.

    Args:
        model_name: Hub identifier (or local path) passed to ``from_pretrained``
            for both the model and the tokenizer.

    Returns:
        tuple: ``(tokenizer, model)`` -- note the order, tokenizer first.
    """
    # NF4 4-bit weights with double quantization; bfloat16 is the compute dtype.
    bnb_cfg = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16,
    )

    # Place the quantized weights on GPU 0 at load time. Moving a
    # bitsandbytes-quantized model afterwards with `.to("cuda:0")` is rejected
    # by older transformers/bitsandbytes versions, so the device is chosen
    # here instead of after loading.
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        token=token,
        quantization_config=bnb_cfg,
        device_map={"": 0},
    )
    model.gradient_checkpointing_enable()
    model = prepare_model_for_kbit_training(model)

    tokenizer = AutoTokenizer.from_pretrained(model_name, token=token)

    return tokenizer, model

  from .autonotebook import tqdm as notebook_tqdm


## Fine-tuning на учебнике

In [3]:
# Load the quantized base model and its tokenizer (load_model returns tokenizer first).
tokenizer, model = load_model(model_name)

`low_cpu_mem_usage` was None, now default to True since model is quantized.
Loading checkpoint shards: 100%|██████████| 3/3 [00:07<00:00,  2.34s/it]


In [None]:
from datasets import load_from_disk

# Load the dataset previously saved to disk at this path.
ds = load_from_disk("../../data/textbook_dataset")
# Bare expression so the notebook displays the dataset summary.
ds

In [None]:
def tokenize(batch):
    """Tokenize the ``text`` column of a batch, truncating to 2048 tokens.

    Uses the notebook-level ``tokenizer``. ``truncation=True`` is passed
    explicitly: giving only ``max_length`` relies on the tokenizer's
    backward-compatibility fallback and triggers a warning.

    Args:
        batch: Mapping with a ``"text"`` entry holding the strings to encode.

    Returns:
        The tokenizer output for the batch.
    """
    return tokenizer(batch["text"], truncation=True, max_length=2048)


# Rebinds `ds` to the mapped dataset, which also carries the tokenizer's output columns.
ds = ds.map(tokenize, batched=True)
ds

Dataset({
    features: ['text', 'input_ids', 'attention_mask'],
    num_rows: 210
})

In [4]:
from peft import LoraConfig, get_peft_model


# Names of the modules that receive LoRA adapters.
lora_target_modules = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj",
    "lm_head",
]

# Rank-32 adapters with alpha 64 and 5% adapter dropout; bias terms are left out.
config = LoraConfig(
    task_type="CAUSAL_LM",
    r=32,
    lora_alpha=64,
    lora_dropout=0.05,
    bias="none",
    target_modules=lora_target_modules,
)

# Wrap the loaded model; `model` now refers to the PEFT-wrapped model.
model = get_peft_model(model, config)

# Report parameter counts after the adapters were added.
lora_total, lora_trainable = count_parameters(model)
print(f"[LoRA Model] Total: {lora_total:,}, Trainable: {lora_trainable:,}")

[LoRA Model] Total: 3,843,428,352, Trainable: 85,065,728


In [None]:
from transformers import TrainingArguments


# Root directory for model outputs; the Trainer writes into it directly.
models_dir = "../../models/"

# Training hyperparameters for the textbook run.
args = TrainingArguments(
    output_dir=models_dir,
    optim="adamw_torch",
    learning_rate=1e-5,
    per_device_train_batch_size=1,
    num_train_epochs=4,  # adjust to the size of your dataset
)

In [6]:
from trl import SFTTrainer

# Supervised fine-tuning of the PEFT-wrapped model on the tokenized dataset.
trainer = SFTTrainer(model=model, train_dataset=ds, args=args)

# Last expression of the cell, so the returned training summary is displayed.
trainer.train()

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
  return fn(*args, **kwargs)


Step,Training Loss
500,1.477


  return fn(*args, **kwargs)


TrainOutput(global_step=840, training_loss=1.3055755251929873, metrics={'train_runtime': 1631.0582, 'train_samples_per_second': 0.515, 'train_steps_per_second': 0.515, 'total_flos': 1.904544975991603e+16, 'train_loss': 1.3055755251929873})

In [7]:
# Save only the LoRA adapters (they are very small)
peft_output_dir = os.path.join(models_dir, "mistral-psy-lora")
model.save_pretrained(peft_output_dir)
# Store the tokenizer files in the same directory; as the last expression,
# the returned file paths are displayed by the notebook.
tokenizer.save_pretrained(peft_output_dir)



('../../models/mistral-psy-lora/tokenizer_config.json',
 '../../models/mistral-psy-lora/special_tokens_map.json',
 '../../models/mistral-psy-lora/tokenizer.model',
 '../../models/mistral-psy-lora/added_tokens.json',
 '../../models/mistral-psy-lora/tokenizer.json')

## Fine-tuning на датасетах HF

In [None]:
from peft import PeftModel

# Start this stage from a freshly loaded quantized base model.
tokenizer, model = load_model(model_name)

# Parameter counts of the base model before the adapter is attached.
base_total, base_trainable = count_parameters(model)
print(f"[Base Model] Total: {base_total:,}, Trainable: {base_trainable:,}")

# Attach the previously saved adapter; is_trainable=True is passed so that it
# can be trained further.
# NOTE(review): earlier in this notebook the adapter is saved under
# "../../models/mistral-psy-lora", but it is loaded here from
# "../../models/textbook/mistral-psy-lora" -- confirm the directory was moved
# there, otherwise this cell fails on a fresh Restart & Run All.
# NOTE(review): the stored output of this cell reports "Trainable: 0" for the
# LoRA model despite is_trainable=True -- the output looks stale; re-run and verify.
model = PeftModel.from_pretrained(model, "../../models/textbook/mistral-psy-lora", is_trainable=True)

# Parameter counts after the adapter is attached.
lora_total, lora_trainable = count_parameters(model)
print(f"[LoRA Model] Total: {lora_total:,}, Trainable: {lora_trainable:,}")

`low_cpu_mem_usage` was None, now default to True since model is quantized.
Loading checkpoint shards: 100%|██████████| 3/3 [00:07<00:00,  2.34s/it]


[Base Model] Total: 3,758,362,624, Trainable: 0
[LoRA Model] Total: 3,843,428,352, Trainable: 0


In [None]:
from transformers import TrainingArguments
from trl import SFTTrainer
from datasets import load_from_disk

models_dir = "../../models/"
data_dir = "../../data/datasets_cleaned/"

# Datasets are processed sequentially in this order; the same `model` object
# is trained further on each of them.
dataset_names = [
    "Amod-mental_health_counseling_conversations",
    "entfane-psychotherapy",
    "mrs83-kurtis_mental_health_final",
    "tcabanski-mental_health_counseling_responses",
    "ShenLab-MentalChat16K",
    "Psychotherapy-LLM-PsychoCounsel-Preference",
]


def _run_sft_stage(model, train_dataset, output_dir, learning_rate, eval_dataset=None):
    """Run one two-epoch SFT pass over ``train_dataset`` and return the trainer.

    Args:
        model: Model to train; it is trained in place.
        train_dataset: Split to train on; its length determines ``save_steps``.
        output_dir: Directory the Trainer writes its checkpoints to.
        learning_rate: Learning rate for this stage.
        eval_dataset: Optional split handed to ``SFTTrainer`` as ``eval_dataset``.

    Returns:
        The ``SFTTrainer`` instance after ``train()`` has finished.
    """
    training_args = TrainingArguments(
        output_dir=output_dir,
        per_device_train_batch_size=1,
        num_train_epochs=2,
        learning_rate=learning_rate,
        logging_steps=500,
        # A third of the split length between checkpoints. Clamped to 1
        # because a split with fewer than three rows would otherwise give
        # save_steps=0.
        save_steps=max(1, len(train_dataset) // 3),
        optim="adamw_torch",
    )
    trainer = SFTTrainer(
        model=model,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        args=training_args,
    )
    trainer.train()
    return trainer


for ds_name in dataset_names:
    export_path = models_dir + ds_name + "/"

    ds_dct = load_from_disk(data_dir + ds_name)

    # Stage 1: train on the "train" split.
    # NOTE(review): the "test" split is passed as eval_dataset, but no
    # evaluation strategy is set in TrainingArguments, so presumably no
    # evaluation actually runs -- confirm whether that is intended.
    trainer = _run_sft_stage(
        model,
        ds_dct["train"],
        output_dir=export_path,
        learning_rate=2e-5,
        eval_dataset=ds_dct["test"],
    )

    torch.cuda.empty_cache()

    # Stage 2: continue training on the "test" split with a lower learning
    # rate; checkpoints go to a "test" subdirectory.
    trainer = _run_sft_stage(
        model,
        ds_dct["test"],
        output_dir=export_path + "test",
        learning_rate=1e-5,
    )

    # Save the model state reached after this dataset.
    model.save_pretrained(export_path)

    torch.cuda.empty_cache()

# Save the final state, after all datasets, into the models root.
model.save_pretrained(models_dir)

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
  return fn(*args, **kwargs)


## Загрузка модели

In [3]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel, PeftConfig
import torch

# === 1. 4-bit loading settings ===
bnb_cfg = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# === 2. Paths ===
base_model_name = "mistralai/Mistral-7B-Instruct-v0.3"
# The adapter directory can be overridden with the PEFT_MODEL_DIR environment
# variable; the default keeps the original machine-specific location.
peft_model_dir = os.getenv(
    "PEFT_MODEL_DIR",
    "/mnt/sdb1/home/kygrachev/diploma/models/mistral-psy-lora",
)

# === 3. Load the tokenizer ===
tokenizer = AutoTokenizer.from_pretrained(base_model_name, token=token, use_fast=True)

# === 4. Load the base model in 4-bit ===
# trust_remote_code is deliberately not enabled: the Mistral architecture ships
# with transformers, and the flag would allow executing code from the model repo.
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    quantization_config=bnb_cfg,
    device_map="auto",
    token=token,   # in case a Hugging Face token is required
)

# === 5. Apply the LoRA adapter ===
model = PeftModel.from_pretrained(base_model, peft_model_dir)

# === 6. Optional: for stability at inference ===
# As the last expression of the cell, the returned model is displayed by the notebook.
model.eval()

Loading checkpoint shards: 100%|██████████| 3/3 [00:07<00:00,  2.36s/it]


PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): MistralForCausalLM(
      (model): MistralModel(
        (embed_tokens): Embedding(32768, 4096)
        (layers): ModuleList(
          (0-31): 32 x MistralDecoderLayer(
            (self_attn): MistralAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=32, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=32, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_pro