In [None]:
# 
#  Fine-tuning ruDialoGPT on the dialogue dataset (LoRA + SFT)
# 
# 
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments
from datasets import Dataset
from trl import SFTConfig, SFTTrainer, DataCollatorForCompletionOnlyLM
import json
import torch
import os
from peft import LoraConfig

# SECURITY: never hard-code access tokens in a notebook. The Hugging Face token
# must be supplied from outside (e.g. `export HF_TOKEN=...` before starting
# Jupyter, or `huggingface-cli login`). The token that used to be committed on
# this line has to be treated as compromised and revoked in the HF settings.
if not os.environ.get("HF_TOKEN"):
    print("HF_TOKEN is not set; continuing without Hugging Face authentication.")

# Synchronous kernel launches, to get readable stack traces when debugging.
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
# NOTE(review): "0.0" is presumably meant to lift the MPS allocator's upper
# memory limit — confirm against the PyTorch MPS environment-variable docs.
os.environ["PYTORCH_MPS_HIGH_WATERMARK_RATIO"] = "0.0"

# Read the pre-processed dialogue pairs from disk.
with open("dataset/processed_dataset.json", mode="r", encoding="utf-8") as f:
    Vfile = json.load(f)

# Keep only the entries whose two message fields are both plain strings, then
# turn the surviving records into a `datasets.Dataset`.
dataset = list(
    filter(
        lambda entry: isinstance(entry["previous_message"], str)
        and isinstance(entry["next_message"], str),
        Vfile,
    )
)
dataset = Dataset.from_list(dataset)

print(dataset)

# Base checkpoint shared by the model and its tokenizer.
base_checkpoint = "tinkoff-ai/ruDialoGPT-medium"

model = AutoModelForCausalLM.from_pretrained(base_checkpoint, device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(base_checkpoint)

# Dump the module tree (useful for picking LoRA target module names).
print(model)


In [None]:
# Place the model on the Apple-silicon (MPS) device for training.
mps_device = torch.device("mps")
model = model.to(mps_device)

In [None]:
def print_trainable_parameters(model):
    """
    Print how many parameters of `model` are trainable.

    Walks `model.named_parameters()`, sums `numel()` over all parameters and
    over those with `requires_grad` set, and prints both counts together with
    the trainable share in percent. Returns nothing.
    """
    # (element count, is trainable) for every parameter tensor
    sizes = [(p.numel(), p.requires_grad) for _, p in model.named_parameters()]
    total = sum(count for count, _ in sizes)
    trainable = sum(count for count, needs_grad in sizes if needs_grad)
    share = 100 * trainable / total
    print(f"trainable params: {trainable} || all params: {total} || trainable%: {share}")

In [None]:
from peft import LoraConfig, get_peft_model 

# Names of the sub-modules that receive LoRA adapters.
target_modules = ['c_proj', 'c_attn', 'c_fc']

# LoRA hyper-parameters for causal language modelling.
config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=target_modules,
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
)

# NOTE: `model` is rebound to the wrapped model, so re-running this cell
# without reloading the base model would wrap it a second time.
model = get_peft_model(model, config)
print_trainable_parameters(model)

In [None]:
# Rebuild `dataset` from its own column dictionary (`dataset[:]`).
# NOTE(review): presumably done to get a fresh in-memory copy — confirm it is needed.
dataset = Dataset.from_dict(dataset[:])
def formatting_prompts_func(example):
    """
    Build the training text for one dialogue pair.

    Reads `example['previous_message']` and `example['next_message']` and
    returns `{"text": ...}` where the two messages are joined with the
    speaker markers used throughout this notebook.
    """
    previous, reply = example['previous_message'], example['next_message']
    return {"text": f"@@ПЕРВЫЙ@@ {previous} @@ВТОРОЙ@@ {reply} @@ПЕРВЫЙ@@"}

# Add the "text" column to every record, then rebuild the dataset from the
# resulting column dictionary.
dataset = Dataset.from_dict(dataset.map(formatting_prompts_func)[:])

In [None]:
# Marker that precedes the reply inside each formatted "text" sample.
# The completion-only collator is keyed on it (intended to restrict the loss
# to the reply part — see the TRL documentation for the exact masking rules).
response_template = "@@ВТОРОЙ@@"
collator = DataCollatorForCompletionOnlyLM(
    response_template=response_template,
    tokenizer=tokenizer,
)

training_args = TrainingArguments(
    # Where things are written
    output_dir="./output/models3/saves/",  # directory for saved checkpoints
    logging_dir='./logs',                  # directory for logs
    # Checkpointing
    save_strategy="epoch",                 # save after every epoch
    save_total_limit=2,                    # keep at most this many checkpoints
    # Optimisation schedule
    num_train_epochs=3,                    # number of training epochs
    per_device_train_batch_size=2,         # batch size per device
    # Reporting / hardware
    logging_steps=300,                     # logging frequency, in steps
    use_mps_device=True,
)

# NOTE(review): `model` has already been wrapped with `get_peft_model` in an
# earlier cell, so passing `peft_config` here looks redundant — confirm how the
# installed TRL version treats an already-wrapped model.
trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    dataset_text_field="text",
    data_collator=collator,
    peft_config=config,
)

trainer.train()


In [None]:
# Persist the fine-tuned model to the directory the reload cell reads from.
model.save_pretrained(save_directory="output_dir3")

In [None]:
import torch
from peft import PeftModel, PeftConfig
from transformers import AutoModelForCausalLM, AutoTokenizer

# Directory written by `model.save_pretrained(...)` above.
peft_model_id = "output_dir3"

# The saved adapter config records which base checkpoint it was trained on.
# NOTE: this rebinds `config` and `tokenizer` from the training cells.
config = PeftConfig.from_pretrained(peft_model_id)
base_model_name = config.base_model_name_or_path

model2 = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    device_map='auto',
    return_dict=True,
)
tokenizer = AutoTokenizer.from_pretrained(base_model_name)

# Attach the saved LoRA weights to the freshly loaded base model.
model3 = PeftModel.from_pretrained(model=model2, model_id=peft_model_id)

In [None]:
def generate_response(prompt, model, tokenizer):
    """
    Sample a continuation of `prompt` and return it as text.

    Args:
        prompt: Dialogue history as a single string, using the same speaker
            markers as the training data.
        model: Model exposing `parameters()` and `generate(...)`.
        tokenizer: Tokenizer used to encode the prompt and decode the output.

    Returns:
        The decoded first sequence returned by `model.generate` (special
        tokens skipped). Three sequences are requested; only the first one
        is returned.
    """
    # Run on whatever device the model actually lives on instead of assuming
    # "mps": the model may have been placed elsewhere by `device_map="auto"`.
    device = next(model.parameters()).device

    # Let the tokenizer produce the attention mask. Deriving it from
    # `pad_token_id` fails when the tokenizer has no pad token and would mask
    # real prompt tokens whenever the pad id occurs in the prompt.
    encoded = tokenizer(prompt, return_tensors="pt")
    input_ids = encoded["input_ids"].to(device)
    attention_mask = encoded["attention_mask"].to(device)

    outputs = model.generate(
        input_ids,
        attention_mask=attention_mask,
        # Sampling combined with beam search.
        do_sample=True,
        top_k=10,
        top_p=0.95,
        temperature=1.2,
        num_beams=3,
        num_return_sequences=3,
        # Repetition control.
        no_repeat_ngram_size=2,
        repetition_penalty=1.3,
        length_penalty=1.0,
        # Stopping / padding.
        pad_token_id=tokenizer.eos_token_id,
        # NOTE(review): presumably the id of the "@@ПЕРВЫЙ@@" speaker token, so
        # generation stops when the other speaker's turn begins — verify.
        eos_token_id=50257,
        max_new_tokens=60,
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)



# Smoke test: a short two-speaker exchange that ends on the reply marker, so
# the model is asked to produce the second speaker's next message.
demo_prompt = " @@ПЕРВЫЙ@@ Давай тогда я к тебе, по протеину и в зал @@ВТОРОЙ@@ Ок @@ПЕРВЫЙ@@ Ну я у тебя, ты где @@ВТОРОЙ@@"
demo_reply = generate_response(demo_prompt, model3, tokenizer)
print('\n\n', demo_reply)