In [1]:
import json

# Load the raw data: a flat JSON list that this cell reads pairwise, treating
# even positions as user messages and odd positions as assistant replies.
with open("training_data.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# Every user message needs a matching assistant reply. Fail early with a clear
# message instead of an IndexError on a dangling last element.
if len(data) % 2 != 0:
    raise ValueError(
        f"training_data.json must contain an even number of messages, got {len(data)}"
    )

# Build the dataset: one {"messages": [user, assistant]} entry per pair.
dataset = [
    {"messages": [user_msg, assistant_msg]}
    for user_msg, assistant_msg in zip(data[::2], data[1::2])
]

# Write one JSON object per line, keeping non-ASCII text unescaped.
with open("chatml_dataset.jsonl", "w", encoding="utf-8") as f:
    for entry in dataset:
        f.write(json.dumps(entry, ensure_ascii=False) + "\n")


In [2]:
from datasets import load_dataset

# Load the .jsonl file as the "train" split.
# NOTE(review): this rebinds `dataset`, which the first cell used for a plain Python list.
dataset = load_dataset("json", data_files="chatml_dataset.jsonl", split="train")

  from .autonotebook import tqdm as notebook_tqdm
Generating train split: 60 examples [00:00, 2860.57 examples/s]


In [3]:
import json


def json_to_chatml(messages):
    """Render a list of chat messages as one ChatML-formatted string.

    Every message becomes an ``<|im_start|>`` / ``<|im_end|>`` block holding
    the role on the first line and the content after it; the blocks are joined
    with newlines.

    Assistant content that parses as JSON is re-serialized with a two-space
    indent. Non-ASCII characters are kept as-is (``ensure_ascii=False``) so the
    training text contains the original words rather than ``\\uXXXX`` escapes.
    Assistant content that is not valid JSON is passed through unchanged.

    Args:
        messages: Iterable of dicts with ``"role"`` and ``"content"`` keys.

    Returns:
        The ChatML text, or an empty string when ``messages`` is empty.
    """
    chatml = []
    for message in messages:
        role = message["role"]
        content = message["content"]
        formatted_content = content

        # Only assistant turns are expected to carry JSON payloads.
        if role == "assistant":
            try:
                content_json = json.loads(content)
            except json.JSONDecodeError:
                # Not JSON: keep the raw text.
                pass
            else:
                # Pretty-print without escaping non-ASCII characters.
                formatted_content = json.dumps(
                    content_json, indent=2, ensure_ascii=False
                )

        chatml.append(f"<|im_start|>{role}\n{formatted_content}<|im_end|>")

    return "\n".join(chatml)

In [4]:
def process_dataset(sample):
    """Convert one dataset row into its ChatML training text.

    Args:
        sample: A row exposing the conversation under the ``"messages"`` key.

    Returns:
        A dict with a single ``"text"`` key holding the ChatML string.
    """
    chatml_text = json_to_chatml(sample["messages"])
    return dict(text=chatml_text)


# Apply process_dataset to every row of the loaded dataset.
chat_dataset = dataset.map(process_dataset)

Map: 100%|██████████| 60/60 [00:00<00:00, 1331.54 examples/s]


In [5]:
# Example: show the first 3 ChatML texts, each dumped as a JSON string literal.
preview_texts = chat_dataset[:3]["text"]
for chatml_text in preview_texts:
    serialized = json.dumps(chatml_text, indent=2, ensure_ascii=False)
    print(serialized)

"<|im_start|>user\nCategory: positivity\nText: the weather is good tonight, but im too tired.<|im_end|>\n<|im_start|>assistant\n{\n  \"predicted_class\": \"low\",\n  \"class_to_words\": {\n    \"high\": [\n      \"good\"\n    ],\n    \"medium\": [\n      \"tonight\"\n    ],\n    \"low\": [\n      \"tired\",\n      \"but\"\n    ]\n  },\n  \"class_to_probabilities\": {\n    \"high\": 0.25,\n    \"medium\": 0.25,\n    \"low\": 0.5\n  }\n}<|im_end|>"
"<|im_start|>user\nCategory: negativity\nText: I failed the exam and feel terrible.<|im_end|>\n<|im_start|>assistant\n{\n  \"predicted_class\": \"high\",\n  \"class_to_words\": {\n    \"high\": [\n      \"failed\",\n      \"terrible\"\n    ],\n    \"medium\": [\n      \"exam\"\n    ],\n    \"low\": [\n      \"feel\"\n    ]\n  },\n  \"class_to_probabilities\": {\n    \"high\": 0.6,\n    \"medium\": 0.3,\n    \"low\": 0.1\n  }\n}<|im_end|>"
"<|im_start|>user\nCategory: excitement\nText: I can't wait for the concert! It's going to be amazing!<|im

In [None]:
import os

import torch
from dotenv import load_dotenv
from peft import prepare_model_for_kbit_training
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# Pull settings from a .env file into the process environment.
load_dotenv()

# Fail early with a clear message: an unset variable would otherwise be passed
# to from_pretrained() as None and surface as a much less obvious error.
model_name = os.getenv("LLM_MODEL_NAME")
if not model_name:
    raise RuntimeError(
        "LLM_MODEL_NAME is not set; define it in the environment or in .env"
    )
# The token stays optional (None when HF_TOKEN is unset).
hf_token = os.getenv("HF_TOKEN")

if torch.cuda.is_available():
    device = "cuda"
    # 4-bit NF4 quantization with double quantization; compute in bfloat16.
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16,
    )
else:
    device = "cpu"
    # No quantization config is built without CUDA.
    bnb_config = None

tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    token=hf_token,
)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    token=hf_token,
    quantization_config=bnb_config,
    device_map="auto",  # quantization requires automatic placement across cpu and gpu
)

# Prepare the (possibly quantized) model for adapter training. The order of
# these three calls is kept exactly as in the original cell.
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)
model.enable_input_require_grads()


Loading checkpoint shards: 100%|██████████| 2/2 [00:28<00:00, 14.09s/it]


In [7]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.

    Counts are summed from ``param.numel()`` over ``model.named_parameters()``;
    a parameter counts as trainable when its ``requires_grad`` is true. For a
    model without any parameters the percentage is reported as 0.0 instead of
    raising ``ZeroDivisionError``.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        count = param.numel()
        all_param += count
        if param.requires_grad:
            trainable_params += count

    # Guard the division so an empty model does not crash the report.
    trainable_pct = 100 * trainable_params / all_param if all_param else 0.0
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {trainable_pct}"
    )

In [None]:
from peft import LoraConfig, get_peft_model

# LoRA settings: rank-8 adapters (alpha 16, dropout 0.05) on the listed
# projection modules, with bias set to "none".
lora_target_modules = ["qkv_proj", "o_proj", "gate_up_proj", "down_proj"]
config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=lora_target_modules,
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
)

# Wrap the model with the adapters and report the trainable parameter share.
# NOTE(review): `model` is rebound here, so re-running this cell would pass the
# already wrapped model to get_peft_model again.
model = get_peft_model(model, config)
print_trainable_parameters(model)

trainable params: 11534336 || all params: 2236943360 || trainable%: 0.5156293273335272


In [9]:
from datasets import Dataset
from transformers import DataCollatorForLanguageModeling, Trainer, TrainingArguments

# Use EOS as the padding token (original note: a GPT2Tokenizer peculiarity).
tokenizer.pad_token = tokenizer.eos_token

# Tokenize every ChatML text in one batch, padded to the longest example.
inputs = tokenizer(
    chat_dataset[:]["text"], return_tensors="pt", padding=True, truncation=True
)

# Keep the attention mask next to the token ids. With only `input_ids` in the
# dataset, the mask for the pre-padded rows would be rebuilt later without
# knowledge of which positions are padding, so the model would attend to pad
# tokens (which matters in particular for left-padding tokenizers).
train_dataset = Dataset.from_dict(
    {
        "input_ids": inputs["input_ids"],
        "attention_mask": inputs["attention_mask"],
    }
)


training_args = TrainingArguments(
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    warmup_steps=2,  # warm-up steps to help the Adam optimizer
    max_steps=10,
    learning_rate=2e-4,
    bf16=True,
    logging_steps=1,
    output_dir="outputs",
    optim="paged_adamw_8bit",  # reduces GPU memory pressure and speeds up training
    report_to="mlflow",
    # `label_names` must name the label keys of the batch. The causal-LM
    # collator below produces them under "labels"; "input_ids" and
    # "attention_mask" are model inputs, not labels.
    label_names=["labels"],
)

trainer = Trainer(
    model=model,
    processing_class=tokenizer,
    args=training_args,
    train_dataset=train_dataset,
    # mlm=False: causal language modelling.
    data_collator=DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False),
)


In [10]:
# Few-shot prompt sent ahead of every query: a system message with the output
# rules, one example user query, and the example assistant answer.
rules = [
    {
        "role": "system",
        "content": """You are an AI assistant that returns ONLY JSON answers. If you output anything but JSON you will have FAILED. Follow these rules:
        - Output only valid JSON.
        - JSON must include:
        - Classify only words/phrases that are related to the category.
        - predicted_class: the level ("high", "medium", or "low") with the highest probability among all levels.
        - class_to_words: a mapping of each level ("high", "medium", "low") to a list of words or phrases from the text.
        - class_to_probabilities: a mapping of each level ("high", "medium", "low") to its probability.
        - divide the words by their respective class and probability.
        - If word is neutral to a category(names, conjunctions etc.), predicted class should be "low".
        - Use the source language without reinterpretation.
        - One word/phrase can only be in one class. Don't repeat same words.
        - Do not mention or classify the provided context.
        - Avoid responses with any text outside json.
        Keep your answer concise.""",
    },
    {
        "role": "user",
        "content": """
        Category: positivity
        Text: the weather is good tonight, but im too tired.""",
    },
    # The example answer must itself be valid JSON, since the system message
    # demands valid JSON. The previous version had missing commas, doubled
    # quotes around "but" and a trailing comma. Its content now matches the
    # answer for this same text in the fine-tuning data, which also honours
    # the rule above that conjunctions such as "but" belong to "low".
    {
        "role": "assistant",
        "content": """
    {
      "predicted_class": "low",
      "class_to_words": {
        "high": ["good"],
        "medium": ["tonight"],
        "low": ["tired", "but"]
      },
      "class_to_probabilities": {
        "high": 0.25,
        "medium": 0.25,
        "low": 0.5
      }
    }
    """,
    },
]

# User-message template; {category} and {text} are filled in per request.
template = """
    Category: {category}
    Text: {text}
"""

In [11]:
import mlflow
import pydantic
from transformers import pipeline


# Input schema for one classification request. It is the element type of
# TextGenerator.predict's `model_input` hint and is read by process_input.
class Request(pydantic.BaseModel):
    category: str  # category to rate the text against; fills {category} in `template`
    text: str  # text to classify; fills {text} in `template`


def process_input(request, input: Request):
    """Append the user query built from ``input`` to ``request`` in place.

    Args:
        request: List of chat messages; it is mutated, not copied.
        input: Request whose ``category`` and ``text`` fill ``template``.

    Returns:
        None.
    """
    user_turn = {
        "role": "user",
        "content": template.format(category=input.category, text=input.text),
    }
    request.append(user_turn)


class TextGenerator(mlflow.pyfunc.PythonModel):
    """MLflow pyfunc wrapper around a transformers text-generation pipeline.

    For every request, the few-shot ``rules`` conversation is extended with
    the formatted user query and passed to the pipeline.
    """

    def __init__(self, generation_args):
        """Store the keyword arguments forwarded to every pipeline call."""
        self.generation_args = generation_args

    def load_context(self, context):
        """Build the pipeline from the ``model`` and ``tokenizer`` artifacts."""
        self.pipeline = pipeline(
            "text-generation",
            model=context.artifacts["model"],
            tokenizer=context.artifacts["tokenizer"],
        )

    def predict(self, context, model_input: list[Request]) -> list:
        """Generate one answer per request.

        Args:
            context: Unused here; part of the pyfunc ``predict`` signature.
            model_input: Requests to classify.

        Returns:
            A list with the ``"generated_text"`` of the first pipeline result
            for each request, in input order.
        """
        results = []
        for item in model_input:
            # Work on a copy of the shared prompt. process_input appends in
            # place, so aliasing the global `rules` list would make it grow on
            # every call and leak earlier queries into later prompts.
            request = list(rules)
            process_input(request, item)

            generated = self.pipeline(request, **self.generation_args)
            results.append(generated[0]["generated_text"])
        return results

In [12]:
import mlflow

# Reset the fluent module's `_tracking_uri` attribute, then point MLflow at a
# local folder.
mlflow.tracking.fluent._tracking_uri = None
tracking_uri = "file:///mlruns"
mlflow.set_tracking_uri(tracking_uri)

# Select the experiment; kept as the last expression so the cell displays it.
experiment_name = "LoRA Fine-tuning"
mlflow.set_experiment(experiment_name)


<Experiment: artifact_location='file:c:/mlruns/645640183581576761', creation_time=1745075482500, experiment_id='645640183581576761', last_update_time=1745075482500, lifecycle_stage='active', name='LoRA Fine-tuning', tags={}>

In [None]:
import mlflow
import peft
import torch
import transformers

# The tracking backend can be pointed at PostgreSQL:
#   mlflow server --backend-store-uri="postgres://username@hostname:port/database" --default-artifact-root=s3://your-bucket --host=0.0.0.0 --port=5000
# or at SQLite:
#   mlflow server --backend-store-uri sqlite:///mydb.sqlite
# mlflow.set_tracking_uri(uri="http://127.0.0.1:8080")
# Browse local runs with: mlflow ui --backend-store-uri file:///mlruns


# Pin the packages the logged model needs. torch builds installed from the
# PyTorch index report a local version suffix (for example "+cu121" or "+cpu").
# Such versions are not published on PyPI, so pinning them verbatim makes the
# environment install fail; only the public part of the version is pinned.
requirements = [
    f"torch=={torch.__version__.split('+')[0]}",
    f"transformers=={transformers.__version__}",
    f"peft=={peft.__version__}",
]

with mlflow.start_run():
    # Train, then log the contents of the local "outputs" directory under
    # the "checkpoints" artifact path.
    trainer.train()
    mlflow.log_artifacts("outputs", artifact_path="checkpoints")

    # Adapter from the last training step.
    # NOTE(review): "checkpoint-10" is hard-coded; it presumably has to match
    # max_steps=10 and output_dir="outputs" in the TrainingArguments - confirm
    # and keep them in sync.
    model.load_adapter("./outputs/checkpoint-10", adapter_name="lora_adapter_base")
    # Merge LoRA into the base model.
    merged_model = model.merge_and_unload()
    # Save the merged model and the tokenizer to local folders.
    merged_model.save_pretrained("./model")
    tokenizer.save_pretrained("./tokenizer")

    # Keyword arguments handed to TextGenerator and forwarded to the pipeline.
    generation_args = {
        "max_new_tokens": 200,
        "return_full_text": False,
        "do_sample": False,
    }

    # mlflow.log_metric("PPL", eval_results["eval_loss"])
    # Log the pyfunc wrapper together with the saved model/tokenizer folders.
    # `model_info` is used by a later cell to load the model back.
    model_info = mlflow.pyfunc.log_model(
        artifact_path="model",
        artifacts={
            "model": "./model",
            "tokenizer": "./tokenizer",
        },
        python_model=TextGenerator(generation_args),
        pip_requirements=requirements,
    )

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
  return fn(*args, **kwargs)


Step,Training Loss
1,2.0029
2,2.0055
3,1.8792
4,1.5792
5,1.4714
6,1.343
7,1.2225
8,1.2019
9,1.1338
10,1.1574


2025/04/20 20:09:52 INFO mlflow.models.signature: Inferring model signature from type hints
2025/04/20 20:09:52 INFO mlflow.models.signature: Failed to infer output type hint, setting output schema to AnyType. Type hint `list` doesn't contain a collection element type. Fix by adding an element type to the collection type definition, e.g. `list[str]` instead of `list`.
Downloading artifacts: 100%|██████████| 3/3 [00:20<00:00,  6.76s/it]   
Downloading artifacts: 100%|██████████| 6/6 [00:00<00:00, 54.44it/s]  


In [20]:
# Sample request used by the prediction cells that follow.
input_data = Request(
    category="Неопределённость",
    text="Неизвестность грядущего пугает людей",
)

In [None]:
import mlflow.pyfunc

# model_uri = "runs:/e9e2c229df784ab2a10007f290ba8e19/model"

# Load the pyfunc model via the URI stored in `model_info`.
# NOTE(review): this rebinds `model`, which earlier cells use for the PEFT model.
model = mlflow.pyfunc.load_model(model_info.model_uri)

# Run the single sample request and print the first result.
output = model.predict([input_data])
print(output[0])

In [None]:
from rouge import Rouge

# Perplexity via trainer.evaluate() is currently disabled:
# import math
# eval_results = trainer.evaluate()
# print("Perplexity:", round(math.exp(eval_results["eval_loss"]), 2))


# `output` is a list of generated strings, while the reference is one string.
# Rouge.get_scores needs hypotheses and references of the same type and
# length, so the reference is repeated once per hypothesis.
reference = "Что ждёт нас в будущем?"

rouge = Rouge()
scores = rouge.get_scores(output, [reference] * len(output), avg=True)

ModuleNotFoundError: No module named 'rouge'

In [None]:
import mlflow

# NOTE(review): hard-coded run ID; `model_info.model_uri` from the training
# cell may be the intended value - confirm.
model_uri = "runs:/e9e2c229df784ab2a10007f290ba8e19/model"


# Verify the model with the provided input data using the logged dependencies.
# For more details, refer to:
# https://mlflow.org/docs/latest/models.html#validate-models-before-deployment
mlflow.models.predict(
    model_uri=model_uri,
    input_data=[input_data],
    env_manager="uv",
)

Downloading artifacts: 100%|██████████| 14/14 [00:19<00:00,  1.39s/it]  
2025/04/20 19:29:38 INFO mlflow.models.flavor_backend_registry: Selected backend for flavor 'python_function'
2025/04/20 19:29:38 INFO mlflow.utils.virtualenv: Creating a new environment in C:\Users\pafin\AppData\Local\Temp\tmpzjc8jy9l\envs\virtualenv_envs\mlflow-2e75ae3718e46febb49eb9d0be69455d33e39c31 with python version 3.13.2 using uv
2025/04/20 19:29:38 INFO mlflow.utils.virtualenv: Installing dependencies


ShellCommandException: Non-zero exit code: 1
Command: ['cmd', '/c', 'C:\\Users\\pafin\\AppData\\Local\\Temp\\tmpzjc8jy9l\\envs\\virtualenv_envs\\mlflow-2e75ae3718e46febb49eb9d0be69455d33e39c31\\Scripts\\activate.bat & uv pip install --prerelease=allow -r requirements.b493f1d856bd4773a5e9f57f55c101c5.txt']

In [None]:
import mlflow
import pandas as pd

# NOTE(review): hard-coded run ID with a "merged_model" artifact path, whereas
# the training cell in this notebook logs under artifact_path="model" -
# presumably left over from an earlier experiment; confirm.
logged_model = "runs:/f7f6514ad4394e32a32a6dca3dcdb9ea/merged_model"

# Load model as a PyFuncModel.
loaded_model = mlflow.pyfunc.load_model(logged_model)

# NOTE(review): elsewhere in this notebook predict is called with a list of
# Request objects ([input_data]); wrapping the Request in a DataFrame here looks
# inconsistent - confirm the input format this model expects.
loaded_model.predict(pd.DataFrame(input_data))

In [None]:
# Fetch every experiment, including deleted ones.
include_deleted = mlflow.entities.ViewType.ALL
all_experiments = mlflow.search_experiments(view_type=include_deleted)

# Print the ID, name and lifecycle stage of each experiment.
for exp in all_experiments:
    summary = f"ID: {exp.experiment_id}, Name: {exp.name}, Lifecycle: {exp.lifecycle_stage}"
    print(summary)

In [None]:
# Restore an experiment by its hard-coded ID.
mlflow.restore_experiment(experiment_id="522564765305824673")

In [None]:
# Look up the experiment by name and delete it if it exists.
exp = mlflow.get_experiment_by_name("LoRA Fine-tuning")
if exp:
    # NOTE(review): the original comment called this a "full deletion", but this
    # notebook also lists deleted experiments and restores one by ID, which
    # suggests the delete is reversible - confirm against the MLflow docs.
    mlflow.delete_experiment(exp.experiment_id)