In [None]:
import os

import torch
from dotenv import load_dotenv
from peft import prepare_model_for_kbit_training
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

load_dotenv()

if torch.cuda.is_available():
    device = "cuda"
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16,
    )
else:
    device = "cpu"
    bnb_config = None
# from models import load_model, load_tokenizer
tokenizer = AutoTokenizer.from_pretrained(
    os.getenv("LLM_MODEL_NAME"),
    token=os.getenv("HF_TOKEN"),
    device_map=device,
)
model = AutoModelForCausalLM.from_pretrained(
    os.getenv("LLM_MODEL_NAME"),
    token=os.getenv("HF_TOKEN"),
    quantization_config=bnb_config,
    device_map=device,
)
# tokenizer = load_tokenizer()
# model = load_model()

model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)


  from .autonotebook import tqdm as notebook_tqdm
Loading checkpoint shards: 100%|██████████| 2/2 [00:29<00:00, 14.82s/it]


In [2]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

In [None]:
import torch
from peft import LoraConfig, get_peft_model

config = LoraConfig(
    r=8,
    lora_alpha=16,
    # "gate_up_proj", "down_proj"
    target_modules=["qkv_proj", "o_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)
# device = "cuda" if torch.cuda.is_available() else "cpu"
# model.unload()
model = get_peft_model(model, config)
print_trainable_parameters(model)

trainable params: 3670016 || all params: 2229079040 || trainable%: 0.16464270374190051


In [23]:
model = model.to(device)

In [17]:
import json

# Загрузка исходных данных
with open("training_data.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# Формирование датасета
dataset = []
for i in range(0, len(data), 2):
    user_msg = data[i]
    assistant_msg = data[i + 1]

    entry = {"messages": [user_msg, assistant_msg]}
    # entry = [data[i], data[i+1]]
    dataset.append(entry)

# Пример: вывод первых 3 элементов
# for example in dataset[:3]:
#     print(json.dumps(example, indent=2, ensure_ascii=False))

with open("chatml_dataset.jsonl", "w", encoding="utf-8") as f:
    for entry in dataset:
        f.write(json.dumps(entry, ensure_ascii=False) + "\n")


In [18]:
from datasets import load_dataset

# Загрузка .jsonl
dataset = load_dataset("json", data_files="chatml_dataset.jsonl", split="train")

Generating train split: 60 examples [00:00, 6989.34 examples/s]


In [None]:
# def process_dataset(sample):
#     sample = tokenizer.apply_chat_template(
#         sample["messages"],
#         tokenize=True,
#         truncation=True,
#         padding="max_length",
#         max_length=512,
#     )

#     return {"text": sample}
print(dataset.data)
tokenized_dataset = dataset.map(
    lambda samples: tokenizer(samples["messages"]), batched=True
)
# tokenized_dataset = dataset.map(process_dataset, batched=True)

MemoryMappedTable
messages: list<item: struct<role: string, content: string>>
  child 0, item: struct<role: string, content: string>
      child 0, role: string
      child 1, content: string
----
messages: [[    -- is_valid: all not null
    -- child 0 type: string
["user","assistant"]
    -- child 1 type: string
["Category: positivity
Text: the weather is good tonight, but im too tired.","{
  "predicted_class": "low",
  "class_to_words": {
    "high": ["good"],
    "medium": ["tonight"],
    "low": ["tired", "but"]
  },
  "class_to_probabilities": {
    "high": 0.25,
    "medium": 0.25,
    "low": 0.5
  }
}"],    -- is_valid: all not null
    -- child 0 type: string
["user","assistant"]
    -- child 1 type: string
["Category: negativity
Text: I failed the exam and feel terrible.","{
  "predicted_class": "high",
  "class_to_words": {
    "high": ["failed", "terrible"],
    "medium": ["exam"],
    "low": ["feel"]
  },
  "class_to_probabilities": {
    "high": 0.6,
    "medium": 0.3,
  

Map:   0%|          | 0/60 [00:00<?, ? examples/s]


ValueError: text input must be of type `str` (single example), `List[str]` (batch or single pretokenized example) or `List[List[str]]` (batch of pretokenized examples).

In [7]:
# Пример: вывод первых 3 элементов
for example in tokenized_dataset[:3]["messages"]:
    print(json.dumps(example, indent=2, ensure_ascii=False))

[
  {
    "role": "user",
    "content": "Category: positivity\nText: the weather is good tonight, but im too tired."
  },
  {
    "role": "assistant",
    "content": "{\n  \"predicted_class\": \"low\",\n  \"class_to_words\": {\n    \"high\": [\"good\"],\n    \"medium\": [\"tonight\"],\n    \"low\": [\"tired\", \"but\"]\n  },\n  \"class_to_probabilities\": {\n    \"high\": 0.25,\n    \"medium\": 0.25,\n    \"low\": 0.5\n  }\n}"
  }
]
[
  {
    "role": "user",
    "content": "Category: negativity\nText: I failed the exam and feel terrible."
  },
  {
    "role": "assistant",
    "content": "{\n  \"predicted_class\": \"high\",\n  \"class_to_words\": {\n    \"high\": [\"failed\", \"terrible\"],\n    \"medium\": [\"exam\"],\n    \"low\": [\"feel\"]\n  },\n  \"class_to_probabilities\": {\n    \"high\": 0.6,\n    \"medium\": 0.3,\n    \"low\": 0.1\n  }\n}"
  }
]
[
  {
    "role": "user",
    "content": "Category: excitement\nText: I can't wait for the concert! It's going to be amazing!"
  },


In [8]:
import torch

torch.cuda.is_available()

torch.cuda.empty_cache()

In [None]:
# accelerator = Accelerator()
# device_name = "cuda" if cuda.is_available() else "cpu"
# device = accelerator.prepare(device_name)

NameError: name 'Accelerator' is not defined

In [25]:
tokenized_dataset.to_pandas().to(dveice)

AttributeError: 'DataFrame' object has no attribute 'to'

In [20]:
from transformers import DataCollatorForLanguageModeling, Trainer, TrainingArguments

# Настройки обучения
training_args = TrainingArguments(
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    warmup_steps=2,
    max_steps=10,
    learning_rate=2e-4,
    fp16=True,
    logging_steps=1,
    output_dir="outputs",
    optim="paged_adamw_8bit",  # позволяет снизить нагрузку на gpu память и ускорить работу
    report_to="none",
    # remove_unused_columns=False,
)

# Инициализация Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["text"],
    processing_class=tokenizer,
    # tokenizer=tokenizer,
    data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),
)

model.config.use_cache = False  # silence the warnings. Please re-enable for inference!

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [21]:
# Запуск обучения
trainer.train()

TypeError: device() received an invalid combination of arguments - got (NoneType), but expected one of:
 * (torch.device device)
      didn't match because some of the arguments have invalid types: (!NoneType!)
 * (str type, int index = -1)


In [16]:
# это работает
import os

import torch
from datasets import Dataset
from dotenv import load_dotenv
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)

load_dotenv()
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

model = AutoModelForCausalLM.from_pretrained(
    "gpt2",
    quantization_config=bnb_config,
    device_map="auto",
    token=os.getenv("HF_TOKEN"),
)

model = prepare_model_for_kbit_training(model)

tokenizer = AutoTokenizer.from_pretrained("gpt2")
# for name, module in model.named_modules():
#     print(name)
config = LoraConfig(
    r=8,
    lora_alpha=16,
    # "gate_up_proj", "down_proj"
    target_modules=["c_proj", "c_attn"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)
# device = "cuda" if torch.cuda.is_available() else "cpu"
# model.unload()
model = get_peft_model(model, config)
# print_trainable_parameters(model)
tokenizer.pad_token = tokenizer.eos_token

texts = ["Hello world", "How are you?"]
inputs = tokenizer(texts, return_tensors="pt", padding=True, truncation=True)
dataset = Dataset.from_dict({"input_ids": inputs["input_ids"]})

# training_args = TrainingArguments(
#     output_dir="./outputs",
#     per_device_train_batch_size=1,
#     logging_steps=1,
#     max_steps=5,
#     warmup_steps=2,
#     learning_rate=2e-4,
#     fp16=True,
# )
training_args = TrainingArguments(
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    warmup_steps=2,
    max_steps=10,
    learning_rate=2e-4,
    fp16=True,
    logging_steps=1,
    output_dir="outputs",
    optim="paged_adamw_8bit",  # позволяет снизить нагрузку на gpu память и ускорить работу
    report_to="none",
    # remove_unused_columns=False,
)

trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False),
)

trainer.train()


NameError: name 'tokenized_dataset' is not defined