In [1]:
import os
os.environ["HF_TOKEN"] = "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"


import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel
import warnings
warnings.filterwarnings('ignore')



In [2]:
# Пути и параметры
BASE_MODEL_NAME = "meta-llama/Llama-3.2-3B-Instruct"
CHECKPOINT_PATH = "./VectorSFT-checkpoints-upd/checkpoint-1700"
DEVICE = "cuda:0" if torch.cuda.is_available() else "cpu"

# Специальные токены из конфигурации
SPECIAL_TOKENS = ["<simple_talk>", "</simple_talk>"]

print(f"Используется устройство: {DEVICE}")


Используется устройство: cuda:0


In [3]:
# Загрузка и настройка токенайзера
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_NAME)

# Добавление специальных токенов
tokenizer.add_special_tokens({"additional_special_tokens": SPECIAL_TOKENS})

print(f"Размер словаря токенайзера: {len(tokenizer)}")
print(f"Специальные токены: {SPECIAL_TOKENS}")
print(f"ID специальных токенов: {[tokenizer.convert_tokens_to_ids(token) for token in SPECIAL_TOKENS]}")


Размер словаря токенайзера: 128258
Специальные токены: ['<simple_talk>', '</simple_talk>']
ID специальных токенов: [128256, 128257]


In [4]:
# Загрузка базовой модели
base_model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL_NAME,
    torch_dtype=torch.bfloat16,
    device_map=DEVICE,
    attn_implementation="flash_attention_2",
)

base_model.resize_token_embeddings(len(tokenizer))

print(f"Базовая модель загружена")
print(f"Размер словаря модели: {base_model.config.vocab_size}")


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


Базовая модель загружена
Размер словаря модели: 128258


In [5]:
# Загрузка модели с LoRA адаптерами
model = PeftModel.from_pretrained(base_model, CHECKPOINT_PATH)

# Переключение в режим инференса
model.eval()

print("LoRA адаптеры загружены успешно")
print(f"Модель в режиме: {'eval' if not model.training else 'train'}")


LoRA адаптеры загружены успешно
Модель в режиме: eval


In [28]:
model

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(128258, 3072)
        (layers): ModuleList(
          (0-27): 28 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): lora.Linear(
                (base_layer): Linear(in_features=3072, out_features=3072, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=3072, out_features=128, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=128, out_features=3072, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): lora.Line

In [9]:
from datasets import load_dataset

In [10]:
dataset = load_dataset("ExplosionNuclear/ExpNew7")

In [54]:
Companies benefit from access to a wider talent pool, as geographical limitations are removed, and can also experience reduced overhead costs associated with office space. A distributed workforce can also lead to enhanced productivity, as employees may find fewer distractions in their home environments. Furthermore, remote work fosters greater autonomy, empowering individuals to manage their schedules more effectively. Overall, it's a transformative approach to work that benefits many.

In [62]:
def generate_response(prompt, max_length=2000, temperature=0, top_p=0.9):
   
    
    inputs = tokenizer(prompt, return_tensors="pt")
    inputs = {k: v.to(DEVICE) for k, v in inputs.items()}
    
    # Генерация
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            top_p=top_p,
            do_sample=False,
            pad_token_id=tokenizer.eos_token_id
        )
    
    # Декодирование результата
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=False)
    
    # Удаление исходного промпта из результата
    response = generated_text[len(prompt):].strip()
    
    return response

print("Функция генерации готова")


Функция генерации готова


In [66]:

q = 14707
prompt = dataset['train'][q]['question']

In [67]:
response = generate_response(prompt, max_length=1200)
print(response)


. 

Your answer: 7<|eot_id|>


## dataset['train'][q]['final_answer']

In [None]:
# Пример с использованием специальных токенов
math_prompt = "<simple_talk>Объясните простыми словами, как решить квадратное уравнение.</simple_talk>"

print(f"Промпт с спец. токенами: {math_prompt}")
print("\nОтвет модели:")
response = generate_response(math_prompt, max_length=200)
print(response)


In [None]:
# Информация о конфигурации модели
print("=== Информация о модели ===")
print(f"Архитектура: {model.config.architectures}")
print(f"Размер скрытого слоя: {model.config.hidden_size}")
print(f"Количество слоев: {model.config.num_hidden_layers}")
print(f"Количество attention головок: {model.config.num_attention_heads}")
print(f"Размер словаря: {model.config.vocab_size}")
print(f"Максимальная длина позиции: {model.config.max_position_embeddings}")

# Информация о LoRA
print("\n=== Информация о LoRA ===")
peft_config = model.peft_config['default']
print(f"LoRA rank (r): {peft_config.r}")
print(f"LoRA alpha: {peft_config.lora_alpha}")
print(f"LoRA dropout: {peft_config.lora_dropout}")
print(f"Целевые модули: {peft_config.target_modules}")


In [6]:
from torch.utils.data import DataLoader
import os
from typing import List, Dict, Any
import torch

In [7]:
def data_calibration_collate(features: List[Dict[str, Any]]) -> Dict[str, Any | torch.Tensor]:
        
        """
        Collate function for the calibration dataset.
        Processes questions and answers. 
        """
        
        if not features:
            return {}

        full_texts = [feature["question"] + feature["answer"] for feature in features]
        batch = self.tokenizer(full_texts, padding=True, return_tensors="pt")
        labels = batch["input_ids"].clone()  # type: ignore
        
        questions = [feature["question"] for feature in features]
        tokenized_questions = self.tokenizer(questions)
        q_lengths = [len(row) for row in tokenized_questions['input_ids']]  # type: ignore

        for i in range(len(q_lengths)):
            labels[i, :q_lengths[i]] = -100
        
        labels[labels == self.tokenizer.pad_token_id] = -100
        
        batch["labels"] = labels
        batch_size = batch["input_ids"].shape[0]
        
        return {
            "input_ids": batch["input_ids"], 
            "labels": batch["labels"], 
            "attention_mask": batch["attention_mask"],
            "source_label": torch.ones(batch_size, dtype=torch.long)
         }

In [8]:
from datasets import load_dataset
data = load_dataset("ExplosionNuclear/new_simple_talks")

README.md:   0%|          | 0.00/336 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/8.81M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/17089 [00:00<?, ? examples/s]

In [15]:
data

DatasetDict({
    train: Dataset({
        features: ['question', 'answer'],
        num_rows: 17089
    })
})

In [12]:
loader_main = DataLoader(
            data['train'],
            batch_size = 16,
            shuffle = False,
            collate_fn = data_calibration_collate
        )

In [16]:
1068*16

17088

In [13]:
len(loader_main)

1069

In [17]:
for i, inputs in enumerate(loader_main):
    if i < 10:
        print(inputs)
    else:
        break

NameError: name 'self' is not defined