In [None]:
#@title Скачиваем и устанавливаем зависимости

from pathlib import Path

!git clone https://github.com/IlyaGusev/rulm.git
!git clone https://github.com/ggerganov/llama.cpp.git

self_instruct_dir = Path('rulm/self_instruct').resolve()

!cd {self_instruct_dir} && pip install -r ../requirements.txt

In [None]:
#@title Логинимся в WandB

# Log in to Weights & Biases.
# NOTE(review): presumably required because the training script reports to
# W&B — confirm against rulm's src.train.
import wandb
wandb.login()

In [None]:
#@title Генерируем обучающую и валидационную выборку, обрезая её для Colab

content_dir = Path('.').resolve()
!cd {self_instruct_dir} && python -m src.data_processing.create_short_chat_set \
    {content_dir / 'train_full.jsonl'} \
    {content_dir / 'val_full.jsonl'}

assert (content_dir / 'train_full.jsonl').exists()

train_size_limit = 400 #@param {type:"integer"}
val_size_limit = 200 #@param {type:"integer"}

!head -n {train_size_limit} {content_dir / 'train_full.jsonl'} > {content_dir / 'train.jsonl'}
!head -n {val_size_limit} {content_dir / 'val_full.jsonl'} > {content_dir / 'val.jsonl'}

In [None]:
#@title Скачиваем базовую модель LLaMA2 7B и чиним её конфиг

import json
from huggingface_hub import snapshot_download

# Download the base model snapshot into a local directory under content_dir.
model_dir = content_dir / "llama2-7b"
base_model = "TheBloke/Llama-2-7B-fp16" #@param {type:"string"}
snapshot_download(repo_id=base_model, local_dir=model_dir, ignore_patterns=["LICENSE", "README.md", ".gitattributes"])

patch_model_config = True #@param {type:"boolean"}

if patch_model_config:
    # Full replacement contents for the tokenizer-related JSON files;
    # each file is overwritten as a whole, not merged.
    replacements = {
        "tokenizer_config.json": {
            "tokenizer_class": "LlamaTokenizer",
            "model_max_length": 4096,
            "padding_side": "left",
            "bos_token": "<s>",
            "eos_token": "</s>",
            "unk_token": "<unk>",
            "clean_up_tokenization_spaces": False,
            "special_tokens_map_file": "special_tokens_map.json",
        },
        "special_tokens_map.json": {
            "bos_token": "<s>",
            "eos_token": "</s>",
            "pad_token": "<unk>",
            "sep_token": "<s>",
            "unk_token": "<unk>",
        }
    }

    print('Patching model config...')
    for filename, new_content in replacements.items():
        print(f'{filename}:')
        # Read the current content and close the file before any rewrite.
        with (model_dir / filename).open() as fp:
            old_content = json.load(fp)
        print(f'    Original content: {old_content}')
        if old_content == new_content:
            print('    Already patched, skipping')
            # Actually skip: previously the file was rewritten anyway.
            continue
        print(f'    Updated content:  {new_content}')
        with (model_dir / filename).open('w') as fp:
            json.dump(new_content, fp, indent=4)

In [None]:
#@title Уменьшаем размер батча и лимит токенов, чтобы поместиться в Colab, и длительность обучения для демки

# Load the stock training config shipped with rulm.
original_config_path = self_instruct_dir / 'configs/saiga2_7b.json'
config = json.loads(original_config_path.read_text())

# Overrides that reduce the batch size and token limit for Colab,
# and point the config at the locally downloaded model.
config['trainer']['per_device_train_batch_size'] = 2 #@param {type:"integer"}
config['trainer']['gradient_accumulation_steps'] = 64 #@param {type:"integer"}
config['max_tokens_count'] = 1024 #@param {type:"integer"}
config['model_name'] = str(model_dir)

# Overrides that shorten the run for demonstration purposes.
config['trainer']['eval_steps'] = 2 #@param {type:"integer"}
config['trainer']['logging_steps'] = 1 #@param {type:"integer"}
config['trainer']['num_train_epochs'] = 1 #@param {type:"integer"}

# Save the adjusted config next to the original under a separate name.
config_path = self_instruct_dir / 'configs/saiga2_7b_colab.json'
config_path.write_text(json.dumps(config, indent=4))

In [None]:
#@title Запускаем обучение!

output_dir = content_dir / 'output'

!cd {self_instruct_dir} && python3 -m src.train \
    --config-file {config_path.relative_to(self_instruct_dir)} \
    --train-file {content_dir / 'train.jsonl'} \
    --val-file {content_dir / 'val.jsonl'} \
    --output-dir {output_dir}

assert (output_dir / 'adapter_config.json').exists()

In [None]:
#@title Исправляем конфиг инференса обученной модели

# Generation settings written into the trained model's output directory,
# replacing any generation_config.json already there.
generation_settings = {
    "pad_token_id": 0,
    "bos_token_id": 1,
    "eos_token_id": 2,
    "temperature": 0.2,
    "top_p": 0.9,
    "top_k": 40,
    "do_sample": True,
    "max_new_tokens": 1536,
    "repetition_penalty": 1.1,
    "no_repeat_ngram_size": 15,
}

generation_config_path = output_dir / 'generation_config.json'
generation_config_path.write_text(json.dumps(generation_settings, indent=4))

In [None]:
#@title Склеиваем вместе обученные адаптеры с базовой моделью, сохраняем результат в формат PyTorch

merged_model_name = 'merged.pt'

!cd {self_instruct_dir} && python -m src.convert_to_native \
    {output_dir} \
    {output_dir / merged_model_name} \
    --device=cuda \
    --enable_offloading

assert (output_dir / merged_model_name).exists()

In [None]:
#@title Конвертируем склеенную модель в 16-битный формат GGUF (llama.cpp)

ggml_f16_model_name = 'model-f16.gguf'

!cd llama.cpp && python convert.py \
    {output_dir / merged_model_name} \
    --vocab-dir {output_dir} \
    --outfile {output_dir / ggml_f16_model_name} \
    --outtype f16

assert (output_dir / ggml_f16_model_name).exists()

In [None]:
#@title Квантизуем результат в 4 бита

quantization_type = "q4_0" #@param ["q4_0", "q4_1"] {allow-input: true}
ggml_quantized_model_name = f'model-{quantization_type}.gguf'

!cd llama.cpp && make quantize && ./quantize \
    {output_dir / ggml_f16_model_name} \
    {output_dir / ggml_quantized_model_name} \
    {quantization_type}

assert (output_dir / ggml_quantized_model_name).exists()

In [None]:
#@title Прогоняем полученную модель на нескольких диалогах из валидационной выборки

num_test_samples = 3 #@param {type:"integer"}
max_new_tokens = 20 #@param {type:"integer"}

!head -n {num_test_samples} {content_dir / 'val.jsonl'} > {content_dir / 'test.jsonl'}

!cd {self_instruct_dir} && python -m src.infer_saiga_llamacpp \
    --model_name={output_dir / ggml_quantized_model_name} \
    --input_path={content_dir / 'test.jsonl'} \
    --output_path={content_dir / 'test_result.jsonl'} \
    --max_new_tokens={max_new_tokens}

assert (content_dir / 'test_result.jsonl').exists()