In [None]:
!pip install -q transformers datasets peft accelerate bitsandbytes trl safetensors ipywidgets huggingface_hub scipy -U

In [None]:
import datasets
import psycopg2
from huggingface_hub import login
from constants import *
from transformers import AutoTokenizer, AutoModelForCausalLM

# Authenticate against the Hugging Face Hub without embedding a secret in the
# notebook. The token is read from the HF_TOKEN environment variable; when the
# variable is unset, `token=None` makes login() fall back to its interactive prompt.
# NOTE: the access token that used to be hard-coded on this line has been exposed
# to everyone with access to this file/its history and should be revoked.
import os

login(token=os.environ.get("HF_TOKEN"))

def load_model(model_name: str = "google/gemma-2b") -> tuple[AutoModelForCausalLM, AutoTokenizer]:
    """Load a causal language model and its matching tokenizer.

    Args:
        model_name: Hub id or local path handed to ``from_pretrained`` for both
            the model and the tokenizer. Defaults to ``"google/gemma-2b"``, the
            checkpoint this function previously hard-coded.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    model = AutoModelForCausalLM.from_pretrained(model_name)
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    return model, tokenizer

In [None]:
from datasets import load_dataset
from random import randrange

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, TrainingArguments
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model, AutoPeftModelForCausalLM

from trl import SFTTrainer

from constants import *

# Hyperparameters and run settings for the fine-tuning job. Everything below is
# a plain module-level value consumed by the config objects built further down.

# Hub id of the base model to fine-tune
# NOTE(review): not referenced by any other cell visible in this notebook.
model_id = "google/gemma-2b"
# The instruction dataset to use (constant provided by `constants`)
# NOTE(review): not referenced by any other cell visible in this notebook.
dataset_name = EDITORIAL_DATASET
# Dataset split
# NOTE(review): not referenced by any other cell visible in this notebook.
dataset_split = "train"
# Fine-tuned model name (also reused as the training output directory below)
new_model = "gemma-2b-editorial"
# Hugging Face repository id (constant provided by `constants`)
# NOTE(review): not referenced by any other cell visible in this notebook.
hf_model_repo = GENERATOR_MODEL_ID
# Device placement setting for model loading. The value is "auto", i.e. not an
# explicit "everything on GPU 0" mapping.
# NOTE(review): not referenced by any other cell visible in this notebook.
device_map = "auto"

################################################################################
# bitsandbytes parameters
################################################################################
# Activate 4-bit precision base model loading
use_4bit = True
# Compute dtype for 4-bit base models (name of a torch dtype attribute)
bnb_4bit_compute_dtype = "float16"
# Quantization type (fp4 or nf4)
bnb_4bit_quant_type = "nf4"
# Activate nested quantization for 4-bit base models (double quantization)
use_double_nested_quant = False

################################################################################
# QLoRA parameters
################################################################################
# LoRA attention dimension (rank)
lora_r = 16
# Alpha parameter for LoRA scaling
lora_alpha = 16
# Dropout probability for LoRA layers
lora_dropout = 0.05

################################################################################
# TrainingArguments parameters
################################################################################
# Output directory where the model predictions and checkpoints will be stored
output_dir = new_model
# Number of training epochs
num_train_epochs = 2
# Enable fp16/bf16 training (set bf16 to True with an A100)
fp16 = True
bf16 = False
# Target effective batch size per optimizer update (NOT the per-GPU batch size)
batch_size = 128
# Batch size per GPU for training
per_device_train_batch_size = 1
# Number of update steps to accumulate the gradients for, derived so that
# per_device_train_batch_size * gradient_accumulation_steps == batch_size
gradient_accumulation_steps = batch_size // per_device_train_batch_size
# Enable gradient checkpointing
gradient_checkpointing = False
# Maximum gradient norm (gradient clipping)
max_grad_norm = 0.3
# Initial learning rate (AdamW optimizer)
learning_rate = 3e-4 # alt: 1e-5
# Weight decay to apply to all layers except bias/LayerNorm weights
weight_decay = 0.001
# Optimizer to use
optim = "paged_adamw_32bit"
# Learning rate schedule
# NOTE(review): a plain "constant" schedule presumably ignores warmup_ratio
# below; "constant_with_warmup" may be what was intended — confirm.
lr_scheduler_type = "constant"
# Number of training steps (overrides num_train_epochs)
# NOTE(review): not passed to TrainingArguments in this notebook.
max_steps = -1
# Ratio of steps for a linear warmup (from 0 to learning rate)
warmup_ratio = 0.03
# Group sequences into batches with same length
# (enabling it saves memory and speeds up training considerably; disabled here)
group_by_length = False
# Save checkpoint every X updates steps
# NOTE(review): not passed to TrainingArguments in this notebook.
save_steps = 0
# Log every X updates steps
logging_steps = 25
# Disable tqdm
disable_tqdm = True

################################################################################
# SFTTrainer parameters
################################################################################
# Maximum sequence length to use
max_seq_length = 512 # alt: None
# Pack multiple short examples in the same input sequence to increase efficiency
packing = True # alt: False

# Turn the dtype name configured above (e.g. "float16") into the torch dtype object.
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

# bitsandbytes quantization settings assembled from the values configured above.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=use_double_nested_quant,
)

# LoRA adapter configuration for causal-LM fine-tuning; adapters target the
# q/k/v/o projection modules and no bias terms are trained.
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=lora_r,
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    bias="none",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
)

# Build the TrainingArguments from the hyperparameters configured above.
# save_steps and max_steps are not forwarded here: checkpoints are written per
# epoch (save_strategy) and the run length is given by num_train_epochs.
args = TrainingArguments(
    # --- run / bookkeeping ---
    output_dir=output_dir,
    seed=42,
    report_to="tensorboard",
    logging_steps=logging_steps,
    save_strategy="epoch",
    disable_tqdm=disable_tqdm,
    # --- schedule ---
    num_train_epochs=num_train_epochs,
    learning_rate=learning_rate,
    lr_scheduler_type=lr_scheduler_type,
    warmup_ratio=warmup_ratio,
    # --- batching ---
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    group_by_length=group_by_length,
    # --- optimization ---
    optim=optim,
    weight_decay=weight_decay,
    max_grad_norm=max_grad_norm,
    # --- precision / memory ---
    fp16=fp16,
    bf16=bf16,
    gradient_checkpointing=gradient_checkpointing,
)

In [None]:
import datasets
import gc

from inference import load_model
from settings import *
from constants import *

def format_instruction(sample):
    """Render one dataset row as a Gemma chat-formatted training example.

    Args:
        sample: Mapping with the keys ``'name'``, ``'description'``, ``'tags'``
            and ``'editorial'``. The editorial text is expected to hold the
            written solution, then the marker ``TUTORIAL CODE XXX``, then the
            reference code.

    Returns:
        A single string containing the user turn (task instructions plus the
        problem description) followed by the model turn (editorial and code
        wrapped in their tags).

    Raises:
        KeyError: If one of the required keys is missing from ``sample``.
    """
    code_marker = 'TUTORIAL CODE XXX'

    name = sample['name']
    description = sample['description']
    tags = sample['tags']

    sections = sample['editorial'].split(code_marker)
    solution = sections[0].strip()
    # An editorial without the marker used to crash with IndexError; treat the
    # whole text as the solution and leave the code section empty instead.
    code = sections[1].strip() if len(sections) > 1 else ''
    return f"""<start_of_turn>user
You are a contestant in a programming contest. You are given the following complicated competitive programming problem:
The problem description is given between the <DESCRIPTION> and </DESCRIPTION> tags.

Here is your task:
- Reason about the problem, write the editorial for the problem, wrapped in the <EDITORIAL> and </EDITORIAL> tags.
- Write a solution in any programming language, preferably in Python/C++, wrapped in the <CODE> and </CODE> tags.

Your entire answer should be wrapped in the <ANSWER> and </ANSWER> tags. Answer in Markdown format.

<DESCRIPTION>
Name: {name}
Tags: {tags}
{description}
</DESCRIPTION><end_of_turn>
<start_of_turn>model
<ANSWER>
<EDITORIAL>
{solution}
</EDITORIAL>
<CODE>
{code}
</CODE>
</ANSWER><end_of_turn>"""

In [None]:
# Load the base model and its tokenizer.
model, tokenizer = load_model()

DATASET_ID = EDITORIAL_DATASET
SPLIT = 'train'
# NOTE(review): CACHE_DIR is defined but not used by any cell visible here.
CACHE_DIR = 'cache-editorial'

# Request a single split so that `dataset` is a Dataset. Without `split=`,
# load_dataset returns a DatasetDict, which cannot be handed to the trainer as
# `train_dataset` (iterating it yields split names, not rows).
dataset = datasets.load_dataset(DATASET_ID, split=SPLIT)

# Keep only the rows flagged as containing reference code.
dataset = dataset.filter(lambda x: x['has_code'])
assert len(dataset) > 0, "no rows left after filtering on 'has_code'"

# Sanity check: show the first formatted training example.
print(format_instruction(dataset[0]))

In [None]:
# Wire the model, data, LoRA config and training arguments into the SFT trainer.
# Each row is turned into text by format_instruction.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    args=args,
    peft_config=peft_config,
    train_dataset=dataset,
    formatting_func=format_instruction,
    max_seq_length=max_seq_length,
    packing=packing,
)

In [None]:
# Run a garbage-collection pass before starting the training run.
gc.collect()
# Start fine-tuning. As the last expression of the cell, the value returned by
# train() is what the notebook displays.
trainer.train()

In [None]:
# Run a garbage-collection pass before writing the model out.
gc.collect()
# Persist the trained model through the trainer. No path is passed here, so the
# destination is whatever the trainer defaults to — presumably the output_dir
# given in its TrainingArguments; confirm.
trainer.save_model()