In [None]:
# Pre-download the SFT dataset (deepmind/code_contests) into the local Hugging Face cache.

from huggingface_hub import snapshot_download

# repo_type="dataset" targets the dataset repository; the returned local
# snapshot path is the cell's displayed value.
snapshot_download(repo_id="deepmind/code_contests", repo_type="dataset")


Fetching 44 files: 100%|██████████| 44/44 [01:20<00:00,  1.84s/it]


'/home/bart/.cache/huggingface/hub/datasets--deepmind--code_contests/snapshots/802411c3010cb00d1b05bad57ca77365a3c699d6'

In [5]:
from transformers import AutoTokenizer
from datasets import load_dataset

# Load the code-contests dataset and the Qwen tokenizer; the tokenizer's chat
# template is what preprocessing_sft uses to render each training string.
dataset = load_dataset("deepmind/code_contests")
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-3b-Instruct")

def is_python(example):
    """Return True when at least one solution of `example` has language code 3.

    NOTE(review): code 3 is presumably the dataset's Python 3 label — confirm
    against the deepmind/code_contests dataset card.
    """
    language_codes = example["solutions"]["language"]
    return any(code == 3 for code in language_codes)

# Keep only the training-split examples accepted by is_python.
python_dataset = dataset["train"].filter(is_python)

def preprocessing_sft(example):
    """Render one example as a single chat-formatted SFT training string.

    The first solution whose language code is 3 becomes the assistant reply,
    and the problem description becomes the user prompt. The lookup uses
    `.index(3)`, so an example without such a solution raises instead of
    being skipped — callers are expected to filter first.

    Returns:
        dict with one key, "text", holding the conversation rendered through
        the module-level tokenizer's chat template (not tokenized).
    """
    solutions = example["solutions"]
    first_py = solutions["language"].index(3)
    reply_code = solutions["solution"][first_py]

    system_turn = {"role": "system", "content": "You are a competitive programming expert."}
    user_turn = {"role": "user", "content": f"Solve this: {example['description']}"}
    assistant_turn = {"role": "assistant", "content": reply_code}

    rendered = tokenizer.apply_chat_template(
        [system_turn, user_turn, assistant_turn],
        tokenize=False,
    )
    return {"text": rendered}

# Add the rendered "text" field to every filtered example (existing columns are kept).
final_dataset = python_dataset.map(preprocessing_sft)

In [6]:
# Dataset check: confirm the mapped examples now carry the "text" column.
# Uncomment the next line to inspect one fully rendered training sample.
#print(final_dataset[0]['text'])
print(final_dataset[0].keys())

dict_keys(['name', 'description', 'public_tests', 'private_tests', 'generated_tests', 'source', 'difficulty', 'solutions', 'incorrect_solutions', 'cf_contest_id', 'cf_index', 'cf_points', 'cf_rating', 'cf_tags', 'is_description_translated', 'untranslated_description', 'time_limit', 'memory_limit_bytes', 'input_file', 'output_file', 'text'])


In [None]:
# QLoRA supervised fine-tuning of Qwen2.5-3B-Instruct on the rendered "text" field.
# TODO: tests for this training run are still pending.
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import LoraConfig
from trl import SFTTrainer, SFTConfig

# 4-bit NF4 quantization (with double quantization) for the frozen base weights.
# The compute dtype must agree with the trainer's mixed-precision mode: training
# runs with bf16=True below, so the dequantized matmuls use bfloat16 as well.
# The previous "float16" setting forced a cast to fp16 inside every quantized
# layer, mixing two half-precision formats in one forward pass.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype="bfloat16",
    bnb_4bit_use_double_quant=True,
)

# Load the base model already quantized; device_map="auto" lets the library
# decide where to place the weights.
model = AutoModelForCausalLM.from_pretrained(
    "Qwen/Qwen2.5-3b-Instruct",
    quantization_config=bnb_config,
    device_map="auto"
)

# LoRA adapters on the attention query/value projections only.
peft_config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

# Trainer settings: batch size 1 with no accumulation, sequences capped at 512
# tokens, gradient checkpointing enabled, loss logged to Weights & Biases
# every 10 steps.
training_config = SFTConfig(
    output_dir="./qwen_sft_results",
    report_to="wandb",
    logging_steps=10,
    per_device_train_batch_size=1,
    learning_rate=2e-5,
    dataset_text_field="text",
    gradient_accumulation_steps=1,
    max_length=512,
    gradient_checkpointing=True,
    bf16=True,
    fp16=False
)

# `final_dataset` and `tokenizer` come from the preprocessing cell earlier in
# the notebook; that cell must have been run first.
trainer = SFTTrainer(
    model=model,
    train_dataset=final_dataset,
    args=training_config,
    peft_config=peft_config,
    processing_class=tokenizer
)

trainer.train()
trainer.save_model("./final_qwen_model")

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.13it/s]
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None, 'pad_token_id': 151643}.


Step,Training Loss
10,1.492
20,1.6236
30,1.365
40,1.4806
50,1.3153
60,1.155
70,1.2671
80,1.4022
90,1.2355
100,1.0659
