In [None]:
! pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu124
! pip install transformers
! pip install unsloth
! pip install trl==0.14.0

! pip install --upgrade unsloth

In [1]:
# unsloth is imported ahead of transformers/trl; it prints a banner saying it
# patches the environment on import.
# NOTE(review): presumably this ordering is required for the patches to apply — confirm before reordering.
from unsloth import FastLanguageModel
from datasets import disable_caching
# Switch off the `datasets` cache for this session.
disable_caching()

import torch
# Release cached, currently unused GPU memory before the model is loaded.
torch.cuda.empty_cache()
from datasets import load_dataset
from transformers import TrainingArguments,AutoTokenizer,AutoModelForCausalLM, Trainer
from trl import SFTTrainer, DataCollatorForCompletionOnlyLM

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


  from .autonotebook import tqdm as notebook_tqdm


🦥 Unsloth Zoo will now patch everything to make training faster!


In [2]:
# Run configuration: everything a reader might want to tune lives here.
MODEL_NAME = "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit"  # bnb-4bit base checkpoint
DATASET_NAME = "AlgorithmicResearchGroup/ArXivDLInstruct"
SEED = 42  # shared by the train/test split, the subset shuffle and the LoRA init
MAX_SEQ_LENGTH = 8192  # context length handed to Unsloth when loading the model
SUBSET_SIZE = 10000 # 1000, 1500, 2000, 5000, 10K

# No tokenizer is loaded here: FastLanguageModel.from_pretrained returns the
# tokenizer paired with the model, and the separate AutoTokenizer load that
# used to sit in this cell was overwritten by it before ever being used.

In [3]:
# Load the full training split of the instruction dataset.
dataset = load_dataset(DATASET_NAME, split='train')

# Apply both size limits in a single pass over the data instead of two
# consecutive filters; the predicate (and therefore the kept rows) is the same.
filtered_dataset = dataset.filter(
    lambda example: len(example["function"]) <= 1000
    and example['file_length'] <= 5000  ### remove long tail in file lengths
)

# Hold out 20% of the filtered rows; the split is seeded for reproducibility.
train_test_split = filtered_dataset.train_test_split(test_size=0.2, seed=SEED)
train_data, test_data = train_test_split["train"], train_test_split["test"]

# Keep a fixed-size, seeded random subset of the training rows.
train_data = train_data.shuffle(seed=SEED).select(range(SUBSET_SIZE))

Filter: 100%|██████████| 778152/778152 [00:19<00:00, 39890.00 examples/s]


In [4]:
# Load the base model and its matching tokenizer through Unsloth.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=MODEL_NAME,
    load_in_4bit=True,                     # load the weights 4-bit quantized
    dtype=None,                            # NOTE(review): presumably lets Unsloth pick the dtype — confirm
    max_seq_length=MAX_SEQ_LENGTH,
    use_gradient_checkpointing="unsloth",  # Unsloth's own checkpointing mode
)

  GPU_BUFFERS = tuple([torch.empty(2*256*2048, dtype = dtype, device = f"cuda:{i}") for i in range(n_gpus)])


==((====))==  Unsloth 2025.3.18: Fast Llama patching. Transformers: 4.50.0.
   \\   /|    NVIDIA GeForce RTX 3080. Num GPUs = 1. Max memory: 10.0 GB. Platform: Windows.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.6. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


In [5]:
# Wrap the base model with LoRA adapters on the attention and MLP projections.
model = FastLanguageModel.get_peft_model(
    model,
    r=16,
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",  # attention projections
        "gate_proj", "up_proj", "down_proj",     # MLP projections
    ],
    lora_alpha=16,
    lora_dropout=0,
    bias="none",
    # Use the same "unsloth" gradient-checkpointing mode that was requested
    # when the model was loaded, so both calls agree.
    use_gradient_checkpointing="unsloth",
    random_state=SEED,
    max_seq_length=MAX_SEQ_LENGTH,
)

Unsloth 2025.3.18 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


In [None]:
# End-of-sequence marker; format_prompt appends it to every rendered example.
EOS_TOKEN = tokenizer.eos_token
def format_prompt(example):
    """Render one dataset row as a single instruction-tuning string.

    Reads the ``prompt``, ``full_code`` and ``function`` fields of ``example``.
    The input section is ``full_code`` with every occurrence of ``function``
    removed; the response section is ``function`` itself, followed by the
    module-level ``EOS_TOKEN``.

    Returns:
        dict: ``{"text": <rendered prompt>}``.
    """
    instruction = example['prompt']
    full_code = example['full_code']
    target_function = example['function']

    # Strip the target function out of the surrounding code so the expected
    # answer does not also appear in the model's input.
    surrounding_code = full_code.replace(target_function, '')

    sections = (
        "Below is an instruction that describes a programming task, paired with an input that contains existing code. Write a response that appropriately completes the request.",
        f"### Instruction: {instruction}",
        f"### Input: {surrounding_code}",
        f"### Response: {target_function}",
    )
    return {"text": "\n\n".join(sections) + EOS_TOKEN}

# Render every training row with format_prompt, which adds a "text" column.
formatted_train_data = train_data.map(format_prompt, num_proc=None, keep_in_memory=False)

# Tokenize the dataset
def tokenize_function(examples, max_length=2048):
    """Tokenize a batch of rendered prompts, truncating but not padding.

    Padding every example to ``max_length`` here would make each training
    step process a full-length sequence regardless of the real prompt size.
    Padding is instead left to the data collator, which pads per batch.

    Args:
        examples: Batch dict with a ``text`` list, as passed by
            ``Dataset.map(..., batched=True)``.
        max_length: Token limit beyond which an example is truncated.

    Returns:
        Whatever the module-level ``tokenizer`` returns for the batch.
    """
    # NOTE(review): truncation presumably cuts from the end, so very long
    # inputs may lose their "### Response:" section — confirm and filter if so.
    return tokenizer(examples['text'], truncation=True, max_length=max_length)

# Run the tokenizer over the rendered prompts, one batch at a time.
formatted_train_data = formatted_train_data.map(tokenize_function, batched=True)

# Drop the rendered text and the dataset's metadata columns now that the
# examples have been tokenized.
columns_to_drop = [
    'text',
    'full_code',
    'function_name',
    'description',
    'file',
    'extension_type',
    'function_summary',
    'file_number',
    'repo',
    'file_length',
    'avg_line_length',
    'max_line_length',
]
formatted_train_data = formatted_train_data.remove_columns(columns_to_drop)

# Completion-only collator keyed on the marker that format_prompt places
# in front of the target function.
response_template = "### Response:"
collator = DataCollatorForCompletionOnlyLM(
    response_template=response_template,
    tokenizer=tokenizer,
)

Map: 100%|██████████| 10000/10000 [00:01<00:00, 6593.60 examples/s]
Map: 100%|██████████| 10000/10000 [00:03<00:00, 3304.13 examples/s]


In [7]:
# Single identifier used both as the logged run name and as the save directory.
run_name = f"llama3_finetune_{SUBSET_SIZE}"

# Prefer bf16 when the GPU supports it, otherwise fall back to fp16.
use_bf16 = torch.cuda.is_bf16_supported()

training_args = TrainingArguments(
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,  # effective batch size of 4
    warmup_steps=5,
    max_steps=-1,                   # no step cap; length is set by num_train_epochs
    num_train_epochs=3,
    dataloader_num_workers=0,
    fp16=not use_bf16,
    bf16=use_bf16,
    logging_steps=1,
    output_dir="outputs",
    run_name=run_name,
    optim="adamw_8bit",
    # NOTE(review): differs from the notebook-wide SEED; kept as-is so
    # existing runs stay reproducible.
    seed=3407,
    lr_scheduler_type="linear",
    weight_decay=0.01,
)

trainer = Trainer(
    model=model,
    train_dataset=formatted_train_data,
    # `tokenizer=` is deprecated on Trainer; `processing_class=` is its replacement.
    processing_class=tokenizer,
    data_collator=collator,
    args=training_args,
)

trainer.train()
trainer.save_model(run_name)

  trainer = Trainer(
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 10,000 | Num Epochs = 3 | Total steps = 7,500
O^O/ \_/ \    Batch size per device = 1 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (1 x 4 x 1) = 4
 "-____-"     Trainable parameters = 41,943,040/8,000,000,000 (0.52% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
1,1.1211
2,0.9304
3,1.6765
4,1.2075
5,0.9392
6,1.499
7,0.7292
8,0.7756
9,1.5668
10,1.1113




In [None]:
import os
from getpass import getpass

from huggingface_hub import login

# Never store the access token in the notebook: take it from the HF_TOKEN
# environment variable, or prompt for it without echoing.
api_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face token: ")
login(token=api_token)

# Publish the model and the tokenizer to the same Hub repository.
hub_repo_id = f"moosejuice13/llama3_finetune_{SUBSET_SIZE}"
model.push_to_hub(hub_repo_id)
tokenizer.push_to_hub(hub_repo_id)


100%|██████████| 1/1 [00:05<00:00,  5.18s/it]


Saved model to https://huggingface.co/moosejuice13/llama3_finetune_10000


100%|██████████| 1/1 [00:02<00:00,  2.04s/it]
