In [None]:
! pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu124
! pip install transformers
! pip install unsloth
! pip install trl==0.14.0

! pip install --upgrade unsloth

In [None]:
# NOTE(review): unsloth is imported ahead of transformers/trl; presumably this
# ordering is intentional so its patches apply first — confirm before reordering.
from unsloth import FastLanguageModel
from datasets import disable_caching
# Switch off the `datasets` cache for this session.
disable_caching()

import torch
# Ask PyTorch to release cached CUDA memory (relevant when re-running in a live kernel).
torch.cuda.empty_cache()
from datasets import load_dataset
from transformers import TrainingArguments,AutoTokenizer,AutoModelForCausalLM, Trainer
from trl import SFTTrainer, DataCollatorForCompletionOnlyLM

In [None]:
# --- Run configuration -------------------------------------------------------
# Checkpoint to fine-tune and the instruction dataset to fine-tune it on.
MODEL_NAME = "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit"
DATASET_NAME = "AlgorithmicResearchGroup/ArXivDLInstruct"

# Seed shared by the dataset split/shuffle and the LoRA initialisation.
SEED = 42

# Maximum sequence length handed to the model loader and the LoRA wrapper.
MAX_SEQ_LENGTH = 2048

# Number of training rows to keep; also embedded in the run/output names.
SUBSET_SIZE = 1000  # 1000, 1500, 2000, 5000, 10K

# No tokenizer is built here: `FastLanguageModel.from_pretrained` returns the
# tokenizer together with the model, and that binding is the one used for
# formatting, tokenisation, training and upload. A separate
# `AutoTokenizer.from_pretrained` call at this point would be loaded and then
# overwritten without ever being used.

In [None]:
# Fetch the "train" split of the instruction dataset.
dataset = load_dataset(DATASET_NAME, split="train")

# Discard rows whose target function is longer than 1000 characters.
filtered_dataset = dataset.filter(
    lambda row: len(row["function"]) <= 1000
)

# Seeded 80/20 partition of the filtered rows.
train_test_split = filtered_dataset.train_test_split(test_size=0.2, seed=SEED)
train_data = train_test_split["train"]
test_data = train_test_split["test"]

# Reshuffle the training partition with the same seed, then keep only the
# first SUBSET_SIZE rows for this run.
train_data = train_data.shuffle(seed=SEED)
train_data = train_data.select(range(SUBSET_SIZE))

In [None]:
# Options for loading the 4-bit base checkpoint; the loader returns the model
# together with its tokenizer.
loader_options = dict(
    model_name=MODEL_NAME,
    max_seq_length=MAX_SEQ_LENGTH,
    dtype=None,  # NOTE(review): None presumably means "auto-select dtype" — confirm in the Unsloth docs
    load_in_4bit=True,
    use_gradient_checkpointing="unsloth",
)
model, tokenizer = FastLanguageModel.from_pretrained(**loader_options)

In [None]:
# Wrap the base model with LoRA adapters on every attention and MLP projection.
model = FastLanguageModel.get_peft_model(
    model,
    r=16,  # LoRA rank
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    lora_alpha=16,
    lora_dropout=0,
    bias="none",
    # Use the same checkpointing mode that was requested when the base model
    # was loaded ("unsloth"); passing True here would select a different mode
    # than the one asked for at load time.
    use_gradient_checkpointing="unsloth",
    random_state=SEED,
    max_seq_length=MAX_SEQ_LENGTH,
)

In [None]:
# End-of-sequence marker appended to every training example so the model
# learns where a response stops.
EOS_TOKEN = tokenizer.eos_token


def format_prompt(example):
    """Render one dataset row as a single instruction/response training string.

    Args:
        example: A row with at least the keys ``"prompt"`` (the instruction)
            and ``"function"`` (the target response).

    Returns:
        A dict ``{"text": ...}`` holding the full prompt followed by EOS_TOKEN.
    """
    prompt = f"Below is an instruction that describes a task. Write a response that appropriately completes the request.\n### Instruction: {example['prompt']}\n### Response: {example['function']}" + EOS_TOKEN
    return {"text": prompt}


formatted_train_data = train_data.map(format_prompt, num_proc=None, keep_in_memory=False)


def tokenize_function(examples):
    """Tokenize a batch of formatted prompts.

    Sequences are truncated to 1024 tokens but deliberately NOT padded here:
    the data collator pads each batch to the length of its longest member, so
    padding every example to 1024 up front would only add wasted compute.

    Args:
        examples: A batch (dict of lists) containing a ``"text"`` column.

    Returns:
        The tokenizer output for the batch (e.g. ``input_ids`` and
        ``attention_mask``).
    """
    # NOTE(review): 1024 is lower than MAX_SEQ_LENGTH (2048) used for the model;
    # confirm whether the tighter cap is intentional.
    return tokenizer(examples['text'], truncation=True, max_length=1024)


formatted_train_data = formatted_train_data.map(tokenize_function, batched=True)

# Drop the raw text and the metadata columns that are not needed for training.
formatted_train_data = formatted_train_data.remove_columns(['text', 'full_code', 'function_name', 'description', 'file', 'extension_type', 'function_summary', 'file_number', 'repo', 'file_length', 'avg_line_length', 'max_line_length'])

# Completion-only collator: the response template marks where the completion
# starts, so the loss is computed on the response tokens rather than the
# instruction part. It also pads each batch dynamically.
response_template = "### Response:"
collator = DataCollatorForCompletionOnlyLM(response_template, tokenizer=tokenizer)

In [None]:
# Build the trainer on the pre-tokenized dataset with the completion-only collator.
trainer = Trainer(
    model=model,
    train_dataset=formatted_train_data,
    # `processing_class` is the current name of the argument formerly called
    # `tokenizer`, which is deprecated on Trainer.
    processing_class=tokenizer,
    data_collator=collator,
    args=TrainingArguments(
        per_device_train_batch_size=1,
        gradient_accumulation_steps=4,  # effective batch size of 4 per device
        warmup_steps=5,
        max_steps=-1,  # no step cap; run length is governed by num_train_epochs
        num_train_epochs=3,
        dataloader_num_workers=0,
        # Use bf16 when the GPU supports it, otherwise fall back to fp16.
        fp16=not torch.cuda.is_bf16_supported(),
        bf16=torch.cuda.is_bf16_supported(),
        logging_steps=1,
        output_dir="outputs",
        run_name=f"llama3_finetune_{SUBSET_SIZE}",
        optim="adamw_8bit",
        # Reuse the notebook-wide seed so the data split, LoRA init and
        # training all share one reproducibility knob.
        seed=SEED,
        lr_scheduler_type="linear",
        weight_decay=0.01,
    ),
)

trainer.train()
# Persist the fine-tuned weights locally under a name that records the subset size.
trainer.save_model(f"llama3_finetune_{SUBSET_SIZE}")

In [None]:
from huggingface_hub import login
from dotenv import load_dotenv
import os

# Load variables from a local .env file so the Hub token is available
# through the process environment.
load_dotenv()

# Authenticate against the Hugging Face Hub with the token from the environment.
login(token=os.getenv('HUGGINGFACE_TOKEN'))

# The model and its tokenizer are uploaded to the same repository.
hub_repo_id = f"moosejuice13/llama3_finetune_{SUBSET_SIZE}"
for artifact in (model, tokenizer):
    artifact.push_to_hub(hub_repo_id)
