In [1]:
import torch
import gc

def free_gpu_memory():
    # Clear unused objects by forcing garbage collection
    gc.collect()

    # Empty the CUDA cache to release unused GPU memory
    torch.cuda.empty_cache()

    # Print current GPU memory usage for monitoring
    print(f"Current allocated GPU memory: {torch.cuda.memory_allocated() / 1024**2:.2f} MB")
    print(f"Current cached GPU memory: {torch.cuda.memory_reserved() / 1024**2:.2f} MB")

free_gpu_memory()

Current allocated GPU memory: 0.00 MB
Current cached GPU memory: 0.00 MB


In [None]:
!nvidia-smi

In [2]:
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments
from datasets import load_dataset

# Load the Qwen model and tokenizer
model_name = "Qwen/Qwen2.5-0.5B-Instruct"
model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load a Question Answering dataset
sft_dataset = load_dataset("kishore-s-15/curriculum_compass_sft_dataset")["train"]

# Preprocess the dataset
def preprocess_function(examples):
    
    def get_inputs(examples):
        inputs = []
        for question, context in zip(examples["question"], examples["context"]):
            inp = f"""
            Query:
            {question}

            Context:
            {context}
            """

            inputs.append(inp)

        return inputs
            
    inputs = get_inputs(examples)
    targets = examples["response"]
    model_inputs = tokenizer(inputs, max_length=4098*2, truncation=True, padding="max_length")

    # Tokenize the targets
    labels = tokenizer(targets, max_length=4098*2, truncation=True, padding="max_length")
    model_inputs["labels"] = labels["input_ids"]
    
    return model_inputs

# Tokenize the datasets
tokenized_datasets = sft_dataset.map(preprocess_function, batched=True, remove_columns=["question", "context", "response"])

# Define training arguments
training_args = TrainingArguments(
    output_dir="./qwen_qa_finetune",
    evaluation_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    num_train_epochs=3,
    weight_decay=0.01,
    # save_strategy="epoch",
    logging_dir="./logs",
    logging_steps=10,
    save_total_limit=0,
    fp16=True,  # Enable mixed precision training if using GPU
    push_to_hub=False
)

# Define the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets,
    eval_dataset=tokenized_datasets,
    tokenizer=tokenizer,
)

# Train the model
trainer.train()

# Save the fine-tuned model
trainer.save_model("./qwen_qa_finetune")
tokenizer.save_pretrained("./qwen_qa_finetune")

  warn(
  trainer = Trainer(
Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33ms-kishore[0m. Use [1m`wandb login --relogin`[0m to force relogin


OutOfMemoryError: CUDA out of memory. Tried to allocate 16.00 MiB. GPU 0 has a total capacity of 31.74 GiB of which 15.56 MiB is free. Process 128540 has 20.43 GiB memory in use. Including non-PyTorch memory, this process has 11.29 GiB memory in use. Of the allocated memory 10.50 GiB is allocated by PyTorch, and 434.21 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)