In [1]:
import torch
import gc

def free_gpu_memory():
    """Release cached GPU memory and print the current CUDA memory figures.

    Forces a garbage-collection pass, empties PyTorch's CUDA cache, then
    prints two lines: the memory currently allocated and the memory currently
    reserved (labelled "cached") on the current device, both divided by
    1024**2 and shown with two decimals.

    Returns:
        None. The figures are only printed.
    """
    bytes_per_mb = 1024**2

    # Collect unreachable Python objects before emptying the cache.
    gc.collect()
    torch.cuda.empty_cache()

    # Query both counters, then report them.
    allocated_mb = torch.cuda.memory_allocated() / bytes_per_mb
    reserved_mb = torch.cuda.memory_reserved() / bytes_per_mb
    print(f"Current allocated GPU memory: {allocated_mb:.2f} MB")
    print(f"Current cached GPU memory: {reserved_mb:.2f} MB")

free_gpu_memory()

Current allocated GPU memory: 0.00 MB
Current cached GPU memory: 0.00 MB


In [2]:
!nvidia-smi

Sun Dec 29 21:45:13 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 545.23.08              Driver Version: 545.23.08    CUDA Version: 12.3     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla V100-SXM2-32GB           Off | 00000000:86:00.0 Off |                    0 |
| N/A   36C    P0              56W / 300W |  32485MiB / 32768MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [3]:
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments
from datasets import load_dataset
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model
import torch
from transformers import BitsAndBytesConfig

# Checkpoint used for both the tokenizer and the quantized base model
model_name = "Qwen/Qwen2.5-0.5B-Instruct"

# bitsandbytes settings: 4-bit NF4 weights, double quantization,
# float16 as the compute dtype
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

# LoRA adapter settings; adapters are attached to the attention projections only
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    r=8,  # adapter rank
    lora_alpha=32,  # LoRA scaling factor
    lora_dropout=0.05,
    bias="none",
)

# Tokenizer and quantized base model come from the same checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    quantization_config=bnb_config,
)

# Prepare the quantized model for k-bit training, then wrap it with the adapters
model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, lora_config)

# Supervised fine-tuning data (train split only)
sft_dataset = load_dataset("kishore-s-15/curriculum_compass_sft_dataset")["train"]

# Build causal-LM training features: prompt and response share ONE sequence
def preprocess_function(examples, max_length=4098 * 2, text_tokenizer=None):
    """Convert a batch of question/context/response rows into causal-LM features.

    Each example becomes a single token sequence ``prompt + response + EOS``.
    ``labels`` is a copy of that sequence in which the prompt tokens and the
    padding are replaced by -100, so the loss is computed only on the response
    (Hugging Face causal-LM models shift labels internally and ignore -100).

    The previous version tokenized prompt and response separately and used the
    response ids as labels for the prompt ids. The two sequences were not
    aligned position-by-position, the response never appeared in the model
    input, and padded label positions were not masked.

    Args:
        examples: Batch mapping with equally long "question", "context" and
            "response" lists (the format ``datasets.Dataset.map`` passes when
            ``batched=True``).
        max_length: Fixed length every sequence is truncated/padded to.
            NOTE(review): the default 4098 * 2 (= 8196) is kept from the
            original code; it looks like a typo for 4096 * 2 - confirm.
        text_tokenizer: Tokenizer to use. Defaults to the notebook-level
            ``tokenizer``. It must be callable on a list of strings with
            ``add_special_tokens=False`` and expose ``eos_token_id`` and
            ``pad_token_id``.

    Returns:
        dict with "input_ids", "attention_mask" and "labels"; each value is a
        list containing one ``max_length``-long list of ints per example.

    Raises:
        ValueError: if the tokenizer defines neither a pad nor an EOS token.
    """
    tok = tokenizer if text_tokenizer is None else text_tokenizer
    ignore_index = -100  # label value skipped by the cross-entropy loss

    eos_id = tok.eos_token_id
    pad_id = tok.pad_token_id if tok.pad_token_id is not None else eos_id
    if pad_id is None:
        raise ValueError("tokenizer defines neither a pad token nor an EOS token")

    # Same prompt text as before, byte for byte (including the 12-space
    # indentation the original triple-quoted f-string carried on every line).
    indent = " " * 12
    prompts = [
        f"\n{indent}Query:\n{indent}{question}\n{indent}Context:\n{indent}{context}\n{indent}"
        for question, context in zip(examples["question"], examples["context"])
    ]
    responses = list(examples["response"])

    features = {"input_ids": [], "attention_mask": [], "labels": []}
    if not prompts:
        return features

    # Tokenize without automatic special tokens; EOS is appended explicitly so
    # the model learns where a response ends.
    prompt_ids_batch = tok(prompts, add_special_tokens=False)["input_ids"]
    response_ids_batch = tok(responses, add_special_tokens=False)["input_ids"]

    for prompt_ids, response_ids in zip(prompt_ids_batch, response_ids_batch):
        response_ids = list(response_ids)
        if eos_id is not None:
            response_ids.append(eos_id)

        # The response has priority: trim the prompt (not the answer) when the
        # pair does not fit, so every example keeps some supervised tokens.
        response_ids = response_ids[:max_length]
        prompt_ids = list(prompt_ids)[: max_length - len(response_ids)]

        n_real = len(prompt_ids) + len(response_ids)
        n_pad = max_length - n_real

        features["input_ids"].append(prompt_ids + response_ids + [pad_id] * n_pad)
        features["attention_mask"].append([1] * n_real + [0] * n_pad)
        features["labels"].append(
            [ignore_index] * len(prompt_ids) + response_ids + [ignore_index] * n_pad
        )

    return features

# Tokenize the dataset; the raw text columns are dropped so that only the
# features returned by preprocess_function remain.
tokenized_datasets = sft_dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=["question", "context", "response"],
)

# Hold out 10% of the examples for evaluation. Previously the full training
# set was also passed as eval_dataset, so the per-epoch eval loss was measured
# on data the model had just been trained on and could not reveal overfitting.
# The fixed seed keeps the split reproducible across runs.
dataset_splits = tokenized_datasets.train_test_split(test_size=0.1, seed=42)

# Training arguments for LoRA fine-tuning
training_args = TrainingArguments(
    output_dir="./qwen_qa_lora",
    # NOTE(review): newer transformers releases rename this argument to
    # `eval_strategy` - confirm against the installed version.
    evaluation_strategy="epoch",
    learning_rate=2e-4,  # Higher learning rate for LoRA
    per_device_train_batch_size=4,  # Can be higher due to parameter efficiency
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    save_total_limit=2,
    fp16=True,
    push_to_hub=False,
    gradient_accumulation_steps=4,  # effective batch of 16 sequences per device
)

# Trainer wiring: train on the larger split, evaluate on the held-out split
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset_splits["train"],
    eval_dataset=dataset_splits["test"],
    # NOTE(review): newer transformers releases prefer `processing_class=`
    # over `tokenizer=` - confirm against the installed version.
    tokenizer=tokenizer,
)

# Train the model
trainer.train()

# Save the LoRA adapter weights and the tokenizer next to each other
model.save_pretrained("./qwen_qa_lora")
tokenizer.save_pretrained("./qwen_qa_lora")

  warn(


False

The following directories listed in your path were found to be non-existent: {PosixPath('/shared/centos7/cuda/11.2/doc/man')}
The following directories listed in your path were found to be non-existent: {PosixPath('sys/dashboard/sys/jupyterlab')}
The following directories listed in your path were found to be non-existent: {PosixPath('/usr/lib64/qt-3.3/include')}
The following directories listed in your path were found to be non-existent: {PosixPath('3128'), PosixPath('http'), PosixPath('//10.99.0.130')}
The following directories listed in your path were found to be non-existent: {PosixPath('3128'), PosixPath('http'), PosixPath('//10.99.0.130')}
The following directories listed in your path were found to be non-existent: {PosixPath('nodejs/14.15.4'), PosixPath('discovery/2021-10-06'), PosixPath('cuda/11.2')}
The following directories listed in your path were found to be non-existent: {PosixPath('/var/www/ood/apps/sys/dashboard')}
The following directories listed in your path were


python -m bitsandbytes


  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)


RuntimeError: Failed to import transformers.integrations.bitsandbytes because of the following error (look up to see its traceback):

        CUDA Setup failed despite GPU being available. Please run the following command to get more information:

        python -m bitsandbytes

        Inspect the output of the command and see if you can locate CUDA libraries. You might need to add them
        to your LD_LIBRARY_PATH. If you suspect a bug, please take the information from python -m bitsandbytes
        and open an issue at: https://github.com/TimDettmers/bitsandbytes/issues