# Fine-tuning a model

## 0. Setup

In [1]:
# Enable IPython's autoreload extension in mode 2: imported modules are
# reloaded before each cell executes, so edits to project .py files take
# effect without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import os
import pandas as pd
from pathlib import Path
import sys

# Make the project root importable so local packages (e.g. `utils`) resolve.
# The root is taken to be the parent of the current working directory, i.e.
# this assumes the kernel was started from the notebook's own folder.
project_root = Path.cwd().parent
# Guard the append: the cell is re-run often, and an unconditional append
# would stack one duplicate sys.path entry per execution.
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))

In [5]:
# Restrict this process to GPUs 0-3, then echo the variable back as a sanity check.
os.environ.update({"CUDA_VISIBLE_DEVICES": "0,1,2,3"})
print(os.environ["CUDA_VISIBLE_DEVICES"])

0,1,2,3


In [6]:
import torch

# Report how many CUDA devices torch can see, followed by each device's name.
print(torch.cuda.device_count())
print(list(map(torch.cuda.get_device_name, range(torch.cuda.device_count()))))

4
['NVIDIA L40S', 'NVIDIA L40S', 'NVIDIA A100 80GB PCIe', 'NVIDIA A100 80GB PCIe']


## 1. Fine-tuning Llama 3 (with LoRA)

In [7]:
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    DataCollatorForSeq2Seq
)
from peft import get_peft_model, LoraConfig, TaskType
import pandas as pd
from datasets import Dataset

from utils.prompt_creation_helpers import create_guevara_prompt  # helper
from transformers import logging as hf_logging

# Surface Hugging Face INFO-level logs (cache resolution, config dumps) in the cell output.
hf_logging.set_verbosity_info()

# === Configuration ===
MODEL_NAME = "meta-llama/Meta-Llama-3-8B-Instruct"
CACHE_DIR = "/data/resource/huggingface/hub"
CSV_PATH = "../data/raw/Annotated-MIMIC-III-Data/ManuallyAnnotatedSyntheticSentences.csv"
OUTPUT_DIR = "./outputs/llama3_lora_guevara"

# === Load tokenizer & model (half precision, no quantization) ===
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, cache_dir=CACHE_DIR)
if tokenizer.pad_token is None:
    # The Llama 3 tokenizer does not define a padding token by default, and
    # every padding call (tokenizer(..., padding=...) or a padding data
    # collator) raises without one. Reusing EOS is safe for training because
    # padded positions are excluded through the attention mask.
    tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    cache_dir=CACHE_DIR,
    torch_dtype=torch.float16,  # weights are held in fp16, not fp32
    device_map="auto",          # let the loader place layers across the visible GPUs
)

# === Add LoRA ===
# Low-rank adapters on the four attention projections only; with bias="none"
# no bias terms are trained.
lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.CAUSAL_LM,
)
model = get_peft_model(model, lora_config)

# === Load and prepare dataset ===
# Rows missing either the input sentence or its label cannot form a training pair.
df = pd.read_csv(CSV_PATH).dropna(subset=["sentence", "label"])
# After dropna the index is no longer a plain range, so from_pandas would
# otherwise materialise it as an extra `__index_level_0__` column.
dataset = Dataset.from_pandas(df, preserve_index=False)

def tokenize(example):
    """Convert one dataset row into a causal-LM training example.

    A decoder-only model computes its loss position-by-position over
    ``input_ids``, so ``labels`` must have exactly the same length as
    ``input_ids`` and the target text must be part of the input sequence.
    The prompt and the answer are therefore concatenated, and the prompt
    positions are masked with -100 so only the answer contributes to the loss.

    No padding is applied here; sequences are returned at their natural
    length so a padding data collator can pad each batch dynamically.

    Args:
        example: Mapping with a ``"sentence"`` entry (input text) and a
            ``"label"`` entry (target text).

    Returns:
        dict with ``input_ids``, ``attention_mask`` and ``labels``, all plain
        lists of equal length.
    """
    prompt = create_guevara_prompt(example["sentence"], tokenizer)

    # Same length budgets as before: at most 512 prompt tokens, 128 answer tokens.
    prompt_ids = list(tokenizer(prompt, truncation=True, max_length=512)["input_ids"])
    answer_ids = list(
        tokenizer(
            str(example["label"]),      # tolerate non-string labels read from the CSV
            add_special_tokens=False,   # no second BOS in the middle of the sequence
            truncation=True,
            max_length=128,
        )["input_ids"]
    )
    # Terminate the answer so the model learns where to stop generating.
    if tokenizer.eos_token_id is not None:
        answer_ids.append(tokenizer.eos_token_id)

    input_ids = prompt_ids + answer_ids
    return {
        "input_ids": input_ids,
        "attention_mask": [1] * len(input_ids),
        # -100 is the index ignored by the cross-entropy loss.
        "labels": [-100] * len(prompt_ids) + answer_ids,
    }

# Tokenize every row and drop the raw CSV columns afterwards. The drop matters:
# the Trainer keeps a column literally named `label`, and DataCollatorForSeq2Seq
# looks for `label` before `labels`, so the raw string column would otherwise be
# handed to the collator as if it were the token-id targets.
tokenized_dataset = dataset.map(tokenize, remove_columns=dataset.column_names)

# === Training Arguments ===
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,  # 4 x 4 = 16 sequences per optimizer step
    num_train_epochs=3,
    fp16=True,                      # fp16 mixed-precision training
    save_total_limit=2,             # keep only the two most recent checkpoints
    logging_steps=10,
    save_steps=200,
    report_to="none",               # no external experiment tracker
)

# === Trainer ===
# The seq2seq collator is used because, unlike the plain padding collator, it
# also pads `labels`, filling with -100 so padded positions are ignored by the loss.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, label_pad_token_id=-100)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# === Train ===
trainer.train()

# === Save the LoRA adapter ===
# `model` is the PEFT-wrapped model, so save_pretrained writes the adapter
# weights rather than a full copy of the base model.
model.save_pretrained(f"{OUTPUT_DIR}/lora_adapter_only")
tokenizer.save_pretrained(f"{OUTPUT_DIR}/tokenizer")

loading file tokenizer.json from cache at /data/resource/huggingface/hub/models--meta-llama--Meta-Llama-3-8B-Instruct/snapshots/5f0b02c75b57c5855da9ae460ce51323ea669d8a/tokenizer.json
loading file tokenizer.model from cache at None
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at /data/resource/huggingface/hub/models--meta-llama--Meta-Llama-3-8B-Instruct/snapshots/5f0b02c75b57c5855da9ae460ce51323ea669d8a/special_tokens_map.json
loading file tokenizer_config.json from cache at /data/resource/huggingface/hub/models--meta-llama--Meta-Llama-3-8B-Instruct/snapshots/5f0b02c75b57c5855da9ae460ce51323ea669d8a/tokenizer_config.json
loading file chat_template.jinja from cache at None
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
loading configuration file config.json from cache at /data/resource/huggingface/hub/models--meta-llama--Meta-Llama-3-8B-Instruct/snapshots/5f0b02c75

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

All model checkpoint weights were used when initializing LlamaForCausalLM.

All the weights of LlamaForCausalLM were initialized from the model checkpoint at meta-llama/Meta-Llama-3-8B-Instruct.
If your task is similar to the task the model of the checkpoint was trained on, you can already use LlamaForCausalLM for predictions without further training.
loading configuration file generation_config.json from cache at /data/resource/huggingface/hub/models--meta-llama--Meta-Llama-3-8B-Instruct/snapshots/5f0b02c75b57c5855da9ae460ce51323ea669d8a/generation_config.json
Generate config GenerationConfig {
  "bos_token_id": 128000,
  "do_sample": true,
  "eos_token_id": [
    128001,
    128009
  ],
  "max_length": 4096,
  "temperature": 0.6,
  "top_p": 0.9
}

/tmp/tmp8wsqnnpv/main.c:1:10: fatal error: cuda.h: No such file or directory
    1 | #include "cuda.h"
      |          ^~~~~~~~
compilation terminated.


CalledProcessError: Command '['/usr/bin/gcc', '/tmp/tmp8wsqnnpv/main.c', '-O3', '-shared', '-fPIC', '-Wno-psabi', '-o', '/tmp/tmp8wsqnnpv/cuda_utils.cpython-310-x86_64-linux-gnu.so', '-lcuda', '-L/data/resource/anaconda/envs/keble8263-sdoh-extraction/lib/python3.10/site-packages/triton/backends/nvidia/lib', '-L/lib/x86_64-linux-gnu', '-I/data/resource/anaconda/envs/keble8263-sdoh-extraction/lib/python3.10/site-packages/triton/backends/nvidia/include', '-I/tmp/tmp8wsqnnpv', '-I/opt/anaconda/envs/keble8263-sdoh-extraction/include/python3.10', '-I/opt/anaconda/envs/keble8263-sdoh-extraction/targets/x86_64-linux/include']' returned non-zero exit status 1.