# LLaMA-8b-Instruct Fine-tuning for SDoH Classification

## 0. Setup

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
from pathlib import Path
import sys

# Add the project root to the Python path to import the modules
project_root = Path().absolute().parent
sys.path.append(str(project_root))

In [3]:
import os
# os.environ["CUDA_VISIBLE_DEVICES"] = "2"  # A100 (nvtop Device 0)
os.environ["CUDA_VISIBLE_DEVICES"] = "3"  # Use nvtop Device 1 (A100)

In [4]:
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling,
)
from peft import get_peft_model, LoraConfig, TaskType
from datasets import Dataset
import pandas as pd

LLAMA_MODEL_NAME = "meta-llama/Llama-3.1-8B-Instruct"
CACHE_DIR = "/data/resource/huggingface/hub"
MODEL_OUTPUT_DIR = "../results/model_training/llama_lora_binary_sdoh"



## 1. Load and prepare data

In [5]:
from src.classification.prompt_creation_helpers import create_automated_prompt

train_df = pd.read_csv("../data/processed/train-test/train_set.csv")
test_df = pd.read_csv("../data/processed/train-test/test_set.csv")

def make_prompt(row):
    return create_automated_prompt(
        sentence=row["Sentence"],
        labels=row["completion"],
        task_type="sdoh_detection"
    )

train_df["text"] = train_df.apply(make_prompt, axis=1)
test_df["text"] = test_df.apply(make_prompt, axis=1)

train_dataset = Dataset.from_pandas(train_df[["text"]])
test_dataset = Dataset.from_pandas(test_df[["text"]])


## 2. Load tokenizer & model in 4-bit & apply LoRA

In [None]:
from transformers import BitsAndBytesConfig

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

tokenizer = AutoTokenizer.from_pretrained(LLAMA_MODEL_NAME, cache_dir=CACHE_DIR, use_fast=True)
tokenizer.pad_token = tokenizer.eos_token  # LLaMA doesn't have a pad token

model = AutoModelForCausalLM.from_pretrained(
    LLAMA_MODEL_NAME,
    cache_dir=CACHE_DIR,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [None]:
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.CAUSAL_LM,
)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

trainable params: 6,815,744 || all params: 8,037,076,992 || trainable%: 0.0848


In [None]:
print("Using device:", torch.cuda.current_device(), "-", torch.cuda.get_device_name(torch.cuda.current_device()))

Using device: 0 - NVIDIA A100 80GB PCIe


## 3. Tokenise

In [None]:
def tokenize(example):
    encoding = tokenizer(
        example["text"],
        padding="max_length",
        truncation=True,
        max_length=512,
    )
    encoding["labels"] = encoding["input_ids"].copy()
    return encoding

train_dataset = train_dataset.map(tokenize, batched=True)
test_dataset = test_dataset.map(tokenize, batched=True)

Map:   0%|          | 0/564 [00:00<?, ? examples/s]

Map:   0%|          | 0/243 [00:00<?, ? examples/s]

## 4. Training

In [None]:
print("CUDA_VISIBLE_DEVICES:", os.environ.get("CUDA_VISIBLE_DEVICES"))
print("torch.cuda.device_count():", torch.cuda.device_count())

for i in range(torch.cuda.device_count()):
    print(f"cuda:{i} -> {torch.cuda.get_device_name(i)}")

CUDA_VISIBLE_DEVICES: 3
torch.cuda.device_count(): 1
cuda:0 -> NVIDIA A100 80GB PCIe


In [None]:
training_args = TrainingArguments(
    output_dir=MODEL_OUTPUT_DIR,
    label_names=["labels"],
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-4,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=2,
    logging_dir=os.path.join(MODEL_OUTPUT_DIR, "logs"),
    save_total_limit=1,
    load_best_model_at_end=True,
    bf16=True,  # or fp16 if not on A100
    logging_steps=10,
    report_to=[],
    run_name="llama3_lora_sdoh",
)

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

  trainer = Trainer(
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [None]:
trainer.train()

Epoch,Training Loss,Validation Loss
1,0.4654,0.390792
2,0.4101,0.366949



Cannot access gated repo for url https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct/resolve/main/config.json.
Access to model meta-llama/Llama-3.1-8B-Instruct is restricted and you are not in the authorized list. Visit https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct to ask for access. - silently ignoring the lookup for the file config.json in meta-llama/Llama-3.1-8B-Instruct.

Cannot access gated repo for url https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct/resolve/main/config.json.
Access to model meta-llama/Llama-3.1-8B-Instruct is restricted and you are not in the authorized list. Visit https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct to ask for access. - silently ignoring the lookup for the file config.json in meta-llama/Llama-3.1-8B-Instruct.


TrainOutput(global_step=282, training_loss=0.42493336526214653, metrics={'train_runtime': 253.2473, 'train_samples_per_second': 4.454, 'train_steps_per_second': 1.114, 'total_flos': 2.6029803077369856e+16, 'train_loss': 0.42493336526214653, 'epoch': 2.0})

In [None]:
trainer.save_model(MODEL_OUTPUT_DIR)
tokenizer.save_pretrained(MODEL_OUTPUT_DIR)
print(f"✅ Model saved to {MODEL_OUTPUT_DIR}")


Cannot access gated repo for url https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct/resolve/main/config.json.
Access to model meta-llama/Llama-3.1-8B-Instruct is restricted and you are not in the authorized list. Visit https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct to ask for access. - silently ignoring the lookup for the file config.json in meta-llama/Llama-3.1-8B-Instruct.


✅ Model saved to ../results/model_training/llama_lora_binary_sdoh


## 5. Evaluation

In [None]:
# Paths and config
MODEL_OUTPUT_DIR = "../results/llama_lora_sdoh_detection"
TEST_CSV = "../data/processed/train-test/test_set.csv"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Load model + tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_OUTPUT_DIR)
tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(MODEL_OUTPUT_DIR, device_map="auto")
model.eval()

# Load test data
df = pd.read_csv(TEST_CSV)

# Limit to 5 samples
df_sample = df.sample(n=5, random_state=42).reset_index(drop=True)

# Generate prompt
df_sample["prompt"] = df_sample.apply(
    lambda row: create_automated_prompt(
        sentence=row["Sentence"],
        labels=row["completion"],
        task_type="sdoh_detection"
    ), axis=1
)

# Generate responses
def generate_response(prompt, max_new_tokens=64):
    inputs = tokenizer(prompt, return_tensors="pt").to(DEVICE)
    with torch.no_grad():
        output = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=False,
            temperature=0.0,
            pad_token_id=tokenizer.eos_token_id
        )
    decoded = tokenizer.decode(output[0], skip_special_tokens=True)
    return decoded

# Extract just <LIST>...</LIST>
def extract_list_output(output_text):
    start = output_text.find("<LIST>")
    end = output_text.find("</LIST>")
    if start != -1 and end != -1:
        return output_text[start:end+7]
    return "NO_LIST_FOUND"

# Run generation
df_sample["generated_completion"] = df_sample["prompt"].apply(lambda x: extract_list_output(generate_response(x)))

# Display results
for i, row in df_sample.iterrows():
    print(f"🔢 Example {i+1}")
    print("📝 Sentence:", row["Sentence"])
    print("✅ True label:", row["completion"])
    print("📤 Prompt:\n", row["prompt"])
    print("🤖 Generated:\n", row["generated_completion"])
    print("-" * 80)

In [None]:
from tqdm import tqdm
from transformers import TextStreamer, BitsAndBytesConfig

# ================
# 🔧 Setup
# ================
MODEL_OUTPUT_DIR = "../results/model_training/llama_lora_binary_sdoh"
CACHE_DIR = "/data/resource/huggingface/hub"
TEST_CSV = "../data/processed/train-test/test_set.csv"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
LLAMA_MODEL_NAME = "meta-llama/Llama-3.1-8B-Instruct"

# ================
# 📥 Load model in 4-bit
# ================
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

tokenizer = AutoTokenizer.from_pretrained(MODEL_OUTPUT_DIR, use_fast=True)
tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(
    MODEL_OUTPUT_DIR,
    cache_dir=CACHE_DIR,
    device_map="auto",
    quantization_config=bnb_config,
    trust_remote_code=True
)
model.eval()

# ================
# 📊 Load test set
# ================
df = pd.read_csv(TEST_CSV)

# ================
# 📜 Generate prompts
# ================
def make_prompt(sentence, labels=None):
    return create_automated_prompt(sentence=sentence, labels=labels, task_type="sdoh_detection")

df["prompt"] = df.apply(lambda row: make_prompt(row["Sentence"], row["completion"]), axis=1)

# ================
# 🔮 Generate predictions
# ================
def generate_response(prompt, max_new_tokens=64):
    inputs = tokenizer(prompt, return_tensors="pt").to(DEVICE)
    with torch.no_grad():
        output = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=False,
            temperature=0.0,
            pad_token_id=tokenizer.eos_token_id
        )
    decoded = tokenizer.decode(output[0], skip_special_tokens=True)
    return decoded

def extract_list_output(output_text):
    # Extracts content between <LIST>...</LIST>
    start = output_text.find("<LIST>")
    end = output_text.find("</LIST>")
    if start != -1 and end != -1:
        return output_text[start:end+7]
    return "NO_LIST_FOUND"

# tqdm progress bar for batch generation
outputs = []
for prompt in tqdm(df["prompt"], desc="Generating predictions"):
    full_output = generate_response(prompt)
    outputs.append(extract_list_output(full_output))

df["generated_completion"] = outputs

# ================
# 🧮 Evaluation (basic)
# ================
from sklearn.metrics import classification_report

y_true = df["completion"].apply(lambda x: "NoSDoH" if "NoSDoH" in x else "AnySDoH")
y_pred = df["generated_completion"].apply(lambda x: "NoSDoH" if "NoSDoH" in x else "AnySDoH")

print("\n📊 Classification Report (binary presence):")
print(classification_report(y_true, y_pred, target_names=["NoSDoH", "Any SDoH"]))

# ================
# 💾 Save results
# ================
df[["Sentence", "completion", "generated_completion"]].to_csv(
    os.path.join(MODEL_OUTPUT_DIR, "eval_predictions.csv"),
    index=False
)
print(f"\n✅ Predictions saved to {MODEL_OUTPUT_DIR}/eval_predictions.csv")

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Generating predictions:   0%|          | 0/243 [00:00<?, ?it/s]The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Generating predictions:   0%|          | 1/243 [00:10<41:29, 10.29s/it]The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Generating predictions:   1%|          | 2/243 [00:14<27:52,  6.94s/it]The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Generating predictions:   1%|          | 3/243 [00:19<23:27,  5.87s/it]The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Generating predictions:   2%|▏         | 4/243 [00:24<21:18,  5.35s/it]The following generation flags are not valid and may be ignored: ['temperature', 


📊 Classification Report (binary presence):
              precision    recall  f1-score   support

      NoSDoH       0.41      1.00      0.58        99
    Any SDoH       0.00      0.00      0.00       144

    accuracy                           0.41       243
   macro avg       0.20      0.50      0.29       243
weighted avg       0.17      0.41      0.24       243


✅ Predictions saved to ../results/model_training/llama_lora_binary_sdoh/eval_predictions.csv


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
