# LLaMA-8b-Instruct Fine-tuning for SDoH Classification

## 0. Setup

In [17]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [18]:
import os
from pathlib import Path
import sys

# Add the project root to the Python path to import the modules
project_root = Path().absolute().parent
sys.path.append(str(project_root))

In [19]:
import os
# os.environ["CUDA_VISIBLE_DEVICES"] = "2"  # A100 (nvtop Device 0)
os.environ["CUDA_VISIBLE_DEVICES"] = "3"  # Use nvtop Device 1 (A100)

In [20]:
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling,
)
from peft import get_peft_model, LoraConfig, TaskType
from datasets import Dataset
import pandas as pd

LLAMA_MODEL_NAME = "meta-llama/Llama-3.1-8B-Instruct"
CACHE_DIR = "/data/resource/huggingface/hub"
MODEL_OUTPUT_DIR = "../results/model_training/llama_lora_binary_sdoh"

## 1. Load and prepare data

In [40]:
from src.classification.prompt_creation_helpers import create_automated_prompt

# Load data
train_df = pd.read_csv("../data/processed/train-test/train_set.csv")
test_df = pd.read_csv("../data/processed/train-test/test_set.csv")

# Map completions to binary
binary_label = lambda c: "<LIST>NoSDoH</LIST>" if "NoSDoH" in c else "<LIST>SDoH</LIST>"
train_df["completion"] = train_df["completion"].map(binary_label)
test_df["completion"] = test_df["completion"].map(binary_label)

# Create prompts
train_df["text"] = train_df.apply(lambda row: create_automated_prompt(row["Sentence"], row["completion"], task_type="sdoh_detection"), axis=1)
test_df["text"] = test_df.apply(lambda row: create_automated_prompt(row["Sentence"], row["completion"], task_type="sdoh_detection"), axis=1)

# Convert to Hugging Face datasets
train_dataset = Dataset.from_pandas(train_df[["text", "completion"]])
test_dataset = Dataset.from_pandas(test_df[["text", "completion"]])

In [41]:
train_df["completion"].value_counts()

completion
<LIST>NoSDoH</LIST>    337
<LIST>SDoH</LIST>      227
Name: count, dtype: int64

## 2. Load tokenizer & model in 4-bit & apply LoRA

In [23]:
from transformers import BitsAndBytesConfig

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

tokenizer = AutoTokenizer.from_pretrained(LLAMA_MODEL_NAME, cache_dir=CACHE_DIR, use_fast=True)
tokenizer.pad_token = tokenizer.eos_token  # LLaMA doesn't have a pad token

model = AutoModelForCausalLM.from_pretrained(
    LLAMA_MODEL_NAME,
    cache_dir=CACHE_DIR,
    quantization_config=bnb_config,
    device_map={"": 0},
    trust_remote_code=True,
)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [24]:
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.CAUSAL_LM,
)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

trainable params: 6,815,744 || all params: 8,037,076,992 || trainable%: 0.0848


In [25]:
print("Using device:", torch.cuda.current_device(), "-", torch.cuda.get_device_name(torch.cuda.current_device()))

Using device: 0 - NVIDIA A100 80GB PCIe


## 3. Tokenise

In [26]:
def tokenize(example):
    encoding = tokenizer(
        example["text"],
        padding="max_length",
        truncation=True,
        max_length=512,
    )
    encoding["labels"] = encoding["input_ids"].copy()
    return encoding

train_dataset = train_dataset.map(tokenize, batched=True)
test_dataset = test_dataset.map(tokenize, batched=True)

Map:   0%|          | 0/564 [00:00<?, ? examples/s]

Map:   0%|          | 0/243 [00:00<?, ? examples/s]

## 4. Training

In [27]:
print("CUDA_VISIBLE_DEVICES:", os.environ.get("CUDA_VISIBLE_DEVICES"))
print("torch.cuda.device_count():", torch.cuda.device_count())

for i in range(torch.cuda.device_count()):
    print(f"cuda:{i} -> {torch.cuda.get_device_name(i)}")

CUDA_VISIBLE_DEVICES: 3
torch.cuda.device_count(): 1
cuda:0 -> NVIDIA A100 80GB PCIe


In [28]:
training_args = TrainingArguments(
    output_dir=MODEL_OUTPUT_DIR,
    label_names=["labels"],
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-4,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=2,
    logging_dir=os.path.join(MODEL_OUTPUT_DIR, "logs"),
    save_total_limit=1,
    load_best_model_at_end=True,
    bf16=True,  # or fp16 if not on A100
    logging_steps=10,
    report_to=[],
    run_name="llama3_lora_sdoh",
)

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

  trainer = Trainer(


In [29]:
trainer.train()

Epoch,Training Loss,Validation Loss
1,0.4686,0.393968
2,0.411,0.368052



Cannot access gated repo for url https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct/resolve/main/config.json.
Access to model meta-llama/Llama-3.1-8B-Instruct is restricted and you are not in the authorized list. Visit https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct to ask for access. - silently ignoring the lookup for the file config.json in meta-llama/Llama-3.1-8B-Instruct.

Cannot access gated repo for url https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct/resolve/main/config.json.
Access to model meta-llama/Llama-3.1-8B-Instruct is restricted and you are not in the authorized list. Visit https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct to ask for access. - silently ignoring the lookup for the file config.json in meta-llama/Llama-3.1-8B-Instruct.


TrainOutput(global_step=282, training_loss=0.426088458257364, metrics={'train_runtime': 251.5888, 'train_samples_per_second': 4.484, 'train_steps_per_second': 1.121, 'total_flos': 2.6029803077369856e+16, 'train_loss': 0.426088458257364, 'epoch': 2.0})

In [30]:
trainer.save_model(MODEL_OUTPUT_DIR)
tokenizer.save_pretrained(MODEL_OUTPUT_DIR)
print(f"✅ Model saved to {MODEL_OUTPUT_DIR}")


Cannot access gated repo for url https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct/resolve/main/config.json.
Access to model meta-llama/Llama-3.1-8B-Instruct is restricted and you are not in the authorized list. Visit https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct to ask for access. - silently ignoring the lookup for the file config.json in meta-llama/Llama-3.1-8B-Instruct.


✅ Model saved to ../results/model_training/llama_lora_binary_sdoh


## 5. Evaluation

### Baseline with few-shot

In [46]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

tokenizer = AutoTokenizer.from_pretrained(LLAMA_MODEL_NAME, cache_dir=CACHE_DIR, use_fast=True)
tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(
    LLAMA_MODEL_NAME,
    cache_dir=CACHE_DIR,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True
)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [44]:
def run_prompt(prompt: str, model, tokenizer, max_new_tokens=50):
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)
    device = next(model.parameters()).device
    inputs = {k: v.to(device) for k, v in inputs.items()}

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=False,
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id,
        )

    response = tokenizer.decode(outputs[0][inputs['input_ids'].shape[1]:], skip_special_tokens=True)
    return response.strip()

In [48]:
from sklearn.metrics import classification_report
from tqdm import tqdm

# For safety, ensure test_df["text"] and test_df["completion"] are in place
assert "text" in test_df.columns and "completion" in test_df.columns

def extract_prediction(response: str) -> str:
    """Extracts the first well-formed <LIST>...</LIST> tag from the model output."""
    if "<LIST>" in response and "</LIST>" in response:
        start = response.index("<LIST>")
        end = response.index("</LIST>") + len("</LIST>")
        return response[start:end]
    return "<LIST>UNKNOWN</LIST>"

gold = []
preds = []

print("🔍 Running inference on full test set...")
for _, row in tqdm(test_df.iterrows(), total=len(test_df)):
    prompt = row["text"]
    expected = row["completion"]

    output = run_prompt(prompt, model, tokenizer, max_new_tokens=20)
    prediction = extract_prediction(output)

    gold.append(expected)
    preds.append(prediction)

# 📊 Report
print("\n📊 Classification Report (binary presence):")
print(classification_report(
    gold, preds,
    labels=["<LIST>NoSDoH</LIST>", "<LIST>SDoH</LIST>"],
    digits=2
))

🔍 Running inference on full test set...


  0%|          | 0/243 [00:00<?, ?it/s]The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


  0%|          | 1/243 [00:00<02:52,  1.40it/s]The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
  1%|          | 2/243 [00:01<02:58,  1.35it/s]The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
  1%|          | 3/243 [00:02<02:47,  1.44it/s]The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
  2%|▏         | 4/243 [00:02<02:41,  1.48it/s]The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
  2%|▏         | 5/243 [00:03<02:37,  1.51it/s]The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
  2%|▏         | 6/243 [00:03<02:27,  1.60it/s]The


📊 Classification Report (binary presence):
                     precision    recall  f1-score   support

<LIST>NoSDoH</LIST>       0.89      0.74      0.81       144
  <LIST>SDoH</LIST>       0.70      0.87      0.77        99

          micro avg       0.79      0.79      0.79       243
          macro avg       0.79      0.80      0.79       243
       weighted avg       0.81      0.79      0.79       243






### Fine-tuned (test on a sample)

In [49]:
# Paths and config
MODEL_OUTPUT_DIR = "../results/model_training/llama_lora_binary_sdoh"
TEST_CSV = "../data/processed/train-test/test_set.csv"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Load model + tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_OUTPUT_DIR)
tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(MODEL_OUTPUT_DIR, device_map="auto")
model.eval()

# === Load first 10 test rows ===
df_sample = pd.read_csv(TEST_CSV).head(10).reset_index(drop=True)

# Generate prompt
df_sample["prompt"] = df_sample.apply(
    lambda row: create_automated_prompt(
        sentence=row["Sentence"],
        labels=row["completion"],
        task_type="sdoh_detection"
    ), axis=1
)

# Generate responses
def generate_response(prompt, max_new_tokens=64):
    device = next(model.parameters()).device
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)
    inputs = {k: v.to(device) for k, v in inputs.items()}
    
    with torch.no_grad():
        output = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=False,
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id
        )
    
    input_len = inputs["input_ids"].shape[1]
    decoded = tokenizer.decode(output[0][input_len:], skip_special_tokens=True)
    return decoded.strip()

# Extract just <LIST>...</LIST>
def extract_list_output(output_text):
    start = output_text.find("<LIST>")
    end = output_text.find("</LIST>")
    if start != -1 and end != -1:
        return output_text[start:end+7]
    return "NO_LIST_FOUND"

# Run generation
df_sample["generated_completion"] = df_sample["prompt"].apply(lambda x: extract_list_output(generate_response(x)))

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.

In [51]:
# Run generation on full test set
predictions = []
for prompt in tqdm(test_df["text"], desc="Generating predictions"):
    output = generate_response(prompt)
    prediction = extract_list_output(output)
    predictions.append(prediction)

test_df["generated_completion"] = predictions

# Evaluation
print("\n📊 Classification Report:")
print(classification_report(
    test_df["completion"],
    test_df["generated_completion"],
    labels=["<LIST>NoSDoH</LIST>", "<LIST>SDoH</LIST>"],
    digits=2
))

Generating predictions:   0%|          | 0/243 [00:00<?, ?it/s]The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Generating predictions:   0%|          | 1/243 [00:00<02:48,  1.43it/s]The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Generating predictions:   1%|          | 2/243 [00:01<02:49,  1.42it/s]The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Generating predictions:   1%|          | 3/243 [00:02<02:42,  1.48it/s]The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Generating predictions:   2%|▏         | 4/243 [00:02<02:28,  1.61it/s]The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Generating predictions:   2%|▏         | 5/243 [00:03<02:20,  1.70it/s]The following generation flags are not valid and may be ignored: ['temper


📊 Classification Report:
                     precision    recall  f1-score   support

<LIST>NoSDoH</LIST>       0.91      0.69      0.79       144
  <LIST>SDoH</LIST>       0.67      0.90      0.77        99

           accuracy                           0.78       243
          macro avg       0.79      0.80      0.78       243
       weighted avg       0.81      0.78      0.78       243






In [None]:
# from tqdm import tqdm
# from transformers import TextStreamer, BitsAndBytesConfig

# # ================
# # 🔧 Setup
# # ================
# MODEL_OUTPUT_DIR = "../results/model_training/llama_lora_binary_sdoh"
# CACHE_DIR = "/data/resource/huggingface/hub"
# TEST_CSV = "../data/processed/train-test/test_set.csv"
# DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# LLAMA_MODEL_NAME = "meta-llama/Llama-3.1-8B-Instruct"

# # ================
# # 📥 Load model in 4-bit
# # ================
# bnb_config = BitsAndBytesConfig(
#     load_in_4bit=True,
#     bnb_4bit_use_double_quant=True,
#     bnb_4bit_quant_type="nf4",
#     bnb_4bit_compute_dtype=torch.float16,
# )

# tokenizer = AutoTokenizer.from_pretrained(MODEL_OUTPUT_DIR, use_fast=True)
# tokenizer.pad_token = tokenizer.eos_token

# model = AutoModelForCausalLM.from_pretrained(
#     MODEL_OUTPUT_DIR,
#     cache_dir=CACHE_DIR,
#     device_map="auto",
#     quantization_config=bnb_config,
#     trust_remote_code=True
# )
# model.eval()

# # ================
# # 📊 Load test set
# # ================
# df = pd.read_csv(TEST_CSV)

# # ================
# # 📜 Generate prompts
# # ================
# def make_prompt(sentence, labels=None):
#     return create_automated_prompt(sentence=sentence, labels=labels, task_type="sdoh_detection")

# df["prompt"] = df.apply(lambda row: make_prompt(row["Sentence"], row["completion"]), axis=1)

# # ================
# # 🔮 Generate predictions
# # ================
# def generate_response(prompt, max_new_tokens=64):
#     inputs = tokenizer(prompt, return_tensors="pt").to(DEVICE)
#     with torch.no_grad():
#         output = model.generate(
#             **inputs,
#             max_new_tokens=max_new_tokens,
#             do_sample=False,
#             temperature=0.0,
#             pad_token_id=tokenizer.eos_token_id
#         )
#     decoded = tokenizer.decode(output[0], skip_special_tokens=True)
#     return decoded

# def extract_list_output(output_text):
#     # Extracts content between <LIST>...</LIST>
#     start = output_text.find("<LIST>")
#     end = output_text.find("</LIST>")
#     if start != -1 and end != -1:
#         return output_text[start:end+7]
#     return "NO_LIST_FOUND"

# # tqdm progress bar for batch generation
# outputs = []
# for prompt in tqdm(df["prompt"], desc="Generating predictions"):
#     full_output = generate_response(prompt)
#     outputs.append(extract_list_output(full_output))

# df["generated_completion"] = outputs

# # ================
# # 🧮 Evaluation (basic)
# # ================
# from sklearn.metrics import classification_report

# y_true = df["completion"].apply(lambda x: "NoSDoH" if "NoSDoH" in x else "AnySDoH")
# y_pred = df["generated_completion"].apply(lambda x: "NoSDoH" if "NoSDoH" in x else "AnySDoH")

# print("\n📊 Classification Report (binary presence):")
# print(classification_report(y_true, y_pred, target_names=["NoSDoH", "Any SDoH"]))

# # ================
# # 💾 Save results
# # ================
# df[["Sentence", "completion", "generated_completion"]].to_csv(
#     os.path.join(MODEL_OUTPUT_DIR, "eval_predictions.csv"),
#     index=False
# )
# print(f"\n✅ Predictions saved to {MODEL_OUTPUT_DIR}/eval_predictions.csv")

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Generating predictions:   0%|          | 0/243 [00:00<?, ?it/s]The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Generating predictions:   0%|          | 1/243 [00:10<41:29, 10.29s/it]The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Generating predictions:   1%|          | 2/243 [00:14<27:52,  6.94s/it]The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Generating predictions:   1%|          | 3/243 [00:19<23:27,  5.87s/it]The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Generating predictions:   2%|▏         | 4/243 [00:24<21:18,  5.35s/it]The following generation flags are not valid and may be ignored: ['temperature', 


📊 Classification Report (binary presence):
              precision    recall  f1-score   support

      NoSDoH       0.41      1.00      0.58        99
    Any SDoH       0.00      0.00      0.00       144

    accuracy                           0.41       243
   macro avg       0.20      0.50      0.29       243
weighted avg       0.17      0.41      0.24       243


✅ Predictions saved to ../results/model_training/llama_lora_binary_sdoh/eval_predictions.csv


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
