# Miscellaneous code 

## Setup

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
from pathlib import Path
import sys

# Add the project root to the Python path to import the modules
project_root = Path().absolute().parent
sys.path.append(str(project_root))

## Aesthetics

In [None]:
# Colour palette
# From https://brand.ifrc.org/ifrc-brand-system/basics/colour
colour_palette = {
    'ifrc_red': '#EE2435',
    'ifrc_darkblue': '#011E41',
    'dark_green': '#009775',
    'medium_green': '#00AB84',
    'light_green': '#47D7AC',
    'medium_blue': '#8DCDE2',
    'light_blue': '#CCf5FC',
    'medium_orange': '#FF8200',
    'light_orange': '#FFB25B',
    'medium_purple': '#512D6D',
    'light_purple': '#958DBE',
    'grey': '#A7A8AA',
}

In [2]:
# Check models
# What models are available
cache_dir = "/data/resource/huggingface/hub"
available_models = []

if os.path.exists(cache_dir):
    for item in os.listdir(cache_dir):
        if item.startswith("models--"):
            # Convert models--org--name to org/name format
            model_name = item.replace("models--", "").replace("--", "/")
            available_models.append(model_name)

print("Available cached models:")
for model in sorted(available_models):
    print(f"  {model}")

Available cached models:
  CohereForAI/aya-23-35B
  CohereForAI/aya-23-8B
  CohereForAI/aya-vision-8b
  HuggingFaceTB/SmolLM-135M-Instruct
  LLaMAX/LLaMAX3-8B-Alpaca
  Qwen/Qwen1.5-4B
  Qwen/Qwen2-7B
  Qwen/Qwen2.5-1.5B
  Qwen/Qwen2.5-3B
  Qwen/Qwen2.5-72B-Instruct
  Qwen/Qwen2.5-7B
  Qwen/Qwen2.5-7B-Instruct
  Qwen/Qwen2.5-7B-instruct
  Qwen/Qwen2.5-VL-7B-Instruct
  Qwen/Qwen3-0.6B
  Qwen/Qwen3-8B
  Unbabel/wmt20-comet-qe-da
  Unbabel/wmt22-comet-da
  bert-base-uncased
  bert-large-uncased
  cardiffnlp/twitter-roberta-base-sentiment
  cardiffnlp/twitter-roberta-base-sentiment-latest
  clairebarale/refugee_cases_ner
  cross-encoder/nli-deberta-v3-large
  cross-encoder/stsb-roberta-base
  cross-encoder/stsb-roberta-large
  deepseek-ai/DeepSeek-R1-Distill-Llama-70B
  deepseek-ai/DeepSeek-R1-Distill-Llama-8B
  deepseek-ai/DeepSeek-R1-Distill-Qwen-14B
  deepseek-ai/DeepSeek-R1-Distill-Qwen-32B
  deepseek-ai/DeepSeek-R1-Distill-Qwen-7B
  facebook/nllb-200-3.3B
  facebook/nllb-200-distilled-

In [3]:
import torch

print(f"🧠 {torch.cuda.device_count()} CUDA device(s) detected:\n")
for i in range(torch.cuda.device_count()):
    print(f"Device {i}: {torch.cuda.get_device_name(i)}")

🧠 4 CUDA device(s) detected:

Device 0: NVIDIA L40S
Device 1: NVIDIA L40S
Device 2: NVIDIA A100 80GB PCIe
Device 3: NVIDIA A100 80GB PCIe


In [6]:
from scripts.llama.multi_label_full.prepare_dataset import prepare_multilabel_dataset
from scripts.llama.multi_label_full.prepare_dataset import prepare_multilabel_dataset_infer

val_prepared = prepare_multilabel_dataset("../data/processed/train-test/val_set.csv")
val_prepared_df = val_prepared.to_pandas()

test_prepared_df = prepare_multilabel_dataset_infer("../data/processed/train-test/test_set.csv")

In [5]:
from scripts.llama.multi_label_full_adverse.prepare_dataset import prepare_adverse_only_dataset, prepare_adverse_only_dataset_infer

val_prepared_adverse = prepare_adverse_only_dataset("../data/processed/train-test/val_set.csv")
val_prepared_adverse_df = val_prepared_adverse.to_pandas()

test_prepared_adverse_df = prepare_adverse_only_dataset_infer("../data/processed/train-test/test_set.csv")

## Two-step pipeline

In [None]:
from scripts.multistep.two_step_pipeline import run_two_step_pipeline

run_two_step_pipeline(
        test_data_file="../data/processed/train-test/test_set.csv",
        roberta_model_dir="../results/model_training/roberta_binary_sdoh/roberta-base_bs16_lr9e-05_20250709_170452/checkpoint-24",
        llama_model_dir="../results/model_training/llama_lora_multi_label_full/Llama-3.1-8B-Instruct_bs8_lr9e-05_epochs6_20250710_164937",
        pos_weight=1.5251,
        output_file="../results/multistep/two_step_predictions.csv"
    )

### Step by step

In [10]:
# STEP 1: Setup + Read Data
import os
import pandas as pd
from scripts.roberta.dataset import is_sdoh_label

os.environ["CUDA_VISIBLE_DEVICES"] = "3"
os.environ["WANDB_MODE"] = "disabled"

test_path = "../data/processed/train-test/test_set.csv"
roberta_model_dir = "../results/model_training/roberta_binary_sdoh/roberta-base_bs16_lr9e-05_20250709_170452/checkpoint-24"
llama_model_dir = "../results/model_training/llama_lora_multi_label_full/Llama-3.1-8B-Instruct_bs8_lr9e-05_epochs6_20250710_164937"
pos_weight = 1.5251

df = pd.read_csv(test_path)
df["binary_label"] = df["completion"].apply(is_sdoh_label)
df.head()

Unnamed: 0,Sentence,label_pair,label_string,completion,binary_label
0,She is able to sit out for XXXX hours between ...,['NoSDoH'],NoSDoH,<LIST>NoSDoH</LIST>,0
1,He is currently treated with Sinemet and Ropin...,['NoSDoH'],NoSDoH,<LIST>NoSDoH</LIST>,0
2,Marker on Essex Wellbeing Record that she was ...,['NoSDoH'],NoSDoH,<LIST>NoSDoH</LIST>,0
3,"She needs help with food , toiletry and some c...","['Finances-Adverse', 'FoodAccess-Adverse']",Finances-Adverse|FoodAccess-Adverse,"<LIST>Finances-Adverse, FoodAccess-Adverse</LIST>",1
4,support to find a cleaning service in communit...,['Housing-Adverse'],Housing-Adverse,<LIST>Housing-Adverse</LIST>,1


In [11]:
# STEP 2: Load RoBERTa and Predict
from transformers import RobertaTokenizer, RobertaConfig, Trainer
from scripts.roberta.dataset import BinarySDoHDataset
from scripts.roberta.model import RobertaBinaryClassifierWithWeight
import torch

tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
config = RobertaConfig.from_pretrained(roberta_model_dir)

model = RobertaBinaryClassifierWithWeight.from_pretrained(
    roberta_model_dir,
    config=config,
    pos_weight=pos_weight
)

dataset = BinarySDoHDataset(df, tokenizer)
trainer = Trainer(model=model, tokenizer=tokenizer)
outputs = trainer.predict(dataset)

  trainer = Trainer(model=model, tokenizer=tokenizer)


In [12]:
# STEP 3: Add RoBERTa Predictions
probs = torch.sigmoid(torch.tensor(outputs.predictions)).numpy().flatten()
y_pred = (probs > 0.5).astype(int)

df["roberta_prob_sdoh"] = probs
df["roberta_pred_sdoh"] = y_pred
df_roberta = df[["Sentence", "completion", "roberta_pred_sdoh", "roberta_prob_sdoh"]].copy()
df_roberta.head()

Unnamed: 0,Sentence,completion,roberta_pred_sdoh,roberta_prob_sdoh
0,She is able to sit out for XXXX hours between ...,<LIST>NoSDoH</LIST>,0,0.451048
1,He is currently treated with Sinemet and Ropin...,<LIST>NoSDoH</LIST>,0,0.044358
2,Marker on Essex Wellbeing Record that she was ...,<LIST>NoSDoH</LIST>,1,0.606594
3,"She needs help with food , toiletry and some c...","<LIST>Finances-Adverse, FoodAccess-Adverse</LIST>",1,0.968946
4,support to find a cleaning service in communit...,<LIST>Housing-Adverse</LIST>,1,0.921323


In [None]:
# STEP 5: Prepare Subset for LLaMA
df_flagged = df_roberta[df_roberta["roberta_pred_sdoh"] == 1].copy()
df_flagged.shape

(113, 4)

In [8]:
# STEP 6: Run LLaMA on Flagged Sentences
from scripts.llama.shared_utils.model import load_lora_llama
from scripts.llama.multi_label_full.prepare_dataset import prepare_multilabel_dataset_infer
from tqdm import tqdm

model, tokenizer = load_lora_llama(
    base_model_path="meta-llama/Llama-3.1-8B-Instruct",
    adapter_path=llama_model_dir,
    cache_dir="/data/resource/huggingface/hub",
    device=0
)

df_prompted = prepare_multilabel_dataset_infer(df_flagged.copy())

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

🔗 Loading LoRA adapters from: ../results/model_training/llama_lora_multi_label_full/Llama-3.1-8B-Instruct_bs8_lr9e-05_epochs6_20250710_164937


In [9]:
# STEP 7: Generate Predictions
def extract_list_output(text):
    start, end = text.find("<LIST>"), text.find("</LIST>")
    return text[start:end+7] if start != -1 and end != -1 else "NO_LIST_FOUND"

def generate_response(prompt):
    device = next(model.parameters()).device
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)
    inputs = {k: v.to(device) for k, v in inputs.items()}

    with torch.no_grad():
        output = model.generate(
            **inputs,
            max_new_tokens=128,
            do_sample=False,
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id
        )

    input_len = inputs["input_ids"].shape[1]
    decoded = tokenizer.decode(output[0][input_len:], skip_special_tokens=True)
    return decoded.strip()

predictions = []
for prompt in tqdm(df_prompted["prompt"]):
    output = generate_response(prompt)
    predictions.append(extract_list_output(output))

df_prompted["generated_completion"] = predictions
df_llama = df_prompted[["Sentence", "generated_completion"]]
df_llama.head()

100%|██████████| 113/113 [01:13<00:00,  1.54it/s]


Unnamed: 0,Sentence,generated_completion
2,Marker on Essex Wellbeing Record that she was ...,<LIST>NoSDoH</LIST>
3,"She needs help with food , toiletry and some c...","<LIST>FoodAccess, Finances</LIST>"
4,support to find a cleaning service in communit...,"<LIST>Housing, Loneliness</LIST>"
5,PERSON has hearing aids & struggles with phone...,<LIST>NoSDoH</LIST>
6,The patient requires the internet to complete ...,<LIST>DigitalInclusion</LIST>


In [None]:
# STEP 8: Merge and Create Final Predictions
df_final = df_roberta.merge(df_llama, on="Sentence", how="left")
df_final["final_prediction"] = df_final.apply(
    lambda row: row["generated_completion"] if row["roberta_pred_sdoh"] == 1 else "<LIST>NoSDoH</LIST>",
    axis=1
)
df_final[["Sentence", "completion", "final_prediction"]].head()