# Miscellaneous code 

## Setup

In [10]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [11]:
import os
from pathlib import Path
import sys

# Add the project root (the parent of the current working directory) to the
# import path so that the `scripts.*` modules used below can be imported.
project_root = Path().absolute().parent
# Guard the append so that re-running this cell does not pile up duplicate
# entries in sys.path.
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))

## Aesthetics

In [None]:
# IFRC brand colours as hex strings, keyed by a short descriptive name.
# Source: https://brand.ifrc.org/ifrc-brand-system/basics/colour
colour_palette = dict(
    ifrc_red='#EE2435',
    ifrc_darkblue='#011E41',
    dark_green='#009775',
    medium_green='#00AB84',
    light_green='#47D7AC',
    medium_blue='#8DCDE2',
    light_blue='#CCf5FC',
    medium_orange='#FF8200',
    light_orange='#FFB25B',
    medium_purple='#512D6D',
    light_purple='#958DBE',
    grey='#A7A8AA',
)

## Background: cached models & available GPUs ("brains")

In [2]:
# Inspect the shared Hugging Face cache and list every model stored in it.
cache_dir = "/data/resource/huggingface/hub"

# Cache entries are named "models--<org>--<name>"; map them to "<org>/<name>".
# A missing cache directory simply yields an empty list.
cache_entries = os.listdir(cache_dir) if os.path.exists(cache_dir) else []
available_models = [
    entry.replace("models--", "").replace("--", "/")
    for entry in cache_entries
    if entry.startswith("models--")
]

print("Available cached models:")
for model_id in sorted(available_models):
    print(f"  {model_id}")

Available cached models:
  CohereForAI/aya-23-35B
  CohereForAI/aya-23-8B
  CohereForAI/aya-vision-8b
  HuggingFaceTB/SmolLM-135M-Instruct
  LLaMAX/LLaMAX3-8B-Alpaca
  Qwen/Qwen1.5-4B
  Qwen/Qwen2-7B
  Qwen/Qwen2.5-1.5B
  Qwen/Qwen2.5-3B
  Qwen/Qwen2.5-72B-Instruct
  Qwen/Qwen2.5-7B
  Qwen/Qwen2.5-7B-Instruct
  Qwen/Qwen2.5-7B-instruct
  Qwen/Qwen2.5-VL-7B-Instruct
  Qwen/Qwen3-0.6B
  Qwen/Qwen3-8B
  Unbabel/wmt20-comet-qe-da
  Unbabel/wmt22-comet-da
  bert-base-uncased
  bert-large-uncased
  cardiffnlp/twitter-roberta-base-sentiment
  cardiffnlp/twitter-roberta-base-sentiment-latest
  clairebarale/refugee_cases_ner
  cross-encoder/nli-deberta-v3-large
  cross-encoder/stsb-roberta-base
  cross-encoder/stsb-roberta-large
  deepseek-ai/DeepSeek-R1-Distill-Llama-70B
  deepseek-ai/DeepSeek-R1-Distill-Llama-8B
  deepseek-ai/DeepSeek-R1-Distill-Qwen-14B
  deepseek-ai/DeepSeek-R1-Distill-Qwen-32B
  deepseek-ai/DeepSeek-R1-Distill-Qwen-7B
  facebook/nllb-200-3.3B
  facebook/nllb-200-distilled-

In [3]:
import torch

# Report how many CUDA devices PyTorch can see, then name each one.
n_devices = torch.cuda.device_count()
print(f"🧠 {n_devices} CUDA device(s) detected:\n")
for device_idx in range(n_devices):
    print(f"Device {device_idx}: {torch.cuda.get_device_name(device_idx)}")

🧠 4 CUDA device(s) detected:

Device 0: NVIDIA L40S
Device 1: NVIDIA L40S
Device 2: NVIDIA A100 80GB PCIe
Device 3: NVIDIA A100 80GB PCIe


## Checking prompts and training data prep

In [None]:
from scripts.llama.multilabel_direct.prepare_dataset import prepare_multilabel_dataset, prepare_multilabel_dataset_infer

# Prepare the validation split with the multilabel helper and convert the
# result to a pandas DataFrame for inspection.
val_prepared = prepare_multilabel_dataset("../data/processed/train-test/val_set.csv")
val_prepared_df = val_prepared.to_pandas()

# Inference-time variant for the test split.
# NOTE(review): a CSV path is passed here, whereas the two-step walkthrough
# further down passes a DataFrame to the same helper — confirm it accepts both.
test_prepared_df = prepare_multilabel_dataset_infer("../data/processed/train-test/test_set.csv")

In [5]:
from scripts.llama.multilabel_direct_adverse.prepare_dataset import prepare_adverse_only_dataset, prepare_adverse_only_dataset_infer

# Same inspection as for the multilabel variant, using the adverse-only helpers:
# prepare the validation split and convert it to a pandas DataFrame.
val_prepared_adverse = prepare_adverse_only_dataset("../data/processed/train-test/val_set.csv")
val_prepared_adverse_df = val_prepared_adverse.to_pandas()

# Inference-time variant for the test split (called with a CSV path).
test_prepared_adverse_df = prepare_adverse_only_dataset_infer("../data/processed/train-test/test_set.csv")

## Two-step pipeline

In [None]:
from scripts.multistep.two_step_pipeline import run_two_step_pipeline

# End-to-end run of the two-step pipeline on the test set, using the fine-tuned
# RoBERTa checkpoint and the LLaMA LoRA run directory given below.
# Presumably the predictions are written to `output_file` — confirm in
# scripts.multistep.two_step_pipeline.
run_two_step_pipeline(
        test_data_file="../data/processed/train-test/test_set.csv",
        roberta_model_dir="../results/model_training/roberta_binary_sdoh/roberta-base_bs16_lr9e-05_20250709_170452/checkpoint-24",
        llama_model_dir="../results/model_training/llama_lora_multi_label_full/Llama-3.1-8B-Instruct_bs8_lr9e-05_epochs6_20250710_164937",
        pos_weight=1.5251,
        output_file="../results/multistep/two_step_predictions.csv"
    )

### Step by step

In [10]:
# STEP 1: Setup + Read Data
import os
import pandas as pd
from scripts.roberta.dataset import is_sdoh_label

# Restrict this process to GPU index 3 and disable Weights & Biases logging.
# NOTE(review): torch was already imported and queried for CUDA devices in an
# earlier cell; CUDA_VISIBLE_DEVICES normally has to be set before CUDA is
# initialised to take effect — confirm the intended GPU is actually used.
os.environ["CUDA_VISIBLE_DEVICES"] = "3"
os.environ["WANDB_MODE"] = "disabled"

# Inputs for the step-by-step walkthrough (same values as the one-shot
# run_two_step_pipeline call above).
test_path = "../data/processed/train-test/test_set.csv"
roberta_model_dir = "../results/model_training/roberta_binary_sdoh/roberta-base_bs16_lr9e-05_20250709_170452/checkpoint-24"
llama_model_dir = "../results/model_training/llama_lora_multi_label_full/Llama-3.1-8B-Instruct_bs8_lr9e-05_epochs6_20250710_164937"
pos_weight = 1.5251

# Load the test set and derive a binary label from the gold completion
# (in the displayed rows: 0 for NoSDoH, 1 otherwise).
df = pd.read_csv(test_path)
df["binary_label"] = df["completion"].apply(is_sdoh_label)
df.head()

Unnamed: 0,Sentence,label_pair,label_string,completion,binary_label
0,She is able to sit out for XXXX hours between ...,['NoSDoH'],NoSDoH,<LIST>NoSDoH</LIST>,0
1,He is currently treated with Sinemet and Ropin...,['NoSDoH'],NoSDoH,<LIST>NoSDoH</LIST>,0
2,Marker on Essex Wellbeing Record that she was ...,['NoSDoH'],NoSDoH,<LIST>NoSDoH</LIST>,0
3,"She needs help with food , toiletry and some c...","['Finances-Adverse', 'FoodAccess-Adverse']",Finances-Adverse|FoodAccess-Adverse,"<LIST>Finances-Adverse, FoodAccess-Adverse</LIST>",1
4,support to find a cleaning service in communit...,['Housing-Adverse'],Housing-Adverse,<LIST>Housing-Adverse</LIST>,1


In [11]:
# STEP 2: Load RoBERTa and Predict
from transformers import RobertaTokenizer, RobertaConfig, Trainer
from scripts.roberta.dataset import BinarySDoHDataset
from scripts.roberta.model import RobertaBinaryClassifierWithWeight
import torch

# NOTE: the tokenizer is loaded from the stock "roberta-base" checkpoint, while
# the config and weights come from the fine-tuned checkpoint directory.
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
config = RobertaConfig.from_pretrained(roberta_model_dir)

# pos_weight is forwarded to the custom classifier — presumably the
# positive-class weight of its loss; confirm in scripts.roberta.model.
model = RobertaBinaryClassifierWithWeight.from_pretrained(
    roberta_model_dir,
    config=config,
    pos_weight=pos_weight
)

# Wrap the test frame as a dataset; Trainer is used here only to run prediction.
dataset = BinarySDoHDataset(df, tokenizer)
# NOTE(review): the cell output echoes the next line as a warning — possibly the
# `tokenizer=` argument is deprecated in the installed transformers version; confirm.
trainer = Trainer(model=model, tokenizer=tokenizer)
outputs = trainer.predict(dataset)

  trainer = Trainer(model=model, tokenizer=tokenizer)


In [12]:
# STEP 3: Add RoBERTa Predictions
# Turn the raw model outputs into probabilities, then threshold at 0.5.
logits = torch.tensor(outputs.predictions)
probs = torch.sigmoid(logits).numpy().flatten()
y_pred = (probs > 0.5).astype(int)

# Attach both columns to the working frame and keep a trimmed copy for the
# later steps.
df["roberta_prob_sdoh"], df["roberta_pred_sdoh"] = probs, y_pred
roberta_cols = ["Sentence", "completion", "roberta_pred_sdoh", "roberta_prob_sdoh"]
df_roberta = df.loc[:, roberta_cols].copy()
df_roberta.head()

Unnamed: 0,Sentence,completion,roberta_pred_sdoh,roberta_prob_sdoh
0,She is able to sit out for XXXX hours between ...,<LIST>NoSDoH</LIST>,0,0.451048
1,He is currently treated with Sinemet and Ropin...,<LIST>NoSDoH</LIST>,0,0.044358
2,Marker on Essex Wellbeing Record that she was ...,<LIST>NoSDoH</LIST>,1,0.606594
3,"She needs help with food , toiletry and some c...","<LIST>Finances-Adverse, FoodAccess-Adverse</LIST>",1,0.968946
4,support to find a cleaning service in communit...,<LIST>Housing-Adverse</LIST>,1,0.921323


In [None]:
# STEP 5: Prepare Subset for LLaMA
# Keep only the sentences RoBERTa flagged as SDoH (prediction == 1).
is_flagged = df_roberta["roberta_pred_sdoh"].eq(1)
df_flagged = df_roberta.loc[is_flagged].copy()
df_flagged.shape

(113, 4)

In [None]:
# STEP 6: Run LLaMA on Flagged Sentences
from scripts.llama.shared_utils.model import load_lora_llama
from scripts.llama.multilabel_direct.prepare_dataset import prepare_multilabel_dataset_infer
from tqdm import tqdm

# Load the base LLaMA model together with the fine-tuned LoRA adapter.
# NOTE: this rebinds `model` and `tokenizer`, replacing the RoBERTa objects
# created in STEP 2 — re-run STEP 2 before predicting with RoBERTa again.
# NOTE(review): `device=0` is presumably relative to CUDA_VISIBLE_DEVICES;
# confirm which physical GPU this resolves to.
model, tokenizer = load_lora_llama(
    base_model_path="meta-llama/Llama-3.1-8B-Instruct",
    adapter_path=llama_model_dir,
    cache_dir="/data/resource/huggingface/hub",
    device=0
)

# Build the inference inputs for the flagged sentences; a copy is passed so
# that df_flagged itself is not handed to the helper.
df_prompted = prepare_multilabel_dataset_infer(df_flagged.copy())

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

🔗 Loading LoRA adapters from: ../results/model_training/llama_lora_multi_label_full/Llama-3.1-8B-Instruct_bs8_lr9e-05_epochs6_20250710_164937


In [9]:
# STEP 7: Generate Predictions
def extract_list_output(text):
    """Return the first complete ``<LIST>...</LIST>`` span found in ``text``.

    The closing tag is searched for only *after* the opening tag, so a stray
    ``</LIST>`` that precedes ``<LIST>`` can no longer produce an empty or
    garbled slice.

    Args:
        text (str): Decoded model output.

    Returns:
        str: The span including both tags, or ``"NO_LIST_FOUND"`` when the
        text contains no opening tag followed by a closing tag.
    """
    open_tag, close_tag = "<LIST>", "</LIST>"

    start = text.find(open_tag)
    if start == -1:
        return "NO_LIST_FOUND"

    # Start looking for the closing tag right after the opening one.
    end = text.find(close_tag, start + len(open_tag))
    if end == -1:
        return "NO_LIST_FOUND"

    return text[start:end + len(close_tag)]

def generate_response(prompt):
    """Greedy-decode a completion for ``prompt`` and return only the new text.

    Uses the notebook-level ``model`` and ``tokenizer``. The prompt is
    truncated to 512 tokens, at most 128 new tokens are generated without
    sampling, and the prompt tokens are stripped from the decoded result.

    Args:
        prompt (str): Full prompt text to feed to the model.

    Returns:
        str: The generated continuation, whitespace-stripped.
    """
    target_device = next(model.parameters()).device
    encoded = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)
    encoded = {name: tensor.to(target_device) for name, tensor in encoded.items()}
    prompt_len = encoded["input_ids"].shape[1]

    generation_kwargs = dict(
        max_new_tokens=128,
        do_sample=False,
        pad_token_id=tokenizer.eos_token_id,
        eos_token_id=tokenizer.eos_token_id,
    )
    with torch.no_grad():
        generated = model.generate(**encoded, **generation_kwargs)

    # Drop the prompt tokens so that only newly generated text is decoded.
    new_tokens = generated[0][prompt_len:]
    return tokenizer.decode(new_tokens, skip_special_tokens=True).strip()

# Generate one completion per prompt and keep only its <LIST>...</LIST> part.
predictions = [
    extract_list_output(generate_response(prompt))
    for prompt in tqdm(df_prompted["prompt"])
]

df_prompted["generated_completion"] = predictions
df_llama = df_prompted.loc[:, ["Sentence", "generated_completion"]]
df_llama.head()

100%|██████████| 113/113 [01:13<00:00,  1.54it/s]


Unnamed: 0,Sentence,generated_completion
2,Marker on Essex Wellbeing Record that she was ...,<LIST>NoSDoH</LIST>
3,"She needs help with food , toiletry and some c...","<LIST>FoodAccess, Finances</LIST>"
4,support to find a cleaning service in communit...,"<LIST>Housing, Loneliness</LIST>"
5,PERSON has hearing aids & struggles with phone...,<LIST>NoSDoH</LIST>
6,The patient requires the internet to complete ...,<LIST>DigitalInclusion</LIST>


In [None]:
# STEP 8: Merge and Create Final Predictions
# The merge key is the sentence text, so a sentence that occurs more than once
# would multiply rows in a plain left merge. Collapse the LLaMA output to one
# row per sentence first; `validate` makes pandas raise if the key is still
# not unique on the right-hand side.
# Assumes identical sentences receive identical completions (generation above
# uses do_sample=False) — confirm the prompt depends only on the sentence.
llama_by_sentence = df_llama.drop_duplicates(subset="Sentence")
df_final = df_roberta.merge(
    llama_by_sentence, on="Sentence", how="left", validate="many_to_one"
)

# Sentences RoBERTa did not flag never reached LLaMA, so they get NoSDoH.
df_final["final_prediction"] = df_final["generated_completion"].where(
    df_final["roberta_pred_sdoh"] == 1, "<LIST>NoSDoH</LIST>"
)
df_final[["Sentence", "completion", "final_prediction"]].head()

## Create toy referrals dataset for proof-of-concept code

In [1]:
import pandas as pd
import numpy as np
from faker import Faker
import random
import os

# Create output directory if it doesn't exist
output_path = "../data/processed/brc-cleaned"
os.makedirs(output_path, exist_ok=True)
output_file = os.path.join(output_path, "toy_referrals.csv")

# Initialize faker and seed every random source used below so that the
# generated file is the same on each run.
fake = Faker()
Faker.seed(42)
np.random.seed(42)
random.seed(42)

# Define synthetic SDoH phrases. Each one is a single sentence ending in a
# full stop; notes are built by joining a few of them with spaces.
sdoh_sentences = [
    "She is struggling with paying her rent and buying food.",
    "He feels very isolated and does not leave his home.",
    "The client mentioned she needs help finding a job.",
    "He is unable to use digital services on his own.",
    "She cannot understand written English very well.",
    "He has no money for transportation to the food bank.",
    "The client reported sleeping on a friend's couch.",
    "She is fluent in English and volunteers regularly.",
    "He owns a home and has a stable income.",
    "The client is digitally literate and employed full-time.",
    "She receives universal credit and struggles with bills.",
    "He asked for help accessing community food support."
]

# Generate 20 synthetic referrals.
# NOTE: every field is drawn independently, so related fields (e.g. the
# date_range_* columns, or "Has Risk" vs "Risk Type") are not guaranteed to be
# mutually consistent. The order of the random calls determines the output, so
# reordering the keys below changes the generated data.
rows = []
for i in range(20):
    num_sents = random.randint(1, min(4, len(sdoh_sentences)))  # 1-4 sentences, never more than are available
    note = " ".join(random.sample(sdoh_sentences, k=num_sents))  # sample without replacement: no repeated sentence within a note

    row = {
        "Area": fake.city(),
        "Scheme": fake.word(),
        "Case Reference": fake.uuid4(),
        "Assessment Result": random.choice(["Needs Met", "Needs Unmet", "Ongoing"]),
        "Case Status": random.choice(["Open", "Closed", "Pending"]),
        "Referral Date/Time": fake.date_time_this_year().isoformat(),
        "End Date Case": fake.date_this_year().isoformat(),
        "Has Disability": random.choice(["Yes", "No"]),
        "Has Risk": random.choice(["Yes", "No"]),
        "Risk Type": random.choice(["Mental Health", "Domestic", "Mobility", None]),
        "Unique Case": fake.uuid4(),
        "IMD_decile": random.randint(1, 10),
        "Country": random.choice(["England", "Wales", "Scotland"]),
        "Age": random.randint(18, 90),
        "Gender": random.choice(["Male", "Female", "Other"]),
        "Ethnicity": random.choice(["White", "Black", "Asian", "Mixed", "Other"]),
        "Disability": random.choice(["None", "Hearing", "Visual", "Mobility"]),
        "Living Arrangements": random.choice(["Alone", "With family", "With partner"]),
        "Referral Notes (depersonalised)": note,
        "case_ref": f"toy_{i:04d}",
        "num_observations": random.randint(1, 10),
        "date_range_start": fake.date_this_year().isoformat(),
        "date_range_end": fake.date_this_year().isoformat(),
        "date_range_days": random.randint(1, 120),
        "referral_date": fake.date_this_year().isoformat()
    }
    rows.append(row)

# Save to CSV (one row per synthetic referral, no index column)
df = pd.DataFrame(rows)
df.to_csv(output_file, index=False)
print(f"✅ Saved synthetic dataset to {output_file}")

✅ Saved synthetic dataset to ../data/processed/brc-cleaned/toy_referrals.csv


## Decomposition of the inference over all referrals

In [12]:
import os
import argparse
import pandas as pd
from tqdm import tqdm
import math
from datetime import datetime

os.environ["CUDA_VISIBLE_DEVICES"] = "3"  # Use nvtop Device 1 (A100)

from scripts.multistep_adverse.two_step_pipeline import run_two_step_pipeline

def sentence_splitter(text):
    """
    Safely split a note into non-empty sentences. Returns [] if text is not a string.

    The note is split on '.', each piece is whitespace-stripped, and empty
    pieces are dropped. Non-string input (e.g. None or a NaN from pandas)
    yields an empty list instead of raising.
    """
    # The type check must be on the whole note: checking the pieces after
    # calling .split() is too late, because .split() itself fails on non-strings.
    if not isinstance(text, str):
        return []
    return [piece.strip() for piece in text.split('.') if piece.strip()]

def prepare_batched_csvs(referral_path, output_dir, batch_size):
    """
    Split the referral CSV into fixed-size batch files, preserving full notes.

    Rows with a null "Referral Notes (depersonalised)" are dropped, only
    ``case_ref`` and the note (renamed to ``referral_note``) are kept, and the
    remaining rows are written in their original order to ``batch_000.csv``,
    ``batch_001.csv``, ... inside ``output_dir``.

    Note: batching is by row. No de-duplication on ``case_ref`` is performed,
    so a case that spans several rows can be split across batches.

    Args:
        referral_path (str): Path to CSV file with referral data.
        output_dir (str): Directory to save batch CSVs (created if missing).
        batch_size (int): Number of rows per batch; must be at least 1.

    Returns:
        List[str]: List of file paths to the saved batch CSVs.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    # Reject sizes that would otherwise divide by zero or silently produce
    # no batches at all.
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    os.makedirs(output_dir, exist_ok=True)

    notes_col = "Referral Notes (depersonalised)"
    df = pd.read_csv(referral_path)

    # Keep only rows that have a note, and only the columns needed for inference.
    df = df.loc[df[notes_col].notna(), ["case_ref", notes_col]].reset_index(drop=True)
    df = df.rename(columns={notes_col: "referral_note"})

    batch_paths = []
    for batch_idx, start in enumerate(range(0, len(df), batch_size)):
        batch_path = os.path.join(output_dir, f"batch_{batch_idx:03d}.csv")
        df.iloc[start:start + batch_size].to_csv(batch_path, index=False)
        batch_paths.append(batch_path)

    return batch_paths

In [13]:
# A second-resolution timestamp gives each run of this cell its own output folder.
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
base_output_dir = os.path.join("../results/inference/full_referrals", f"test_{timestamp}")
os.makedirs(base_output_dir, exist_ok=True)

# Same path that the toy-referrals cell earlier in this notebook writes to.
referral_path = "../data/processed/brc-cleaned/toy_referrals.csv"

# Split the referrals into batch files of 5 rows each under <base_output_dir>/batches.
batch_paths = prepare_batched_csvs(referral_path, os.path.join(base_output_dir, "batches"), 5)

test_batch_path = batch_paths[0]  # Use the first batch for testing

In [14]:
# Destination for this batch's predictions; the batch index is hard-coded to 0
# because only the first batch is exercised in this walkthrough.
out_path = os.path.join(base_output_dir, f"predictions_batch_{0:03d}.csv")

# Load case-level batch (one row per referral: case_ref + referral_note)
df_batch = pd.read_csv(test_batch_path)

In [15]:
# Explode each referral note into one record per sentence, tagged with the
# case it came from.
all_sentences = [
    {"case_ref": row["case_ref"], "Sentence": sentence}
    for _, row in df_batch.iterrows()
    for sentence in sentence_splitter(row["referral_note"])
]
df_sentences = pd.DataFrame(all_sentences)

In [None]:
# Save the sentence-level frame to a temp file, since the pipeline takes a
# file path as input.
temp_path = os.path.join(base_output_dir, f"temp_sentences_batch_{0:03d}.csv")
df_sentences.to_csv(temp_path, index=False)

print(f"✅ Saved batched sentences to {temp_path}")

# Run two-step model. The temp file is removed in `finally` so that it is
# cleaned up even when the pipeline raises; after this cell the file no longer
# exists, so later cells must not try to re-read `temp_path`.
try:
    run_two_step_pipeline(
        data_file=temp_path,
        roberta_model_dir="../results/model_training/roberta_binary/best_model/roberta-base_bs4_lr7e-05_20250726_140551/checkpoint-66",
        llama_model_dir="../results/model_training/llama_multilabel_direct_adverse/best_model/Llama-3.1-8B-Instruct_bs8_lr3e-05_epochs6_20250726_031804",
        pos_weight=1.1757,
        output_file=out_path
    )
finally:
    os.remove(temp_path)  # clean up

In [18]:
import os
import pandas as pd

os.environ["CUDA_VISIBLE_DEVICES"] = "3"  # Use nvtop Device 1 (A100)
os.environ["WANDB_MODE"] = "disabled"

import torch
from transformers import RobertaTokenizer, RobertaConfig, Trainer
from tqdm import tqdm

from scripts.roberta.dataset import BinarySDoHDataset, is_sdoh_label
from scripts.roberta.model import RobertaBinaryClassifierWithWeight

from scripts.llama.shared_utils.model import load_lora_llama
from scripts.llama.multilabel_direct_adverse.prepare_dataset import prepare_adverse_only_dataset_infer, strip_protective_labels

# Load the sentence-level data for this batch.
# Use the in-memory frame instead of re-reading `temp_path`: the cell that
# writes that file also deletes it once the pipeline has run, so reading it
# here fails on a top-to-bottom run of the notebook.
df = df_sentences.copy()
if "completion" in df.columns:
    # Gold labels are only present for evaluation data, not for raw referrals.
    df["binary_label"] = df["completion"].apply(is_sdoh_label)

# Fine-tuned RoBERTa checkpoint that provides both the config and the weights.
roberta_ckpt_dir = "../results/model_training/roberta_binary/best_model/roberta-base_bs4_lr7e-05_20250726_140551/checkpoint-66"

# The tokenizer comes from the stock "roberta-base" checkpoint; only the config
# is read from the trained model directory.
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
config = RobertaConfig.from_pretrained(roberta_ckpt_dir)

# Load model with pos_weight
model = RobertaBinaryClassifierWithWeight.from_pretrained(
    roberta_ckpt_dir,
    config=config,
    pos_weight=1.1757
)


In [20]:
# Dataset + Trainer (the Trainer is used here only to run prediction)
dataset = BinarySDoHDataset(df, tokenizer)
trainer = Trainer(model=model, tokenizer=tokenizer)
outputs = trainer.predict(dataset)

# Sigmoid over the raw outputs, then flag everything above the 0.4 threshold
# (lower than the 0.5 used in the earlier walkthrough)
logits = torch.tensor(outputs.predictions)
probs = torch.sigmoid(logits).numpy().flatten()
y_pred = (probs > 0.4).astype(int)

# Store the probability and the hard prediction alongside each sentence
df["roberta_prob_sdoh"], df["roberta_pred_sdoh"] = probs, y_pred

  trainer = Trainer(model=model, tokenizer=tokenizer)


In [26]:
from scripts.multistep_adverse.two_step_pipeline import run_llama_on_flagged_sentences

# Step 2: LLaMA
# Only the sentences RoBERTa flagged (prediction == 1) are passed on; the
# helper is given the LoRA run directory to load.
# NOTE(review): the merge in the next cell expects a `generated_completion`
# column (and optionally `case_ref`) in the returned frame — confirm.
llama_df = run_llama_on_flagged_sentences(
    df_flagged=df[df["roberta_pred_sdoh"] == 1],
    model_dir="../results/model_training/llama_multilabel_direct_adverse/best_model/Llama-3.1-8B-Instruct_bs8_lr3e-05_epochs6_20250726_031804"
)



Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

🔗 Loading LoRA adapters from: ../results/model_training/llama_multilabel_direct_adverse/best_model/Llama-3.1-8B-Instruct_bs8_lr3e-05_epochs6_20250726_031804


LLaMA predictions: 100%|██████████| 11/11 [00:09<00:00,  1.21it/s]


In [27]:
# Merge and fill
# Join on the sentence text, plus case_ref when both frames carry it, so that
# the same sentence in two different cases is kept apart.
merge_cols = ["Sentence"]
if "case_ref" in df.columns and "case_ref" in llama_df.columns:
    merge_cols.insert(0, "case_ref")

# A key that occurs more than once on the LLaMA side would multiply rows in the
# left merge. Collapse to one row per key first; `validate` makes pandas raise
# if the right-hand key is still not unique.
# Assumes rows sharing a key carry the same completion — confirm in
# run_llama_on_flagged_sentences.
llama_unique = llama_df.drop_duplicates(subset=merge_cols)
final_df = df.merge(llama_unique, on=merge_cols, how="left", validate="many_to_one")

# Sentences RoBERTa did not flag never reached LLaMA, so they get NoSDoH.
final_df["final_prediction"] = final_df["generated_completion"].where(
    final_df["roberta_pred_sdoh"] == 1, "<LIST>NoSDoH</LIST>"
)