# Curation and creation of data for LLM finetuning

## 0. Setup

In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up when a cell runs, without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [11]:
import os
import pandas as pd
import numpy as np
from pathlib import Path
import sys
import json

# Add the project root (the parent of the current working directory) to the
# Python path so that project modules can be imported from this notebook.
project_root = Path().absolute().parent
# Guard against duplicates: re-running this cell must not keep growing sys.path.
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))

## 1. Annotation train-test split

In [15]:
from src.annotation.parse_annotations_helpers import parse_labelstudio_json
from sklearn.model_selection import train_test_split

round1_labelstudio_path = "../data/processed/annotations/label-studio/label-studio-annotations-2025-06-29-17-28-05ccd4c1.json"

# === Step 0: Clean and group as before ===
df = parse_labelstudio_json(round1_labelstudio_path)
# Keep only rows that carry an SDoH annotation. The explicit .copy() makes the
# filtered frame independent of the parsed one, so the column assignments below
# write to a real frame rather than a slice (avoids SettingWithCopyWarning).
df = df[df["SDoH"].notnull()].copy()
df["SDoH"] = df["SDoH"].str.strip()
# A missing polarity becomes an empty string so later string handling does not hit NaN.
df["Polarity"] = df["Polarity"].fillna("").str.strip()

def make_label(sdoh, polarity):
    """Build the combined label string for one annotation.

    Args:
        sdoh: SDoH category text; surrounding whitespace is ignored.
        polarity: Polarity text; surrounding whitespace is ignored.

    Returns:
        "NoSDoH" when the category, ignoring case and spaces, spells
        "nosdoh"; otherwise "<sdoh>-<polarity>" built from the stripped
        values (an empty polarity therefore yields a trailing hyphen).
    """
    category = sdoh.strip()
    direction = polarity.strip()

    # "No SDoH", "NoSDoH", "no sdoh", ... all collapse to one canonical label.
    collapsed = category.lower().replace(" ", "")
    if collapsed == "nosdoh":
        return "NoSDoH"

    return "-".join((category, direction))

# Split configuration, kept in one place so the stratified and manual splits stay in sync.
SEED = 42
TEST_SIZE = 0.3
SINGLETON_TRAIN_FRACTION = 0.7
OUTPUT_DIR = Path("../data/processed/train-test")


def _to_completion(labels):
    """Format a list of label strings as '<LIST>a, b</LIST>'."""
    return "<LIST>" + ", ".join(labels) + "</LIST>"


# One combined label per annotation row.
df["label_pair"] = df.apply(lambda row: make_label(row["SDoH"], row["Polarity"]), axis=1)

# Group per sentence: each sentence gets the sorted list of its distinct labels.
sentence_labels = (
    df.groupby("Sentence")["label_pair"]
    .apply(lambda x: sorted(set(x)))
    .reset_index()
)
# A single string per label combination, used as the stratification key.
sentence_labels["label_string"] = sentence_labels["label_pair"].apply(lambda x: "|".join(x))

# === Step 1: Separate singleton label groups ===
# Label combinations that occur once are split manually in Step 3 instead of being stratified.
label_counts = sentence_labels["label_string"].value_counts()
singleton_labels = label_counts[label_counts == 1].index

# Split into regular and singleton groups
non_singletons = sentence_labels[sentence_labels["label_string"].isin(label_counts[label_counts > 1].index)]
singletons = sentence_labels[sentence_labels["label_string"].isin(singleton_labels)]

# === Step 2: Stratified split for non-singletons ===
train_ns, test_ns = train_test_split(
    non_singletons,
    test_size=TEST_SIZE,
    stratify=non_singletons["label_string"],
    random_state=SEED
)

# === Step 3: Manual split for singletons (e.g. 70/30) ===
# The shuffle below is seeded through random_state; the global seed is also set, as before.
np.random.seed(SEED)
singleton_shuffled = singletons.sample(frac=1.0, random_state=SEED)
n_singleton_train = int(len(singleton_shuffled) * SINGLETON_TRAIN_FRACTION)

train_singletons = singleton_shuffled.iloc[:n_singleton_train]
test_singletons = singleton_shuffled.iloc[n_singleton_train:]

# === Step 4: Combine final splits ===
train_set = pd.concat([train_ns, train_singletons], ignore_index=True)
test_set = pd.concat([test_ns, test_singletons], ignore_index=True)

# Sanity checks: every sentence lands in exactly one split.
assert len(train_set) + len(test_set) == len(sentence_labels), "split lost or duplicated sentences"
assert set(train_set["Sentence"]).isdisjoint(test_set["Sentence"]), "train/test sentence overlap"

# === Step 5: Format completions for prompting or fine-tuning ===
train_set["completion"] = train_set["label_pair"].apply(_to_completion)
test_set["completion"] = test_set["label_pair"].apply(_to_completion)

# Save the train and test sets
# Create the output directory first; to_csv does not create missing parent directories.
# Note: "label_pair" holds Python lists, which CSV stores as their string representation.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
train_set.to_csv(OUTPUT_DIR / "train_set.csv", index=False)
test_set.to_csv(OUTPUT_DIR / "test_set.csv", index=False)

print(f"Train size: {len(train_set)}, Test size: {len(test_set)}")
print(f"Included {len(singletons)} singleton label combinations (distributed manually)")


Train size: 564, Test size: 243
Included 11 singleton label combinations (distributed manually)


## 2. Curation of manually annotated data

In [None]:
from src.classification.prompt_creation_helpers import create_automated_prompt
from transformers import AutoTokenizer
from datasets import Dataset
import pandas as pd

# === Step 2: Group SDoH-Polarity pairs per sentence ===
def format_label(sdoh, polarity):
    """Build the label string used inside fine-tuning completions.

    Args:
        sdoh: SDoH category text; surrounding whitespace is ignored.
        polarity: Polarity text; surrounding whitespace is ignored.

    Returns:
        "NoSDoH" when the category, ignoring case and spaces, spells
        "nosdoh"; otherwise "<sdoh>-<polarity>" built from the stripped values.
    """
    sdoh = sdoh.strip()
    polarity = polarity.strip()

    # Without this, a "No SDoH" row would become "No SDoH-" and the completion
    # would not be the plain "NoSDoH" label used for such sentences.
    if sdoh.lower().replace(" ", "") == "nosdoh":
        return "NoSDoH"

    return f"{sdoh}-{polarity}"

def _row_label(row):
    """Label string for one annotation row, from its SDoH and Polarity columns."""
    return format_label(row["SDoH"], row["Polarity"])


def _as_completion(labels):
    """Render a sentence's labels as '<LIST>a, b</LIST>' (distinct, sorted)."""
    distinct_sorted = sorted(set(labels))
    return "<LIST>" + ", ".join(distinct_sorted) + "</LIST>"


# Overwrites the "label_pair" column of df in place.
# NOTE(review): this uses every row of df, not only train_set — confirm that
# sentences from the held-out split are meant to be part of this dataset.
df["label_pair"] = df.apply(_row_label, axis=1)

# One completion string per sentence.
completions = df.groupby("Sentence")["label_pair"].apply(_as_completion)
grouped = completions.reset_index().rename(columns={"label_pair": "completion"})

# === Step 3: Generate prompts
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct")


def _build_prompt(sentence):
    """Create the 'five_shot_basic' prompt for one sentence using the loaded tokenizer."""
    return create_automated_prompt(sentence, tokenizer=tokenizer, prompt_type="five_shot_basic")


grouped["prompt"] = grouped["Sentence"].apply(_build_prompt)

# === Step 4: Final dataset
# Only the prompt/completion pair is kept; the raw sentence column is dropped.
finetune_columns = grouped[["prompt", "completion"]]
finetune_dataset = Dataset.from_pandas(finetune_columns)

In [11]:
# Show the first prompt/completion pair as a quick sanity check of the formatting.
first_example = finetune_dataset[0]
for field in ("prompt", "completion"):
    print(first_example[field])

<|begin_of_text|><|start_header_id|>system<|end_header_id|>

You are analyzing a referral note sentence to identify Social Determinants of Health, and classifying them as Adverse or Protective.

Given a sentence, output all SDoH factors that can be inferred from that sentence from the following list: 
Loneliness, Housing, Finances, FoodAccess, Digital, Employment, EnglishProficiency.

Each SDoH must be classified as either "Adverse" or "Protective". 
If the sentence does NOT mention any of the above categories, output <LIST>NoSDoH</LIST>.

Your response must be a comma-separated list of SDoH-Polarity pairs embedded in <LIST> and </LIST> tags.

**STRICT RULES**:
- DO NOT generate any other text, explanations, or new SDoH labels.
- A sentence CAN be labeled with one or more SDoH factors.
- The only accepted format is <LIST>...</LIST>.

EXAMPLES:
Input: "She is unemployed and struggles to pay rent."
Output: <LIST>Employment-Adverse, Finances-Adverse, Housing-Adverse</LIST>

Input: "We are

## 3. Creation of synthetic data