# Curation and creation of data for LLM finetuning

## 0. Setup

In [1]:
# Reload imported modules automatically before each cell runs, so edits to the
# project's helper modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import os
import pandas as pd
from pathlib import Path
import sys
import json

# Add the project root to the Python path so the `src` package can be imported.
# The root is taken to be the parent of the kernel's current working directory.
project_root = Path().absolute().parent
# Guard the append: re-running this cell must not pile up duplicate entries.
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))

## 1. Curation from manually annotated data

In [10]:
from src.annotation.parse_annotations_helpers import parse_labelstudio_json

# Label Studio export for annotation round 1 (path is relative to the kernel's
# working directory).
round1_labelstudio_path = (
    "../data/processed/annotations/label-studio/"
    "label-studio-annotations-2025-06-26-round1.json"
)

from src.classification.prompt_creation_helpers import create_automated_prompt
from transformers import AutoTokenizer
from datasets import Dataset
import pandas as pd

# === Step 1: Load long-form parsed annotations ===
df = parse_labelstudio_json(round1_labelstudio_path)

# Keep only complete rows: SDoH and Polarity both present, and SDoH not blank
# once surrounding whitespace is stripped.
# NOTE(review): a blank-but-non-null Polarity is not filtered here — confirm
# that this is intended.
df = df[
    df["SDoH"].notnull()
    & df["Polarity"].notnull()
    & (df["SDoH"].str.strip() != "")
]

# === Step 2: Group SDoH-Polarity pairs per sentence ===
def format_label(sdoh: str, polarity: str) -> str:
    """Return the "<SDoH>-<Polarity>" label for one annotation.

    Surrounding whitespace is stripped from both parts before they are joined
    with a hyphen.
    """
    return "-".join((sdoh.strip(), polarity.strip()))

# Build one "<SDoH>-<Polarity>" label per annotation row.
# `assign` returns a new frame instead of writing into `df`, which at this point
# is the result of boolean filtering (avoids pandas' SettingWithCopyWarning).
# The comprehension also yields a valid empty column for a zero-row frame, where
# a row-wise `DataFrame.apply` would return an empty DataFrame, not a Series.
df = df.assign(
    label_pair=[
        format_label(sdoh, polarity)
        for sdoh, polarity in zip(df["SDoH"], df["Polarity"])
    ]
)

# One row per sentence: its labels are de-duplicated, sorted, comma-separated
# and wrapped in <LIST>...</LIST> to form the completion string.
grouped = (
    df.groupby("Sentence")["label_pair"]
    .apply(lambda pairs: f"<LIST>{', '.join(sorted(set(pairs)))}</LIST>")
    .rename("completion")
    .reset_index()
)

# === Step 3: Generate prompts
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct")

# Series.apply forwards the extra keyword arguments to the function, so every
# sentence is rendered with the same tokenizer and prompt type.
grouped["prompt"] = grouped["Sentence"].apply(
    create_automated_prompt,
    tokenizer=tokenizer,
    prompt_type="five_shot_basic",
)

# === Step 4: Final dataset
# Only the prompt/completion columns are carried over into the Dataset.
finetune_dataset = Dataset.from_pandas(grouped.loc[:, ["prompt", "completion"]])

In [11]:
# Output example: the first record's prompt, then its completion.
print(finetune_dataset[0]["prompt"], finetune_dataset[0]["completion"], sep="\n")

<|begin_of_text|><|start_header_id|>system<|end_header_id|>

You are analyzing a referral note sentence to identify Social Determinants of Health, and classifying them as Adverse or Protective.

Given a sentence, output all SDoH factors that can be inferred from that sentence from the following list: 
Loneliness, Housing, Finances, FoodAccess, Digital, Employment, EnglishProficiency.

Each SDoH must be classified as either "Adverse" or "Protective". 
If the sentence does NOT mention any of the above categories, output <LIST>NoSDoH</LIST>.

Your response must be a comma-separated list of SDoH-Polarity pairs embedded in <LIST> and </LIST> tags.

**STRICT RULES**:
- DO NOT generate any other text, explanations, or new SDoH labels.
- A sentence CAN be labeled with one or more SDoH factors.
- The only accepted format is <LIST>...</LIST>.

EXAMPLES:
Input: "She is unemployed and struggles to pay rent."
Output: <LIST>Employment-Adverse, Finances-Adverse, Housing-Adverse</LIST>

Input: "We are

## 2. Creation of synthetic data