# Stage 5.2 – Key Moment Recall Pilot
This notebook prepares the Method 2 scaffolding for the key-moment recall workflow (Stage 5.2). It mirrors the Stage 5.1 logic while re-pointing long-form respondents to the short-form key-moment event lists.

## Workflow Overview
1. Load Stage 5.1 assets (model events plus recall responses).
2. Derive an Abbott Elementary pilot sample (3 long-form, 3 short-form).
3. Generate the revised key-moment prompts for manual review (Step 3.2).
4. Execute LLM scoring only after approval (Steps 3.3-3.6).

In [1]:
import os
import time
from pathlib import Path
import json
import re
import textwrap
from typing import Dict, List, Tuple, Optional

import numpy as np
import pandas as pd

try:
    from openai import OpenAI
except ImportError:
    OpenAI = None


def find_project_root() -> Path:
    candidates = [Path.cwd().resolve(), *Path.cwd().resolve().parents]
    for candidate in candidates:
        if (candidate / "analysis" / "assemble_uv.ipynb").exists():
            return candidate
    raise FileNotFoundError("Unable to locate project root (missing analysis/assemble_uv.ipynb).")


PROJECT_ROOT = find_project_root()
DATA_DIR = PROJECT_ROOT / "data"
RESULTS_DIR = PROJECT_ROOT / "results"
KEY_MOMENT_DIR = PROJECT_ROOT / "recall_openended" / "key moment"
PILOT_OUTPUT_DIR = KEY_MOMENT_DIR / "pilot_outputs"
PILOT_OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

MODEL_EVENTS_PATH = DATA_DIR / "model_answers_events.md"
RECALL_SOURCE_PATH = RESULTS_DIR / "uv_open_ended_long_recall.csv"
PILOT_OUTPUT_PATH = PILOT_OUTPUT_DIR / "recall_coded_responses_key_moment_pilot.csv"

OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
openai_client = OpenAI(api_key=OPENAI_API_KEY) if (OpenAI and OPENAI_API_KEY) else None

pd.set_option("display.max_colwidth", 200)

print(f"Project root: {PROJECT_ROOT}")
print(f"Recall source path: {RECALL_SOURCE_PATH.relative_to(PROJECT_ROOT)}")
print(f"Pilot output path: {PILOT_OUTPUT_PATH.relative_to(PROJECT_ROOT)}")
print(f"OpenAI client available: {openai_client is not None}")

Project root: C:\Users\ashra\Documents\NeuralSense\NeuralData\clients\544_WBD_CXCU
Recall source path: results\uv_open_ended_long_recall.csv
Pilot output path: recall_openended\key moment\pilot_outputs\recall_coded_responses_key_moment_pilot.csv
OpenAI client available: True


In [22]:
def normalise_title(title: str) -> str:
    if not isinstance(title, str):
        return ""
    cleaned = re.sub(r"\s+", " ", title.strip()).lower()
    cleaned = cleaned.replace("–", "-").replace("—", "-")
    cleaned = cleaned.replace(":", "")
    return cleaned


def normalise_form(form: str) -> str:
    if not isinstance(form, str):
        return ""
    cleaned = form.strip().lower().replace("–", "-").replace("—", "-")
    cleaned = re.sub(r"\s+", " ", cleaned)
    cleaned = cleaned.replace(" form", "")
    alias_map = {
        "long": "long",
        "long-form": "long",
        "longform": "long",
        "lf": "long",
        "short": "short",
        "short-form": "short",
        "shortform": "short",
        "sf": "short",
    }
    return alias_map.get(cleaned, cleaned)


def parse_model_events(path: Path) -> Dict[Tuple[str, str], List[str]]:
    text = path.read_text(encoding="utf-8")
    # Match headers like "## Abbot Elementary – Long Form" or "## Mad Max - Short Form"
    header_pattern = re.compile(r"^##\s*(.+?)\s*[-–—]\s*(.+?)\s*$", re.MULTILINE)
    matches = list(header_pattern.finditer(text))
    sections: Dict[Tuple[str, str], List[str]] = {}
    if not matches:
        raise ValueError("No section headers found in model_answers_events.md")
    for idx, match in enumerate(matches):
        title_raw, form_raw = match.group(1).strip(), match.group(2).strip()
        start = match.end()
        end = matches[idx + 1].start() if idx + 1 < len(matches) else len(text)
        section_text = text[start:end]
        events = [
            evt.strip()
            for evt in re.findall(r"^\s*\d+\.\s+(.*)$", section_text, re.MULTILINE)
            if evt.strip()
        ]
        key = (normalise_title(title_raw), normalise_form(form_raw))
        sections[key] = events
    return sections


def load_recall_responses(path: Path) -> pd.DataFrame:
    try:
        df = pd.read_csv(path)
    except UnicodeDecodeError:
        with path.open("r", encoding="cp1252", errors="ignore") as fh:
            df = pd.read_csv(fh)
    df.columns = [col.strip().lower() for col in df.columns]
    column_aliases = {
        "format": "form",
        "media_format": "form",
        "media_form": "form",
        "content_form": "form",
    }
    for candidate, target in column_aliases.items():
        if candidate in df.columns and target not in df.columns:
            df = df.rename(columns={candidate: target})
    if "id" not in df.columns:
        df.insert(0, "id", range(len(df)))
    required = {"id", "title", "form", "respondent", "response"}
    missing = required - set(df.columns)
    if missing:
        raise KeyError(f"Recall dataset missing required columns: {sorted(missing)}")
    df["title"] = df["title"].astype(str).str.strip()
    df["form"] = df["form"].astype(str).str.strip()
    df["respondent"] = df["respondent"].astype(str).str.strip()
    df["response"] = df["response"].fillna("").astype(str)
    if "question_code" in df.columns:
        df["question_code"] = df["question_code"].astype(str).str.strip()
    if "questionnaire" in df.columns:
        df["questionnaire"] = df["questionnaire"].astype(str).str.strip()
    df["title_key"] = df["title"].apply(normalise_title)
    df["form_key"] = df["form"].apply(normalise_form)
    return df


ABBOTT_TITLE_KEY = normalise_title("Abbot Elementary")

In [23]:
model_events_lookup = parse_model_events(MODEL_EVENTS_PATH)
recall_df = load_recall_responses(RECALL_SOURCE_PATH)

# Normalize Abbott/Abbot spelling variations to match model events
recall_df['title'] = recall_df['title'].replace({
    'Abbot Elementary': 'Abbot Elementary',
    'Abbott Elementary': 'Abbot Elementary'
})
recall_df['title_key'] = recall_df['title'].apply(normalise_title)

print(f"Loaded model events for {len(model_events_lookup)} title/form combinations.")
print(f"Total recall responses: {len(recall_df)}")
print(f"Forms present: {sorted(recall_df['form_key'].unique())}")
abbott_mask = recall_df['title_key'] == ABBOTT_TITLE_KEY
print(f"Abbott Elementary responses: {int(abbott_mask.sum())}")
recall_df.head(3)

Loaded model events for 6 title/form combinations.
Total recall responses: 162
Forms present: ['long', 'short']
Abbott Elementary responses: 58


Unnamed: 0,id,respondent,group,questionnaire,question_code,question,form,title,response,title_key,form_key
0,0,116,A,Post,Q13,Recall,Long,Abbot Elementary,The black male actor lied about liking pizza and said he would only eat a certain type which he thought didn’t exist. He described it as wet. The white male actor drove to the city where the pizza...,abbot elementary,long
1,1,107,F,Post,Q13,Recall,Long,Abbot Elementary,The guy grabbing the pizza in reality doesn’t like pizza but he is ashamed to admit at first because he doesn’t want to be judged. So he made up a fake pizza place from Baltimore that is extra sog...,abbot elementary,long
2,2,109,F,Post,Q13,Recall,Long,Abbot Elementary,"Greg, Jacob and staff were having lunch in the break room and the group started talking about pizza and decided that they should have a pizza contest where everyone brings their favorite pizza for...",abbot elementary,long


In [24]:
def build_pilot_sample(responses: pd.DataFrame, title_key: str, *, n_long: int = 3, n_short: int = 3, seed: int = 20251111) -> pd.DataFrame:
    target = responses.loc[responses["title_key"] == title_key].copy()
    if target.empty:
        raise ValueError(f"No responses found for title key '{title_key}'.")
    long_pool = target[target["form_key"] == "long"]
    short_pool = target[target["form_key"] == "short"]
    if long_pool.empty:
        raise ValueError("No long-form respondents available for the pilot selection.")
    if short_pool.empty:
        raise ValueError("No short-form respondents available for the pilot selection.")
    long_sample = long_pool.sample(n=min(n_long, len(long_pool)), random_state=seed, replace=False)
    short_sample = short_pool.sample(n=min(n_short, len(short_pool)), random_state=seed + 1, replace=False)
    pilot = pd.concat([long_sample, short_sample], ignore_index=True)
    pilot = pilot.sort_values(["form_key", "respondent", "id"]).reset_index(drop=True)
    pilot["pilot_order"] = range(1, len(pilot) + 1)
    return pilot

In [25]:
PILOT_RANDOM_SEED = 20251111
pilot_df = build_pilot_sample(recall_df, ABBOTT_TITLE_KEY, seed=PILOT_RANDOM_SEED)

summary = pilot_df.groupby("form_key").size().to_dict()
print(f"Pilot sample size: {len(pilot_df)}")
print(f"Pilot composition (by form): {summary}")
pilot_preview_cols = ["pilot_order", "id", "respondent", "form", "title", "question_code"]
pilot_df.loc[:, pilot_preview_cols]

Pilot sample size: 6
Pilot composition (by form): {'long': 3, 'short': 3}


Unnamed: 0,pilot_order,id,respondent,form,title,question_code
0,1,6,41,Long,Abbot Elementary,Q13
1,2,90,62,Long,Abbot Elementary,Q18
2,3,12,85,Long,Abbot Elementary,Q13
3,4,45,108,Short,Abbot Elementary,Q13
4,5,48,40,Short,Abbot Elementary,Q13
5,6,50,69,Short,Abbot Elementary,Q13


In [26]:
PILOT_SAMPLE_PATH = PILOT_OUTPUT_DIR / "keymoment_pilot_selection.csv"
pilot_df.to_csv(PILOT_SAMPLE_PATH, index=False)
print(f"Pilot selection persisted to {PILOT_SAMPLE_PATH.relative_to(PROJECT_ROOT)}")

Pilot selection persisted to recall_openended\key moment\pilot_outputs\keymoment_pilot_selection.csv


## Prompt Scaffolding

In [34]:
SYSTEM_PROMPT = textwrap.dedent(
    """
    You are an expert qualitative coder focusing on key-moment recall for media research.
    Scoring guidelines:
    - The MODEL EVENTS describe the short-form key moment. Apply them even when the participant viewed the long-form cut.
    - Compare the participant response to short-form key moment events and assess number of events recalled, accuracy, specificity, and ordering.
    - Return strictly JSON with the following fields (integers for the scores):
        - "recall_score": 0-100 (0 = no relevant recall, 100 = richly detailed and accurate).
        - "confidence_score": 0-100 reflecting your certainty in the judgment.
        - "rationale": 1-3 sentence explanation referencing the MODEL EVENTS.
    - If the response states they do not remember, set recall_score to 0 and confidence_score to at least 90.
    - Ignore long-form events or details that fall outside the MODEL EVENTS (they should not raise the score).
    - Never invent events beyond the list, and do not include commentary outside the JSON payload.
    """
).strip()

In [45]:
def describe_event_source(requested_form: str, applied_form: str) -> str:
    if not applied_form:
        return "No model events located for this title."
    if requested_form == applied_form:
        return f"{applied_form.title()} form events"
    if applied_form == "short":
        return "Short form key-moment events (applied to long-form response)"
    return f"{applied_form.title()} form events (fallback; requested {requested_form})"


def get_key_moment_events(row: pd.Series, events_lookup: Dict[Tuple[str, str], List[str]]) -> Tuple[List[str], str]:
    title_key = row["title_key"]
    requested_form = row["form_key"]
    if requested_form == "long":
        preferred_key = (title_key, "short")
        if preferred_key in events_lookup:
            return events_lookup[preferred_key], "short"
    primary_key = (title_key, requested_form)
    if primary_key in events_lookup:
        return events_lookup[primary_key], requested_form
    fallback_short = (title_key, "short")
    if fallback_short in events_lookup:
        return events_lookup[fallback_short], "short"
    fallback_long = (title_key, "long")
    if fallback_long in events_lookup:
        return events_lookup[fallback_long], "long"
    return [], ""


def build_prompt_block(row: pd.Series, events: List[str], event_source_label: str) -> str:
    events_text = "\n".join(f"{idx + 1}. {event}" for idx, event in enumerate(events)) if events else "(No model events available.)"
    response_text = row.get("response", "")
    question_code = row.get("question_code", "")
    return textwrap.dedent(
        f"""
        Title: {row['title']}
        Respondent form: {row['form']}
        Event list source: {event_source_label}
        Question code: {question_code}
        Row ID: {row['id']}

        MODEL EVENTS (chronological):
        {events_text}

        PARTICIPANT RESPONSE:
        {response_text}

        Evaluate this response and return a JSON object with keys id, recall_score, confidence_score, rationale.
        """
    ).strip()


def build_batch_prompt(batch_rows: pd.DataFrame, events_lookup: Dict[Tuple[str, str], List[str]]) -> Tuple[str, List[Tuple[str, str]], pd.DataFrame]:
    blocks: List[str] = []
    missing_keys: List[Tuple[str, str]] = []
    metadata_records: List[Dict[str, object]] = []
    for _, row in batch_rows.iterrows():
        events, applied_form = get_key_moment_events(row, events_lookup)
        requested_form = row["form_key"]
        if not events:
            missing_keys.append((row["title_key"], requested_form))
        label = describe_event_source(requested_form, applied_form)
        blocks.append(build_prompt_block(row, events, label))
        metadata_records.append({
            "id": int(row["id"]),
            "respondent": row.get("respondent"),
            "title": row.get("title"),
            "form_requested": row.get("form"),
            "event_form_used": applied_form or "missing",
            "event_count": len(events),
            "event_label": label,
        })
    prompt_text = "\n\n".join(blocks)
    metadata_df = pd.DataFrame(metadata_records)
    return prompt_text, missing_keys, metadata_df


def call_llm_batch(prompt: str, client_obj=openai_client, model: str = "gpt-4.1", max_retries: int = 3, sleep_seconds: float = 2.0) -> str:
    if client_obj is None:
        raise RuntimeError("OpenAI client is not initialised. Set OPENAI_API_KEY before calling the model.")
    payload = [
        {"role": "system", "content": [{"type": "input_text", "text": SYSTEM_PROMPT}]},
        {"role": "user", "content": [{"type": "input_text", "text": prompt}]}
    ]
    last_error: Optional[Exception] = None
    for attempt in range(1, max_retries + 1):
        try:
            response = client_obj.responses.create(
                model=model,
                input=payload,
                temperature=0.0,
            )
            return response.output_text
        except Exception as exc:
            last_error = exc
            wait_for = sleep_seconds * (2 ** (attempt - 1))
            print(f"Attempt {attempt} failed: {exc}. Retrying in {wait_for:.1f}s")
            time.sleep(wait_for)
    raise RuntimeError("Failed to retrieve LLM response") from last_error


def parse_llm_json(raw_output: str) -> List[Dict[str, object]]:
    raw_output = raw_output.strip()
    if raw_output.startswith("```") and raw_output.endswith("```"):
        raw_output = re.sub(r"^```[a-zA-Z]*\n|```$", "", raw_output).strip()
    
    # Try to parse as a single JSON array or object first
    try:
        parsed = json.loads(raw_output)
        if isinstance(parsed, dict):
            parsed = [parsed]
    except json.JSONDecodeError:
        # Try wrapping in array brackets in case it's multiple objects
        try:
            parsed = json.loads(f"[{raw_output}]")
        except json.JSONDecodeError:
            # If that fails, use regex to find all JSON objects in the text
            parsed = []
            # Pattern to match complete JSON objects including nested braces
            json_pattern = r'\{(?:[^{}]|(?:\{(?:[^{}]|(?:\{[^{}]*\}))*\}))*\}'
            matches = re.findall(json_pattern, raw_output, re.DOTALL)
            for match in matches:
                try:
                    obj = json.loads(match)
                    if isinstance(obj, dict):
                        parsed.append(obj)
                except json.JSONDecodeError:
                    continue
            
            if not parsed:
                raise ValueError(f"Model returned non-JSON payload: {raw_output[:200]}")
    
    if not isinstance(parsed, list):
        raise ValueError("Expected list of JSON objects from model output")
    
    cleaned: List[Dict[str, object]] = []
    for entry in parsed:
        if not isinstance(entry, dict):
            continue
        required = {"id", "recall_score", "confidence_score", "rationale"}
        if not required.issubset(entry):
            continue
        cleaned.append({key: entry[key] for key in required})
    return cleaned


def enrich_dataframe_with_scores(df: pd.DataFrame, scored_rows: List[Dict[str, object]]) -> pd.DataFrame:
    scored_df = pd.DataFrame(scored_rows).set_index("id")
    merged = df.set_index("id").join(scored_df, how="left")
    return merged.reset_index()

In [36]:
pilot_prompt_text, pilot_missing_keys, pilot_event_metadata = build_batch_prompt(pilot_df, model_events_lookup)

print(f"Prompt length: {len(pilot_prompt_text)} characters across {pilot_prompt_text.count('\n') + 1} lines.")
if pilot_missing_keys:
    print(f"Missing model event keys: {sorted(pilot_missing_keys)}")
else:
    print("All pilot rows resolved to a model event list.")
pilot_event_metadata

Prompt length: 16592 characters across 173 lines.
All pilot rows resolved to a model event list.


Unnamed: 0,id,respondent,title,form_requested,event_form_used,event_count,event_label
0,6,41,Abbot Elementary,Long,short,16,Short form key-moment events (applied to long-form response)
1,90,62,Abbot Elementary,Long,short,16,Short form key-moment events (applied to long-form response)
2,12,85,Abbot Elementary,Long,short,16,Short form key-moment events (applied to long-form response)
3,45,108,Abbot Elementary,Short,short,16,Short form events
4,48,40,Abbot Elementary,Short,short,16,Short form events
5,50,69,Abbot Elementary,Short,short,16,Short form events


In [37]:
print(SYSTEM_PROMPT)

You are an expert qualitative coder focusing on key-moment recall for media research.
Scoring guidelines:
- The MODEL EVENTS describe the short-form key moment. Apply them even when the participant viewed the long-form cut.
- Compare the participant response to short-form key moment events and assess number of events recalled, accuracy, specificity, and ordering.
- Return strictly JSON with the following fields (integers for the scores):
    - "recall_score": 0-100 (0 = no relevant recall, 100 = richly detailed and accurate).
    - "confidence_score": 0-100 reflecting your certainty in the judgment.
    - "rationale": 1-3 sentence explanation referencing the MODEL EVENTS.
- If the response states they do not remember, set recall_score to 0 and confidence_score to at least 90.
- Ignore long-form events or details that fall outside the MODEL EVENTS (they should not raise the score).
- Never invent events beyond the list, and do not include commentary outside the JSON payload.


In [38]:
print(pilot_prompt_text)

Title: Abbot Elementary
        Respondent form: Long
        Event list source: Short form key-moment events (applied to long-form response)
        Question code: Q13
        Row ID: 6

        MODEL EVENTS (chronological):
        1. During lunch in the staff room, the teachers eat pizza together while Gregory pointedly avoids taking a slice.
2. Gregory’s colleagues notice that he is not eating any pizza and become suspicious that something is wrong.
3. Jacob, who previously went out of his way to get the special “Baltimore-style” pizza Gregory claimed to love, places the soggy pizza in front of Gregory.
4. Jacob encourages Gregory to take a slice of the “Baltimore-style” pizza while the rest of the staff watch him closely.
5. As everyone turns to watch him, Gregory panics and finally admits that he does not like pizza at all.
6. Mr. Johnson dramatically drops his mop in shock upon hearing that Gregory does not like pizza.
7. Gregory explains that he invented the story about Baltimo

## LLM Scoring Harness (Run After Approval)
Execute the following helper only after the prompt and pilot sample are approved (Stages 3.3-3.6).

In [46]:
MODEL_NAME = "gpt-4.1"
BATCH_SIZE = 3


def score_with_llm(df: pd.DataFrame, events_lookup: Dict[Tuple[str, str], List[str]], *, client = openai_client, model: str = MODEL_NAME, batch_size: int = BATCH_SIZE) -> Tuple[pd.DataFrame, List[Tuple[str, str]], List[Dict[str, object]]]:
    if df.empty:
        raise ValueError("No rows supplied for scoring.")
    if client is None:
        raise RuntimeError("OpenAI client is not initialised. Configure OPENAI_API_KEY before scoring.")
    all_results: List[Dict[str, object]] = []
    missing_all: set[Tuple[str, str]] = set()
    prompt_artifacts: List[Dict[str, object]] = []
    for batch_index, start in enumerate(range(0, len(df), batch_size), start=1):
        batch_df = df.iloc[start : start + batch_size]
        prompt_text, missing_keys, metadata_df = build_batch_prompt(batch_df, events_lookup)
        prompt_artifacts.append({
            "batch_index": batch_index,
            "prompt": prompt_text,
            "metadata": metadata_df,
        })
        if missing_keys:
            missing_all.update(missing_keys)
        raw_response = call_llm_batch(prompt_text, client_obj=client, model=model)
        batch_results = parse_llm_json(raw_response)
        for entry in batch_results:
            entry["id"] = int(entry["id"])
            entry["recall_score"] = int(entry["recall_score"])
            entry["confidence_score"] = int(entry["confidence_score"])
        all_results.extend(batch_results)
        print(f"✓ Batch {batch_index} completed ({len(batch_results)} rows).")
    scored_df = enrich_dataframe_with_scores(df, all_results)
    return scored_df, sorted(missing_all), prompt_artifacts

In [47]:
# Execute pilot scoring (Step 3.3)
scored_pilot_df, missing_event_keys, prompt_artifacts = score_with_llm(pilot_df, model_events_lookup)
scored_pilot_df.to_csv(PILOT_OUTPUT_PATH, index=False)
print(f"\n✓ Pilot scoring saved to {PILOT_OUTPUT_PATH.relative_to(PROJECT_ROOT)}")
if missing_event_keys:
    print(f"⚠ Missing model event keys encountered: {missing_event_keys}")
else:
    print("✓ All rows scored successfully with no missing event keys.")
    
print(f"\nPilot results preview:")
scored_pilot_df[["id", "respondent", "form", "recall_score", "confidence_score"]].head(10)

✓ Batch 1 completed (3 rows).
✓ Batch 2 completed (3 rows).

✓ Pilot scoring saved to recall_openended\key moment\pilot_outputs\recall_coded_responses_key_moment_pilot.csv
✓ All rows scored successfully with no missing event keys.

Pilot results preview:
✓ Batch 2 completed (3 rows).

✓ Pilot scoring saved to recall_openended\key moment\pilot_outputs\recall_coded_responses_key_moment_pilot.csv
✓ All rows scored successfully with no missing event keys.

Pilot results preview:


Unnamed: 0,id,respondent,form,recall_score,confidence_score
0,6,41,Long,35,95
1,90,62,Long,70,98
2,12,85,Long,40,95
3,45,108,Short,70,95
4,48,40,Short,55,90
5,50,69,Short,35,90
