# Relabel 2000 rows of ISEAR using Qwen 235B
The teacher model generates labels from scratch for each sampled ISEAR text, as a full emotion probability distribution, so that the base model can be trained in a teacher–student (distillation) style.

## Mount Google Drive

In [1]:
# Mount Google Drive at /content/drive; later cells read and write files
# under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Imports and setup

In [2]:
import os, json, time, re
from pathlib import Path
from typing import Dict, Any, Optional, Tuple
from tqdm.auto import tqdm
from tenacity import retry, stop_after_attempt, wait_exponential_jitter
from openai import OpenAI

# The API key is read from the environment so it is never stored in the
# notebook or its outputs. Any key that was previously committed in this cell
# must be treated as leaked and revoked in the OpenRouter dashboard.
OPENROUTER_API_KEY = os.environ.get("OPENROUTER_API_KEY", "").strip()
if not OPENROUTER_API_KEY:
    raise RuntimeError(
        "OPENROUTER_API_KEY is not set. Export it in the environment "
        "(e.g. os.environ['OPENROUTER_API_KEY'] = getpass.getpass()) before running this cell."
    )

# Teacher model identifier passed to the OpenRouter chat-completions endpoint.
GEN_MODEL = "qwen/qwen3-235b-a22b-2507"

# All outputs of this notebook are written below this Drive folder.
BASE_DIR = Path("/content/drive/MyDrive/VibeQ-EIE/llmdata")
BASE_DIR.mkdir(parents=True, exist_ok=True)

# Closed label set used by the prompts, the normalizer and the validator.
TARGET_EMOTIONS = [
    "anger", "anticipation", "caring", "disgust", "fear",
    "joy", "neutral", "sadness", "surprise"
]

TARGET_PER_EMOTION = 200
TOTAL_TARGET = TARGET_PER_EMOTION * len(TARGET_EMOTIONS)

ISEAR_OUT_JSONL = BASE_DIR / "teacher_isear_distillation.jsonl"
ISEAR_REJECTS  = BASE_DIR / "teacher_isear_distillation.rejects.jsonl"
# Plain-text dump of raw model replies written by log_bad(); it was referenced
# there but never defined, so any call raised NameError.
DEBUG_BAD = BASE_DIR / "teacher_isear_distillation.bad_raw.log"


# OpenAI-compatible client pointed at OpenRouter.
client = OpenAI(
    base_url="https://openrouter.ai/api/v1",
    api_key=OPENROUTER_API_KEY,
)


## Load ISEAR data subset

In [3]:
import pandas as pd
ISEAR_PATH = "/content/drive/MyDrive/VibeQ-EIE/data/ISEAR_dataset_complete.csv" # @param {type:"string"}

# Read the full ISEAR CSV; the rest of the notebook relies on the
# `emotion` (source label) and `content` (text) columns being present.
isear_df = pd.read_csv(ISEAR_PATH)

for required_column in ("emotion", "content"):
    assert required_column in isear_df.columns

print("ISEAR size:", len(isear_df))
isear_df.head()

ISEAR size: 7516


Unnamed: 0,emotion,content,Unnamed: 2
0,joy,On days when I feel close to my partner and ot...,
1,fear,Every time I imagine that someone I love or I ...,
2,anger,When I had been obviously unjustly treated and...,
3,sadness,When I think about the short time that we live...,
4,disgust,At a gathering I found myself involuntarily si...,


## Sampling configuration

In [4]:
MAX_ISEAR_ROWS = 2000

# Draw a reproducible random sample (fixed seed) of at most MAX_ISEAR_ROWS
# rows and renumber the index from 0.
# Note: this rebinds `isear_df`, so re-running the cell without reloading the
# CSV samples from the already-sampled frame.
sample_size = min(MAX_ISEAR_ROWS, len(isear_df))
isear_df = isear_df.sample(n=sample_size, random_state=42)
isear_df = isear_df.reset_index(drop=True)

print("Using ISEAR rows:", len(isear_df))


Using ISEAR rows: 2000


## System prompt

In [5]:
# System message sent with every relabeling request (see relabel_isear_row).
# NOTE(review): the wording asks the model to "generate realistic first-person
# journaling entries", while the user template below asks it to analyze a given
# text WITHOUT rewriting it — confirm this wording is intended for relabeling.
SYSTEM_PROMPT = """You are an expert clinical psychologist and affective computing researcher.
Your task is to generate realistic first-person journaling entries and provide
a calibrated emotional analysis suitable for training a machine learning classifier.
You MUST follow the output schema exactly.
"""

# User message template. The literal placeholder `__TEXT__` is substituted with
# the ISEAR text by build_isear_prompt(); plain str.replace is used instead of
# str.format, so the JSON braces in the schema need no escaping.
# The schema and rules listed here mirror what valid_row() checks.
ISEAR_USER_PROMPT_TEMPLATE = """
You are given a first-person emotional journal entry.

TASK:
Analyze the text and output a calibrated emotional analysis suitable for
training a machine learning classifier.

Constraints:
- Do NOT rewrite or alter the text.
- Base your analysis ONLY on the given content.
- Avoid bias toward sadness unless clearly expressed.

TEXT:
\"\"\"
__TEXT__
\"\"\"

Now output JSON that follows this EXACT schema and keys (no extra wrapper objects):
{
  "text": "original text, unchanged",
  "primary_emotion": "one of: anger|anticipation|caring|disgust|fear|joy|neutral|sadness|surprise",
  "secondary_emotions": ["0-2 items, each one from the same allowed list"],
  "teacher_emotion_probs": {
    "anger": 0.0, "anticipation": 0.0, "caring": 0.0, "disgust": 0.0, "fear": 0.0,
    "joy": 0.0, "neutral": 0.0, "sadness": 0.0, "surprise": 0.0
  },
  "vad": { "valence": 0.0, "arousal": 0.0, "dominance": 0.0 },
  "teacher_confidence": 0.0
}

Rules:
- teacher_emotion_probs must include ALL 9 keys exactly and sum to 1.0
- primary_emotion must have the highest probability
- valence in [-1,1], arousal/dominance in [0,1]
- Return ONLY JSON (no markdown, no commentary).
""".strip()

def build_isear_prompt(text: str) -> str:
    """Return the user prompt for one ISEAR text.

    The text is stripped of surrounding whitespace and inserted in place of
    the `__TEXT__` placeholder of ISEAR_USER_PROMPT_TEMPLATE.
    """
    trimmed = text.strip()
    return ISEAR_USER_PROMPT_TEMPLATE.replace("__TEXT__", trimmed)



## JSON extraction, normalization, and validation helpers

In [6]:
def extract_json_object(s: str) -> str:
    """Return the substring of `s` that holds the model's JSON object.

    The reply may be wrapped in markdown fences or surrounded by commentary.
    The object starting at the first ``{`` is decoded with
    ``json.JSONDecoder.raw_decode`` and exactly that span is returned, so a
    second object or brace-containing text *after* the JSON no longer ends up
    in the result (the previous greedy ``{ ... }`` match ran to the last
    ``}`` in the reply).

    If the first object does not decode (e.g. a trailing comma), the old
    greedy span is returned unchanged so the caller's ``json.loads`` fails and
    its repair path is taken as before.

    Args:
        s: Raw model reply; ``None`` or empty is treated as an empty string.

    Returns:
        The JSON object text (not parsed).

    Raises:
        ValueError: If `s` contains no ``{ ... }`` span at all.
    """
    s = (s or "").strip()

    start = s.find("{")
    if start != -1:
        try:
            # raw_decode stops at the end of the first complete JSON value.
            _, end = json.JSONDecoder().raw_decode(s, start)
            return s[start:end]
        except ValueError:
            # Malformed first object: fall through to the legacy behavior.
            pass

    if s.startswith("{") and s.endswith("}"):
        return s
    m = re.search(r"\{.*\}", s, flags=re.DOTALL)
    if not m:
        raise ValueError("No JSON object found")
    return m.group(0)

def log_bad(target: str, raw: str):
    """Append a raw model reply to the debug log for later inspection.

    Each entry is a separator line, a header with the current timestamp and
    `target`, and at most the first 4000 characters of `raw`.
    Relies on a module-level `DEBUG_BAD` path object being defined before the
    first call.
    """
    separator = "=" * 80
    with DEBUG_BAD.open("a", encoding="utf-8") as sink:
        sink.write("\n" + separator + "\n")
        sink.write(f"TIME: {time.time()} | TARGET: {target}\n")
        sink.write((raw or "")[:4000] + "\n")

def renormalize_probs(vals: Dict[str, float]) -> Dict[str, float]:
    """Scale `vals` so its values sum to 1.

    When the total is zero or negative there is nothing to scale, so a
    uniform distribution over TARGET_EMOTIONS is returned instead.
    """
    total = sum(vals.values())
    if total <= 0:
        uniform = 1.0 / len(TARGET_EMOTIONS)
        return dict.fromkeys(TARGET_EMOTIONS, uniform)
    scaled = {}
    for name, weight in vals.items():
        scaled[name] = weight / total
    return scaled

def fix_probs(p: Dict[str, Any]) -> Dict[str, float]:
    """Project a model-supplied probability dict onto the allowed label set.

    Only TARGET_EMOTIONS keys are kept (junk keys such as
    "sad irresponsibility" are ignored, missing ones count as 0.0); each value
    is converted to float, clipped to [0, 1], and the result is renormalized.
    """
    clipped = {}
    for emotion in TARGET_EMOTIONS:
        value = float(p.get(emotion, 0.0))
        clipped[emotion] = max(0.0, min(1.0, value))
    return renormalize_probs(clipped)

def normalize_row(row: Dict[str, Any]) -> Dict[str, Any]:
    """Coerce a parsed model reply into the canonical row schema.

    Handles the deviations the model is known to produce: an "analysis"
    wrapper object, misspelled/alternative key names, mixed-case labels,
    and probability dicts with extra or missing keys. The result is meant to
    be passed to valid_row(); this function does not itself guarantee that
    all required keys are present.

    Args:
        row: The object returned by json.loads on the model reply.

    Returns:
        An empty dict if `row` is not a dict; otherwise the normalized dict.
        When there is no "analysis" wrapper, `row` is modified in place and
        returned. When there is one, a new dict is built from the wrapper's
        contents plus the text, and other top-level keys are dropped.

    Raises:
        Whatever fix_probs raises when a probability value cannot be
        converted to float.
    """
    if not isinstance(row, dict):
        return {}

    # Unwrap {"text"/"journal"/...: "...", "analysis": {...}} into one flat dict.
    if "analysis" in row and isinstance(row["analysis"], dict):
        merged = {}
        # Carry over the first string-valued text field found at the top level.
        for k in ["text", "journal", "journal_entry", "entry"]:
            if isinstance(row.get(k), str):
                merged["text"] = row[k]
                break
        # Keys inside "analysis" win over the carried-over text on collision.
        merged.update(row["analysis"])
        row = merged

    # primary_emotion key variants: accept the first key that starts with
    # "primary_em" (e.g., the garbled "primary_em游戏副本").
    if "primary_emotion" not in row:
        for k in list(row.keys()):
            if isinstance(k, str) and k.startswith("primary_em"):
                row["primary_emotion"] = row.get(k)
                break

    # text/journal key variants -> canonical "text"
    if not isinstance(row.get("text"), str):
        for k in ["journal", "journal_entry", "entry"]:
            if isinstance(row.get(k), str):
                row["text"] = row[k]
                break

    # probability-dict key variants -> canonical "teacher_emotion_probs"
    if "teacher_emotion_probs" not in row:
        for k in ["teacher_emt_probs", "teacher_probs", "emotion_probs"]:
            if isinstance(row.get(k), dict):
                row["teacher_emotion_probs"] = row[k]
                break

    # vad key variants -> canonical "vad"
    if not isinstance(row.get("vad"), dict):
        if isinstance(row.get("vad_scores"), dict):
            row["vad"] = row["vad_scores"]

    # Lowercase/trim the primary label so it can match TARGET_EMOTIONS.
    if isinstance(row.get("primary_emotion"), str):
        row["primary_emotion"] = row["primary_emotion"].strip().lower()

    # Clean secondary emotions: keep allowed labels only, de-duplicated,
    # in original order, at most two. A non-list value becomes [].
    sec = row.get("secondary_emotions", [])
    cleaned = []
    if isinstance(sec, list):
        for x in sec:
            if isinstance(x, str):
                x = x.strip().lower()
                if x in TARGET_EMOTIONS and x not in cleaned:
                    cleaned.append(x)
            if len(cleaned) == 2:
                break
    row["secondary_emotions"] = cleaned

    # Restrict probabilities to the allowed keys, clip, and renormalize.
    if isinstance(row.get("teacher_emotion_probs"), dict):
        row["teacher_emotion_probs"] = fix_probs(row["teacher_emotion_probs"])

    # Drop the alias/wrapper keys now that canonical keys are populated.
    for k in ["analysis", "journal", "journal_entry", "entry", "vad_scores",
              "teacher_emt_probs", "teacher_probs", "emotion_probs"]:
        row.pop(k, None)

    return row


def valid_row(row: Dict[str, Any]) -> Tuple[bool, str]:
    """Check a normalized row against the teacher-label schema.

    Checks run in a fixed order and the first failure is reported.

    Returns:
        (True, "ok") when every check passes, otherwise (False, reason) where
        reason is a short machine-readable code such as "missing_key:vad",
        "bad_primary" or "primary_not_max".
    """
    required_keys = (
        "text", "primary_emotion", "secondary_emotions",
        "teacher_emotion_probs", "vad", "teacher_confidence",
    )
    missing = next((name for name in required_keys if name not in row), None)
    if missing is not None:
        return False, f"missing_key:{missing}"

    # Text must be a string with at least 10 non-padding characters.
    text = row["text"]
    if not (isinstance(text, str) and len(text.strip()) >= 10):
        return False, "bad_text"

    primary = row["primary_emotion"]
    if primary not in TARGET_EMOTIONS:
        return False, "bad_primary"

    secondary = row["secondary_emotions"]
    if not isinstance(secondary, list):
        return False, "bad_secondary_type_or_len"
    if len(secondary) > 2:
        return False, "bad_secondary_type_or_len"
    if any(not isinstance(item, str) or item not in TARGET_EMOTIONS for item in secondary):
        return False, "bad_secondary_values"

    # Probabilities: exactly the allowed keys, sum to 1, each within [0, 1].
    probs = row["teacher_emotion_probs"]
    if not isinstance(probs, dict) or set(probs) != set(TARGET_EMOTIONS):
        return False, "bad_probs_keys"
    prob_values = [float(v) for v in probs.values()]
    total = float(sum(prob_values))
    if abs(total - 1.0) > 1e-6:
        return False, "bad_probs_sum"
    if any(v < 0.0 or v > 1.0 for v in prob_values):
        return False, "bad_probs_range"

    # The primary label must carry the (tied-)highest probability.
    if float(probs[primary]) + 1e-9 < max(prob_values):
        return False, "primary_not_max"

    vad = row["vad"]
    if not isinstance(vad, dict):
        return False, "bad_vad_type"
    vad_bounds = (("valence", -1, 1), ("arousal", 0, 1), ("dominance", 0, 1))
    for axis, low, high in vad_bounds:
        if axis not in vad or not isinstance(vad[axis], (int, float)):
            return False, f"bad_vad:{axis}"
        if not (low <= float(vad[axis]) <= high):
            return False, f"vad_out_of_range:{axis}"

    confidence = row["teacher_confidence"]
    if not isinstance(confidence, (int, float)):
        return False, "bad_conf"
    if not (0.0 <= float(confidence) <= 1.0):
        return False, "bad_conf"

    return True, "ok"


## LLM call with retries

In [7]:
@retry(stop=stop_after_attempt(5), wait=wait_exponential_jitter(initial=1, max=20))
def call_model(system: str, user: str, temperature: float = 0.8) -> str:
    """Send one system+user exchange to GEN_MODEL and return the reply text.

    The call is wrapped by tenacity: up to 5 attempts, waiting with
    exponential backoff plus jitter (initial 1, capped at 20) between them.

    Returns:
        The first choice's message content, stripped; "" if the content is empty.
    """
    conversation = [
        {"role": "system", "content": system},
        {"role": "user", "content": user},
    ]
    completion = client.chat.completions.create(
        model=GEN_MODEL,
        temperature=temperature,
        messages=conversation,
    )
    reply = completion.choices[0].message.content
    return (reply or "").strip()

def repair_json_with_model(bad_text: str) -> Dict[str, Any]:
    """Ask the model to rewrite an unparseable reply into the canonical schema.

    A second, temperature-0 request is made with `bad_text` embedded in a
    strict rewriting prompt; the JSON object found in the answer is parsed
    and returned. Parsing errors from that answer propagate to the caller.
    """
    system_msg = "You are a strict JSON rewriter. Return ONLY valid JSON."
    user_msg = f"""
Rewrite the following into ONE valid JSON object with EXACT keys:
text, primary_emotion, secondary_emotions, teacher_emotion_probs, vad, teacher_confidence.

Constraints:
- primary_emotion and secondary_emotions values must be only from {TARGET_EMOTIONS}
- teacher_emotion_probs must have EXACTLY these keys {TARGET_EMOTIONS} and sum to 1.0
- vad = {{valence [-1,1], arousal [0,1], dominance [0,1]}}
- teacher_confidence [0,1]
Return ONLY JSON. No markdown.

CONTENT:
{bad_text}
""".strip()
    rewritten = call_model(system_msg, user_msg, temperature=0.0)
    json_text = extract_json_object(rewritten)
    return json.loads(json_text)

from tqdm.auto import tqdm

def relabel_isear_row(row):
    """Label one ISEAR row with the teacher model and return the validated record.

    Args:
        row: A mapping/Series with a `content` (text) and an `emotion`
            (original ISEAR label) entry.

    Returns:
        The normalized, validated teacher record with two extra metadata keys:
        `source_emotion` (lower-cased ISEAR label, or None if it is not a
        string) and `source_dataset` ("ISEAR"). The `text` field always holds
        the original ISEAR text (stripped), never the model's echo of it.

    Raises:
        ValueError: If the content is missing/non-string/blank, or the model
            output fails validation.
        Exception: Errors from the model call or the JSON repair call propagate.
    """
    text = row["content"]
    # Fail fast on NaN/blank content: it cannot be prompted and would
    # otherwise surface as an opaque AttributeError.
    if not isinstance(text, str) or not text.strip():
        raise ValueError("ISEAR row invalid: empty_or_non_string_content")
    text = text.strip()
    source_emotion = row["emotion"]

    prompt = build_isear_prompt(text)
    raw = call_model(SYSTEM_PROMPT, prompt, temperature=0.8)

    try:
        parsed = json.loads(extract_json_object(raw))
    except Exception:
        # Unparseable reply: one repair round-trip through the model.
        parsed = repair_json_with_model(raw)

    parsed = normalize_row(parsed)
    # The model is asked to echo the text unchanged but may paraphrase,
    # truncate or omit it. The labels must be attached to the real ISEAR
    # text, so the echo is overwritten before validation.
    parsed["text"] = text
    ok, reason = valid_row(parsed)
    if not ok:
        raise ValueError(f"ISEAR row invalid: {reason}")

    # Attach original ISEAR emotion as metadata only; a missing (NaN) label
    # must not discard an otherwise valid, already paid-for teacher record.
    parsed["source_emotion"] = source_emotion.lower() if isinstance(source_emotion, str) else None
    parsed["source_dataset"] = "ISEAR"

    return parsed



## Track label counts

In [9]:
def load_counts_from_jsonl(path: Path) -> Dict[str, int]:
    """Count valid rows per primary emotion in a JSONL file.

    Returns a dict with one entry per TARGET_EMOTIONS label (all zeros when
    the file does not exist). Blank lines, lines that fail to parse, and rows
    rejected by valid_row are skipped silently.
    """
    tally = dict.fromkeys(TARGET_EMOTIONS, 0)
    if not path.exists():
        return tally
    with path.open("r", encoding="utf-8") as handle:
        for raw_line in handle:
            stripped = raw_line.strip()
            if not stripped:
                continue
            try:
                record = json.loads(stripped)
                is_valid, _ = valid_row(record)
                if is_valid and record.get("primary_emotion") in tally:
                    tally[record["primary_emotion"]] += 1
            except Exception:
                # Best-effort counting: a malformed line is ignored.
                continue
    return tally


## Generate and write outputs

In [10]:
# Relabel every sampled ISEAR row with the teacher model.
# Accepted records go to ISEAR_OUT_JSONL, failures (with the error message and
# a text excerpt) to ISEAR_REJECTS.
# NOTE: both files are opened in "w" mode, so re-running this cell discards
# the results of any previous run.
accepted, rejected = 0, 0

with ISEAR_OUT_JSONL.open("w", encoding="utf-8") as fout, \
     ISEAR_REJECTS.open("w", encoding="utf-8") as frej:

    for _, row in tqdm(isear_df.iterrows(), total=len(isear_df), desc="Relabeling ISEAR"):
        try:
            labeled = relabel_isear_row(row)
            fout.write(json.dumps(labeled, ensure_ascii=False) + "\n")
            # Flush per row so an interrupted session keeps what was already labeled.
            fout.flush()
            accepted += 1
            time.sleep(0.25)  # light pacing between successful requests
        except Exception as e:
            rejected += 1
            # str(...) guards against non-string content (e.g. NaN from the
            # CSV): slicing it directly would raise inside this handler and
            # abort the whole run. default=str keeps any non-JSON value loggable.
            frej.write(json.dumps({
                "error": str(e),
                "emotion": row["emotion"],
                "content": str(row["content"])[:500]
            }, ensure_ascii=False, default=str) + "\n")
            frej.flush()
            time.sleep(1.0)  # back off a little longer after a failure

print("ISEAR relabeling complete")
print("Accepted:", accepted)
print("Rejected:", rejected)
print("Output:", ISEAR_OUT_JSONL)


Relabeling ISEAR:   0%|          | 0/2000 [00:00<?, ?it/s]

ISEAR relabeling complete
Accepted: 1963
Rejected: 37
Output: /content/drive/MyDrive/VibeQ-EIE/llmdata/teacher_isear_distillation.jsonl
