# Annotating ISEAR Dataset using Deepseek v3.2
A state-of-the-art LLM re-labels the existing ISEAR emotion labels into the nine target emotions and adds valence–arousal–dominance (VAD) scores.

## Mount Google Drive

In [None]:
# Mount Google Drive at /content/drive; the paths used in this notebook
# live under that mount point.
from google.colab import drive

drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Install dependencies and imports

In [None]:

# 1. INSTALL DEPENDENCIES
!pip install openai pydantic pandas tqdm -q

import os
import pandas as pd
import json
from openai import OpenAI
from pydantic import BaseModel, Field
from typing import List, Dict, Optional
from tqdm.notebook import tqdm
from pathlib import Path


## Configuration and API keys

In [None]:
# SECURITY: API keys must never be stored in the notebook source. The key
# that used to be hard-coded on this line has been exposed to everyone who
# can read the notebook and should be revoked in the OpenRouter dashboard.
def _resolve_openrouter_api_key():
    """Return the OpenRouter API key from the environment or Colab secrets.

    Lookup order:
      1. the ``OPENROUTER_API_KEY`` environment variable;
      2. a Colab secret named ``OPENROUTER_API_KEY`` (only when running
         inside Google Colab).

    Returns an empty string when no key can be found.
    """
    key = os.environ.get("OPENROUTER_API_KEY", "").strip()
    if key:
        return key

    try:
        from google.colab import userdata
    except ImportError:
        # Not running inside Colab: the environment variable is the only source.
        return ""

    try:
        return (userdata.get("OPENROUTER_API_KEY") or "").strip()
    except Exception as err:
        # Raised when the secret does not exist or notebook access to it
        # has not been granted.
        print(f"⚠️ Could not read Colab secret OPENROUTER_API_KEY: {err}")
        return ""


OPENROUTER_API_KEY = _resolve_openrouter_api_key()
if not OPENROUTER_API_KEY:
    # Fail fast instead of sending thousands of unauthenticated requests.
    raise RuntimeError(
        "OPENROUTER_API_KEY is not set. Add it as a Colab secret or export it "
        "as an environment variable before running this cell."
    )

ISEAR_CSV_PATH = "/content/drive/MyDrive/VibeQ-EIE/data/ISEAR_dataset_complete.csv" # @param {type:"string"}

# Model used for re-labelling, addressed by its OpenRouter model id.
MODEL_LABELER = "deepseek/deepseek-v3.2"

# Folder that receives the re-labelled dataset; created if it does not exist.
OUTPUT_DIR = Path("/content/drive/MyDrive/VibeQ-EIE/llmdata")
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# The nine labels the model is allowed to assign (same list as in the prompt).
TARGET_EMOTIONS = [
    "anger", "anticipation", "caring", "disgust", "fear",
    "joy", "neutral", "sadness", "surprise"
]


# OpenRouter exposes an OpenAI-compatible endpoint, so the OpenAI client is
# simply pointed at its base URL.
client = OpenAI(
    base_url="https://openrouter.ai/api/v1",
    api_key=OPENROUTER_API_KEY,
)

## Relabeling prompt worker

In [None]:

# ==========================================
# 🧠 AI WORKER : ISEAR RE-LABELER
# ==========================================
def process_isear_row(text, original_label):
    """
    Re-annotate a single ISEAR entry with the labeler model.

    The entry text and its original ISEAR label are sent to MODEL_LABELER
    through the module-level `client`, together with a system prompt that asks
    for a JSON object (reasoning, primary/secondary emotions, VAD scores and a
    score for each of the 9 target emotions).

    Returns the parsed JSON reply, or None when the request or the JSON
    parsing fails (the error is printed, never raised).
    """
    SYSTEM_PROMPT = """
You are an expert Psychologist and Data Annotator specialized in Emotion Dynamics.
Your task is to analyze a first-person journal entry and produce a high-precision semantic analysis in STRICT JSON format.

────────────────────────────────────────
1. TARGET EMOTIONS & DEFINITIONS
────────────────────────────────────────
Classify using ONLY these 9 labels. Adhere to these definitions to avoid overlap:
- anger: Frustration, rage, irritation, or hostility.
- anticipation: Expectancy, planning, or looking forward (positive or negative).
- caring: Tenderness, empathy, concern for others, or love.
- disgust: Revulsion, rejection, or physical/moral sickness.
- fear: Anxiety, worry, terror, or sense of danger.
- joy: Happiness, pleasure, relief, or satisfaction.
- neutral: Lack of strong emotion, factual statement, or numbness.
- sadness: Grief, loss, despair, or melancholy.
- surprise: Shock, astonishment, or unexpected realization.

────────────────────────────────────────
2. VAD SCALE GUIDELINES (Mehrabian Scale)
────────────────────────────────────────
- Valence: -1.0 (Agony/Deep Negativity) to 1.0 (Ecstasy/Deep Positivity).
- Arousal: 0.0 (Comatose/Sleepy) to 1.0 (Frenzied/Panic/High Energy).
- Dominance: 0.0 (Submissive/Overwhelmed/Helpless) to 1.0 (In Control/Empowered/Dominant).

────────────────────────────────────────
3. ANNOTATION STRATEGY (CRITICAL)
────────────────────────────────────────
- DETECT SUBTEXT: Look for sarcasm, passive-aggression, or hidden feelings (e.g., "I'm fine" might be Sadness, not Neutral).
- SCORE DISTRIBUTION: 'all_emotions' is a multi-label regression. If the text is mixed (e.g., Bittersweet),
 assign high scores to BOTH Joy and Sadness.
- PRIMARY EMOTION: Must be the single highest score in 'all_emotions'.

────────────────────────────────────────
4. OUTPUT SCHEMA
────────────────────────────────────────
You must output a single JSON object.
INCLUDE a "reasoning" field to explain your psychological analysis before the metrics.

{
  "reasoning": "Brief analysis of the writer's mental state, citing specific words from the text.",
  "primary_emotion": "One of the 9 labels",
  "secondary_emotions": ["List 0-2 labels"],
  "vad": {
    "valence": float,
    "arousal": float,
    "dominance": float
  },
  "all_emotions": {
    "anger": 0.0-1.0,
    "anticipation": 0.0-1.0,
    "caring": 0.0-1.0,
    "disgust": 0.0-1.0,
    "fear": 0.0-1.0,
    "joy": 0.0-1.0,
    "neutral": 0.0-1.0,
    "sadness": 0.0-1.0,
    "surprise": 0.0-1.0
  }
}

────────────────────────────────────────
CONSTRAINTS
────────────────────────────────────────
- The output must be valid parsable JSON.
- No markdown formatting (no ```json blocks).
- Ensure 'primary_emotion' has the highest float value in 'all_emotions'.
"""

    try:
        # Everything that can fail stays inside the try so that any error is
        # reported as None.
        system_turn = {"role": "system", "content": SYSTEM_PROMPT}
        user_turn = {
            "role": "user",
            "content": f"Text: {text}\nOld Label: {original_label}",
        }
        completion = client.chat.completions.create(
            model=MODEL_LABELER,
            messages=[system_turn, user_turn],
            response_format={"type": "json_object"},
        )
        raw_reply = completion.choices[0].message.content
        return json.loads(raw_reply)
    except Exception as err:
        print(f"⚠️ ISEAR Error: {err}")
        return None


## Batch processing loop

In [None]:
# 1. PROCESS ISEAR (With Retries, 100-Row Chunking & Crash Protection)
# ------------------------------------------
import time
import random

# CSV file inside OUTPUT_DIR that receives the re-labelled rows
output_csv_path = OUTPUT_DIR.joinpath("isear_relabelled_9emotions.csv")

# --- 🛡️ HELPER: ROBUST ROW PROCESSOR ---
def safe_process_row_with_retries(text, label, max_retries=3, *, processor=None,
                                  backoff_seconds=10.0):
    """
    Label one row, retrying transient failures with a growing back-off.

    Args:
        text: The entry text to annotate.
        label: The original label of the entry.
        max_retries: Maximum number of attempts for this row.
        processor: Callable ``(text, label) -> dict | None`` doing the actual
            labelling. Defaults to ``process_isear_row``.
        backoff_seconds: Base wait between attempts. Attempt ``k`` (1-based)
            waits ``k * backoff_seconds`` plus up to ``backoff_seconds / 2``
            of random jitter.

    Returns:
        The annotation dict when it contains dict-valued ``'vad'`` and
        ``'all_emotions'`` entries, otherwise None. This function never raises.

    Outcomes per attempt:
        * usable annotation          -> returned immediately;
        * non-empty but malformed    -> row is skipped (None);
        * empty result (None)        -> treated as transient, retried after a
          back-off. The processor may report its own errors as None, rate
          limits included, so retrying at once would hit the same condition;
        * exception mentioning 429 / "rate limit" -> retried after a back-off;
        * any other exception        -> logged, row is skipped (None).
    """
    if processor is None:
        processor = process_isear_row

    for attempt in range(max_retries):
        try:
            data = processor(text, label)
        except Exception as e:
            error_str = str(e).lower()
            if "429" not in error_str and "rate limit" not in error_str:
                # Permanent error: log it and give up on this row.
                print(f"   ❌ Unexpected Error: {e}")
                return None
            reason = "Hit Rate Limit (429)"
        else:
            # 'vad' must be a dict because callers read its fields with .get().
            if (isinstance(data, dict)
                    and isinstance(data.get('vad'), dict)
                    and isinstance(data.get('all_emotions'), dict)):
                return data

            # Something came back but it is not usable: soft fail.
            if data:
                print(f"   ⚠️ Row returned JSON but missing keys. Skipping.")
                return None

            reason = "No usable response"

        # Sleeping after the final attempt would only waste time.
        if attempt + 1 < max_retries:
            wait_time = ((attempt + 1) * backoff_seconds
                         + random.uniform(0, backoff_seconds / 2))
            print(f"   ⏳ {reason}. Sleeping {wait_time:.1f}s...")
            time.sleep(wait_time)

    print(f"   ❌ Failed after {max_retries} retries.")
    return None

# --- MAIN LOOP ---

from collections import Counter

# Only the first MAX_ROWS entries of the dataset are labelled.
MAX_ROWS = 3000
# The output CSV is checkpointed every SAVE_EVERY successfully labelled rows.
SAVE_EVERY = 100

if os.path.exists(ISEAR_CSV_PATH):
    # 1. Load Data
    df_isear_full = pd.read_csv(ISEAR_CSV_PATH)
    df_target = df_isear_full.head(MAX_ROWS)

    # 2. Resume Logic
    # The output file only contains rows that were labelled successfully, so
    # its length is NOT a valid offset into the input. Instead, count the
    # texts that are already labelled (a multiset, because the same text may
    # occur more than once) and skip exactly that many matching input rows.
    # Rows that failed in an earlier run are therefore retried, and nothing
    # is labelled twice.
    results = []
    already_done = Counter()

    if os.path.exists(output_csv_path):
        print(f"🔄 Found existing file. Resuming...")
        try:
            df_existing = pd.read_csv(output_csv_path)
            already_done = Counter(str(t) for t in df_existing['original_text'])
            results = df_existing.to_dict('records')
            print(f"   Skipping {len(results)} already-labelled rows.")
        except (pd.errors.EmptyDataError, pd.errors.ParserError,
                UnicodeDecodeError, OSError, KeyError) as e:
            print(f"   ⚠️ Could not read existing output ({e}). Starting from scratch.")
            results = []
            already_done = Counter()

    # 3. Select Remaining Rows
    pending_mask = []
    for _, row in df_target.iterrows():
        key = str(row.get('content'))
        if already_done[key] > 0:
            already_done[key] -= 1
            pending_mask.append(False)
        else:
            pending_mask.append(True)

    df_remaining = df_target[pd.Series(pending_mask, index=df_target.index, dtype=bool)]
    print(f"--- Processing {len(df_remaining)} remaining rows ---")

    # 4. Processing Loop
    try:
        for i, row in tqdm(df_remaining.iterrows(), total=len(df_remaining)):

            # Safe Run
            data = safe_process_row_with_retries(row.get('content'), row.get('emotion'))
            if not data:
                continue

            vad_obj = data.get('vad')
            if not isinstance(vad_obj, dict):
                # Tolerate a malformed 'vad' entry instead of crashing the run.
                vad_obj = {}

            results.append({
                "original_text": row.get('content'),
                "old_label": row.get('emotion'),
                "new_primary_emotion": data.get('primary_emotion'),
                "valence": vad_obj.get('valence'),
                "arousal": vad_obj.get('arousal'),
                "dominance": vad_obj.get('dominance'),
                "all_emotions": json.dumps(data.get('all_emotions', {}))
            })

            # 5. CHUNKING (Every SAVE_EVERY labelled rows)
            # Checked only right after a row was appended, so a failing row
            # never triggers a redundant rewrite of the whole file.
            if len(results) % SAVE_EVERY == 0:
                pd.DataFrame(results).to_csv(output_csv_path, index=False)
    finally:
        # 6. Final Save
        # Also runs on interruption or an unexpected error, so progress since
        # the last checkpoint is kept. An empty result list is never written,
        # because an empty CSV cannot be read back when resuming.
        if results:
            pd.DataFrame(results).to_csv(output_csv_path, index=False)

    print(f"✅ ISEAR Complete. Saved to {output_csv_path}")
else:
    print(f"❌ ISEAR CSV not found at {ISEAR_CSV_PATH}. Nothing was processed.")

🔄 Found existing file. Resuming...
   Skipping first 3000 rows.
--- Processing 0 remaining rows ---


0it [00:00, ?it/s]

✅ ISEAR Complete. Saved to /content/drive/MyDrive/VibeQ-EIE/llmdata/isear_relabelled_9emotions.csv
