In [7]:
import os

def resolve_roots_from_cwd(cwd):
    """Derive ``(project_root, src_root)`` from a notebook working directory.

    Three launch locations are recognised:

    * a directory whose path ends in ``notebooks`` -> its parent is the
      source root and the grandparent is the project root;
    * a directory named ``src`` that has no nested ``src`` directory -> it is
      the source root itself and its parent is the project root;
    * anything else -> it is taken to be the project root, with the source
      root at ``<cwd>/src``.

    Args:
        cwd: Working directory path, as returned by ``os.getcwd()``.

    Returns:
        Tuple ``(project_root, src_root)`` of path strings.
    """
    if cwd.endswith("notebooks"):
        src_dir = os.path.abspath(os.path.join(cwd, ".."))
        return os.path.dirname(src_dir), src_dir

    # Kernel started in src/ itself. The nested-"src" check keeps a project
    # that happens to be called "src" on the project-root branch below.
    if os.path.basename(cwd) == "src" and not os.path.isdir(os.path.join(cwd, "src")):
        return os.path.dirname(cwd), cwd

    return cwd, os.path.join(cwd, "src")


try:
    # Plain Python script located directly inside src/: __file__ is defined.
    this_file = os.path.abspath(__file__)
    src_root = os.path.dirname(this_file)          # EMOTION-PRED/src
    project_root = os.path.dirname(src_root)       # EMOTION-PRED/
except NameError:
    # Jupyter kernels do not define __file__, so infer the layout from the
    # working directory instead.
    cwd = os.getcwd()
    project_root, src_root = resolve_roots_from_cwd(cwd)

# Final unified paths (everything below hangs off src_root)
results_root = os.path.join(src_root, "results")
data_root = os.path.join(src_root, "data", "MAMS-ACSA", "raw", "data_jsonl")
print(f"📂 Project root: {project_root}"
      f"\n📂 Source root: {src_root}"
      f"\n📂 Results root: {results_root}"
      f"\n📂 Data root: {data_root}")

# JSONL dataset splits
TRAIN_JSONL = os.path.join(data_root, "train.jsonl")
VAL_JSONL   = os.path.join(data_root, "val.jsonl")
TEST_JSONL  = os.path.join(data_root, "test.jsonl")
SAMPLE_JSONL = os.path.join(data_root, "sample.jsonl")
print("Using dataset directory:", data_root)



📂 Project root: /Users/hd/Desktop/EMOTION-PRED
📂 Source root: /Users/hd/Desktop/EMOTION-PRED/src
📂 Results root: /Users/hd/Desktop/EMOTION-PRED/src/results
📂 Data root: /Users/hd/Desktop/EMOTION-PRED/src/data/MAMS-ACSA/raw/data_jsonl
Using dataset directory: /Users/hd/Desktop/EMOTION-PRED/src/data/MAMS-ACSA/raw/data_jsonl


In [8]:
# import json
# import os
# import requests
# from dotenv import load_dotenv

# # -----------------------------
# # Load API key
# # -----------------------------
# load_dotenv()
# API_KEY = os.getenv("GEMINI_API_KEY")

# MODEL = "models/gemini-2.5-flash"
# URL = f"https://generativelanguage.googleapis.com/v1beta/{MODEL}:generateContent"

# HEADERS = {
#     "Content-Type": "application/json",
#     "X-goog-api-key": API_KEY
# }

# # -----------------------------
# # Paths
# # -----------------------------
# IN_PATH = os.path.join(data_root, "sample_06_12_2025_6pm_annotated.jsonl")
# EMOTION_JSON = os.path.join(data_root, "emotion.json")

# OUT_DIR = "output"
# OUT_PATH = os.path.join(OUT_DIR, "gemini_annotated.jsonl")
# os.makedirs(OUT_DIR, exist_ok=True)

# # -----------------------------
# # Load data + emotion lists
# # -----------------------------
# data = [json.loads(line) for line in open(IN_PATH, "r", encoding="utf-8")]
# EMOTIONS = json.load(open(EMOTION_JSON, "r", encoding="utf-8"))

# # Build lookup table
# allowed_lookup = {
#     (aspect, polarity): set(EMOTIONS[aspect][polarity])
#     for aspect in EMOTIONS
#     for polarity in EMOTIONS[aspect]
# }

# # -----------------------------
# # Ask Gemini helper
# # -----------------------------
# def ask_gemini(prompt):
#     payload = {
#         "contents": [{"parts": [{"text": prompt}]}]
#     }
#     r = requests.post(URL, headers=HEADERS, json=payload)
#     r.raise_for_status()
#     return r.json()["candidates"][0]["content"]["parts"][0]["text"].strip()

# # -----------------------------
# # Get emotion
# # -----------------------------
# def get_emotion(review, aspect, polarity):
#     allowed = allowed_lookup[(aspect, polarity)]

#     prompt = f"""
# Choose the emotion toward the given aspect.

# Allowed emotions: {list(allowed)}

# Rules:
# - Respond with EXACTLY ONE WORD from the allowed emotions.
# - Do NOT invent new emotions.
# - Do NOT output sentiment words.

# Review: "{review}"
# Aspect: "{aspect}"
# Polarity: "{polarity}"

# Return ONLY the emotion word.
# """

#     resp = ask_gemini(prompt)
#     resp = resp.replace(".", "").replace(",", "").title()

#     if resp in allowed:
#         return resp
#     else:
#         # fallback: first allowed emotion
#         return list(allowed)[0]

# # -----------------------------
# # Annotate file
# # -----------------------------
# for row in data:
#     text = row["input"]
#     for item in row["output"]:
#         item["emotion"] = get_emotion(text, item["aspect"], item["polarity"])

# # -----------------------------
# # Save output
# # -----------------------------
# with open(OUT_PATH, "w", encoding="utf-8") as f:
#     for row in data:
#         f.write(json.dumps(row, ensure_ascii=False) + "\n")

# print("DONE →", OUT_PATH)

In [9]:
import json
import os
import requests
from dotenv import load_dotenv

# -----------------------------
# Load API key
# -----------------------------
# Pull variables from a local .env file (if present) into the environment,
# then read the key from there so it is never hardcoded in the notebook.
load_dotenv()
API_KEY = os.getenv("GEMINI_API_KEY")  # None when the variable is not set

MODEL = "models/gemini-2.5-flash"
URL = f"https://generativelanguage.googleapis.com/v1beta/{MODEL}:generateContent"

HEADERS = {
    "Content-Type": "application/json",
    "X-goog-api-key": API_KEY
}

# -----------------------------
# Paths
# -----------------------------
IN_PATH = os.path.join(data_root, "cleaned.jsonl")
EMOTION_JSON = os.path.join(data_root, "emotion.json")

# NOTE: OUT_DIR is relative, so it resolves against the kernel's working directory.
OUT_DIR = "output"
OUT_PATH = os.path.join(OUT_DIR, "gemini_annotated_aspect_polarity_emotions_200.jsonl")
os.makedirs(OUT_DIR, exist_ok=True)

# -----------------------------
# Load emotion taxonomy
# -----------------------------
# The context manager closes the file handle as soon as the JSON is parsed.
with open(EMOTION_JSON, "r", encoding="utf-8") as emotion_file:
    EMOTIONS = json.load(emotion_file)

# Top-level keys of the taxonomy are the aspect names.
ASPECTS = list(EMOTIONS.keys())
POLARITIES = ["positive", "negative", "neutral"]

# Flatten the nested taxonomy into (aspect, polarity) -> allowed emotions.
allowed_lookup = {
    (aspect, polarity): EMOTIONS[aspect][polarity]
    for aspect in EMOTIONS
    for polarity in EMOTIONS[aspect]
}

# -----------------------------
# Gemini call
# -----------------------------
def ask_gemini(prompt, timeout=60):
    """Send ``prompt`` to the Gemini ``generateContent`` endpoint and return its text.

    Args:
        prompt: Text placed in the single ``parts`` entry of the request body.
        timeout: Seconds passed to ``requests.post`` as its ``timeout``.
            Without one, a stalled connection would block the notebook
            indefinitely.

    Returns:
        The ``text`` of the first part of the first candidate, stripped of
        surrounding whitespace.

    Raises:
        requests.HTTPError: The server answered with an error status
            (raised by ``raise_for_status``).
        requests.RequestException: The request failed or timed out.
        KeyError, IndexError: The JSON reply does not contain
            ``candidates[0].content.parts[0].text``.
    """
    payload = {"contents": [{"parts": [{"text": prompt}]}]}
    r = requests.post(URL, headers=HEADERS, json=payload, timeout=timeout)
    r.raise_for_status()
    return r.json()["candidates"][0]["content"]["parts"][0]["text"].strip()


# -----------------------------
# JSON-safe parsing (2-stage repair)
# -----------------------------
def safe_json_parse(txt):
    """Parse model output as JSON, repairing common formatting slips.

    Attempts, in order:

    1. parse ``txt`` unchanged;
    2. parse it with Markdown code fences (```` ```json ```` / ```` ``` ````) removed;
    3. additionally drop trailing commas that sit directly before ``]`` or
       ``}``, including when whitespace or newlines separate them.

    Args:
        txt: Raw text to parse.

    Returns:
        The decoded JSON value, or ``None`` if every attempt fails or ``txt``
        is not something ``json.loads`` accepts.
    """
    import re  # local import: only the trailing-comma repair needs it

    # Only decoding failures are swallowed, so KeyboardInterrupt / SystemExit
    # still propagate (a bare ``except`` would hide them).
    parse_errors = (ValueError, RecursionError)

    try:
        return json.loads(txt)
    except TypeError:
        # Not str/bytes/bytearray (e.g. None): nothing to repair.
        return None
    except parse_errors:
        pass

    # The string repairs below are only defined for text.
    if not isinstance(txt, str):
        return None

    # Strip Markdown code fences around the payload.
    cleaned = txt.replace("```json", "").replace("```", "").strip()
    try:
        return json.loads(cleaned)
    except parse_errors:
        pass

    # Remove trailing commas such as "[1, 2,]" or "{...},\n]".
    cleaned = re.sub(r",\s*([\]}])", r"\1", cleaned)
    try:
        return json.loads(cleaned)
    except parse_errors:
        return None


# -----------------------------
# Full ABSA extraction with retry
# -----------------------------
def _validate_triples(parsed):
    """Filter parsed model output down to taxonomy-conforming triples.

    Elements that are not JSON objects, whose aspect/polarity are not strings,
    or whose (aspect, polarity) pair is absent from ``allowed_lookup`` are
    dropped. An emotion outside the allowed list is replaced by the first
    allowed emotion for that pair.
    """
    final = []
    for item in parsed:
        # The array may contain strings/numbers/null; only objects have fields.
        if not isinstance(item, dict):
            continue

        asp = item.get("aspect")
        pol = item.get("polarity")

        # Non-string values cannot name a taxonomy entry, and unhashable ones
        # (lists, objects) would make the dictionary lookup raise.
        if not isinstance(asp, str) or not isinstance(pol, str):
            continue
        if (asp, pol) not in allowed_lookup:
            continue

        allowed = allowed_lookup[(asp, pol)]
        if not allowed:
            # No emotion is permitted for this pair, so there is no valid fallback.
            continue

        raw_emo = item.get("emotion")
        text = raw_emo.strip() if isinstance(raw_emo, str) else ""

        # Title-cased form first (the original matching rule), then the text
        # exactly as returned; otherwise fall back to the first allowed emotion.
        emo = next(
            (candidate for candidate in (text.title(), text) if candidate in allowed),
            allowed[0],
        )

        final.append({
            "aspect": asp,
            "polarity": pol,
            "emotion": emo
        })

    return final


def annotate_full(review):
    """Ask Gemini for (aspect, polarity, emotion) triples describing ``review``.

    The prompt lists the allowed aspects, polarities and the full emotion
    taxonomy and requests a bare JSON array. The call is attempted up to three
    times until the reply parses to a list.

    Args:
        review: Review text to annotate.

    Returns:
        A list of ``{"aspect", "polarity", "emotion"}`` dicts restricted to the
        taxonomy, or ``[]`` when no attempt produced a JSON array.

    Raises:
        requests.RequestException: HTTP or network failures from ``ask_gemini``
            are deliberately not caught, so a bad key or an outage stops the
            run instead of silently yielding empty annotations.
    """
    prompt = f"""
Extract aspects, polarity, and emotion.

### Allowed aspects:
{ASPECTS}

### Allowed polarities:
{POLARITIES}

### Allowed emotions:
{json.dumps(EMOTIONS, indent=2)}

### Output format (STRICT):
[
  {{"aspect": "...", "polarity": "...", "emotion": "..."}},
  ...
]

### Rules:
- JSON only.
- No explanations.
- Emotion must be from the allowed list for that aspect/polarity.
- If no aspects → return [].

### Review:
"{review}"

Return ONLY a JSON array.
"""

    parsed = None
    response = None

    # Retry up to 3 times
    for _ in range(3):
        try:
            response = ask_gemini(prompt)
        except (KeyError, IndexError) as err:
            # The reply lacked candidates[0].content.parts[0].text; treat it
            # like unparseable output and try again.
            response = f"<no text in response: {err!r}>"
            continue

        parsed = safe_json_parse(response)
        if isinstance(parsed, list):
            break

    # Final fallback
    if not isinstance(parsed, list):
        print("JSON ERROR →", response)
        return []

    return _validate_triples(parsed)


# -----------------------------
# Load input
# -----------------------------
# Read one JSON object per line. The context manager closes the input file,
# and blank lines (e.g. a trailing newline) are skipped instead of crashing
# json.loads.
with open(IN_PATH, "r", encoding="utf-8") as in_file:
    raw_data = [
        json.loads(line)
        for line in in_file
        if line.strip()
    ]



# -----------------------------
# Annotate all reviews
# -----------------------------
results = []

for row in raw_data:
    review = row["input"]
    # One model round-trip (with retries) per review.
    triples = annotate_full(review)

    results.append({
        "input": review,
        "output": triples
    })

# -----------------------------
# Save output
# -----------------------------
# ensure_ascii=False keeps non-ASCII review text readable in the JSONL file.
with open(OUT_PATH, "w", encoding="utf-8") as f:
    for r in results:
        f.write(json.dumps(r, ensure_ascii=False) + "\n")

print("DONE →", OUT_PATH)

DONE → output/gemini_annotated_aspect_polarity_emotions_200.jsonl
