In [None]:
import os

def resolve_roots_from_cwd(cwd):
    """Infer ``(project_root, src_root)`` from a working directory.

    Used when ``__file__`` is unavailable (e.g. inside a Jupyter kernel).

    Three layouts are recognized:

    * ``cwd`` ends with ``notebooks``: the parent directory is the source
      root and its parent is the project root.
    * ``cwd`` is a directory named ``src`` that has no nested ``src/``:
      ``cwd`` itself is the source root.  Without this case the source root
      would wrongly resolve to ``src/src``.
    * anything else: ``cwd`` is taken to be the project root.

    Args:
        cwd: Absolute path of the current working directory.

    Returns:
        A ``(project_root, src_root)`` tuple of path strings.
    """
    if cwd.endswith("notebooks"):
        # src/notebooks → go up one level to reach src/
        src_dir = os.path.abspath(os.path.join(cwd, ".."))
        return os.path.dirname(src_dir), src_dir

    # The nested-src check keeps the old behavior for a project root that
    # happens to be called "src" and contains its own src/ directory.
    nested_src = os.path.join(cwd, "src")
    if os.path.basename(cwd) == "src" and not os.path.isdir(nested_src):
        return os.path.dirname(cwd), cwd

    # Running from the project root directly
    return cwd, nested_src


try:
    # Running as a normal Python script inside src/
    this_file = os.path.abspath(__file__)
    src_root = os.path.dirname(this_file)                        # EMOTION-PRED/src
    project_root = os.path.dirname(src_root)                    # EMOTION-PRED/
except NameError:
    # __file__ is undefined, so fall back to the working directory
    # (src/notebooks, src/ or the project root).
    cwd = os.getcwd()
    project_root, src_root = resolve_roots_from_cwd(cwd)

# Final unified paths
results_root = os.path.join(src_root, "results")
data_root = os.path.join(src_root, "data", "MAMS-ACSA", "raw", "data_jsonl")
print(f"📂 Project root: {project_root}"
      f"\n📂 Source root: {src_root}"
      f"\n📂 Results root: {results_root}"
      f"\n📂 Data root: {data_root}")

# JSONL dataset files inside data_root
TRAIN_JSONL = os.path.join(data_root, "train.jsonl")
VAL_JSONL   = os.path.join(data_root, "val.jsonl")
TEST_JSONL  = os.path.join(data_root, "test.jsonl")
SAMPLE_JSONL = os.path.join(data_root, "sample.jsonl")
print("Using dataset directory:", data_root)



In [None]:
import json
import re

# INPUT is the annotated sample that gets read; OUTPUT is where the cleaned
# rows are written. Both live in the dataset directory.
INPUT, OUTPUT = (
    os.path.join(data_root, file_name)
    for file_name in ("sample_06_12_2025_6pm_annotated.jsonl", "cleaned.jsonl")
)

# Closed set of emotion labels a cleaned row may carry (alphabetical).
ALLOWED = {
    "Admiration", "Annoyance", "Approval", "Confusion",
    "Disappointment", "Disapproval", "Excitement", "Fear",
    "Frustration", "Gratitude", "Impressed", "Indifferent",
    "Joy", "Relaxation", "Relief", "Satisfaction", "Surprise",
}

# Raw values that are mapped straight to "Indifferent": missing markers,
# "mixed"/undefined placeholders and misspellings of "indifferent".
INVALID = {
    None,
    "",
    "null",
    "None",
    "na",
    "n/a",
    "undefined",
    "notdefined",
    "mixed",
    "mixed emotions",
    "indiffernet",
    "indiffrent",
}

# Lower-cased raw value -> corrected label.
# NOTE(review): "Appreciation" and "Disgust" are not members of ALLOWED, so
# normalize_emotion still ends up returning "Indifferent" for them — confirm
# whether they should be added to ALLOWED or remapped to an allowed label.
SPELLING = {
    # Misspellings and variant forms.
    "appriciation": "Appreciation",
    "satisified": "Satisfaction",
    "satisfied": "Satisfaction",
    "releif": "Relief",
    # Lower-case spellings mapped to their capitalized form.
    **{
        label.lower(): label
        for label in (
            "Impressed", "Admiration", "Gratitude", "Appreciation",
            "Indifferent", "Joy", "Fear", "Confusion",
            "Disappointment", "Annoyance", "Frustration",
            "Disgust",  # if needed
        )
    },
}

def fix_trailing_commas(s):
    """Remove trailing commas that precede a closing ``}`` or ``]``.

    A comma followed only by optional whitespace and a closing brace or
    bracket is dropped; the whitespace and the closer are kept.  Commas
    inside double-quoted JSON string literals are left untouched, so text
    such as ``"ok, ]"`` is not altered.

    Args:
        s: One line of (possibly slightly malformed) JSON text.

    Returns:
        The text with structural trailing commas removed.
    """
    def _keep_string_or_drop_comma(match):
        closer = match.group(1)
        # group(1) is only set by the trailing-comma alternative; a match of
        # the string-literal alternative is returned verbatim.
        return match.group(0) if closer is None else closer

    # First alternative consumes a whole string literal (including escaped
    # quotes) so the comma alternative can never fire inside one.
    return re.sub(
        r'"(?:\\.|[^"\\])*"|,(\s*[}\]])',
        _keep_string_or_drop_comma,
        s,
    )

def normalize_emotion(e):
    """Map a raw emotion annotation onto one of the ALLOWED labels.

    The value is converted to a string and stripped of surrounding
    whitespace.  If its lower-cased form is a key of SPELLING the mapped
    label is used instead, and the result is then capitalized.  ``None``,
    any value listed in INVALID, and any result that is not in ALLOWED all
    come back as ``"Indifferent"``.

    Args:
        e: Raw annotation value; may be ``None`` or a non-string.

    Returns:
        A member of ALLOWED.
    """
    fallback = "Indifferent"

    if e is None:
        return fallback

    text = str(e).strip()
    if text in INVALID:
        return fallback

    # Spelling table first (keyed by lower-case), then unify capitalization.
    candidate = SPELLING.get(text.lower(), text).capitalize()

    return candidate if candidate in ALLOWED else fallback

# Clean the annotated JSONL file: repair trailing commas, parse each row,
# normalize every output item's "emotion", and write the surviving rows.
cleaned = []  # parsed rows with normalized emotions, in input order
errors = []   # (line number, raw text, message) for every skipped row

with open(INPUT, "r", encoding="utf-8") as f:
    for line_no, line in enumerate(f, start=1):
        raw = line.strip()
        if not raw:
            # Blank lines are neither rows nor errors.
            continue

        try:
            obj = json.loads(fix_trailing_commas(raw))
        except (json.JSONDecodeError, RecursionError) as exc:
            errors.append((line_no, raw, str(exc)))
            continue

        # A row must be a JSON object; anything else cannot hold "output"
        # and would make the lookups below raise.
        if not isinstance(obj, dict):
            errors.append(
                (line_no, raw, f"row is not a JSON object (got {type(obj).__name__})")
            )
            continue

        # Fix emotions in outputs
        if "output" in obj:
            outputs = obj["output"]
            # Record malformed structure instead of letting one bad row
            # abort the whole run with TypeError/AttributeError.
            if not isinstance(outputs, list) or not all(
                isinstance(item, dict) for item in outputs
            ):
                errors.append((line_no, raw, "'output' is not a list of JSON objects"))
                continue
            for item in outputs:
                item["emotion"] = normalize_emotion(item.get("emotion"))

        cleaned.append(obj)

# Write clean JSONL
with open(OUTPUT, "w", encoding="utf-8") as f:
    for obj in cleaned:
        f.write(json.dumps(obj, ensure_ascii=False) + "\n")

print("=== DONE CLEANING ===")
print("Valid rows:", len(cleaned))
print("Invalid rows skipped:", len(errors))

if errors:
    print("\nErrors found in rows:")
    # The raw text is kept in `errors` for inspection but not printed here.
    for ln, _, err in errors:
        print(f"Line {ln}: {err}")