In [4]:
import json
import pandas as pd

# -----------------------------
# Load data
# -----------------------------
# JSON files are conventionally UTF-8; pass the encoding explicitly so the
# read does not depend on the platform's default locale encoding.
with open("emotions.json", "r", encoding="utf-8") as f:
    EMO = json.load(f)

# The sample file is read as JSON Lines (one JSON record per line).
df = pd.read_json("./input/sample.jsonl", lines=True)

# Allowed aspects/polarities
ASPECTS = list(EMO)
POLARITIES = ["positive", "negative", "neutral"]

# Build emotion lookup by aspect + polarity:
# (aspect, polarity) -> set of emotion labels listed under EMO[aspect][polarity].
allowed_emotions = {
    (aspect, pol): set(emotions)
    for aspect, by_polarity in EMO.items()
    for pol, emotions in by_polarity.items()
}

# -----------------------------
# Validation functions
# -----------------------------
def _is_hashable(value):
    """Return True if *value* can be used as a set member / dict key."""
    try:
        hash(value)
    except TypeError:
        return False
    return True


def validate_row(row_idx, row):
    """Validate one dataset row and return a list of error messages.

    Checks performed, in order:

    1. Structure: ``row`` is a dict with ``"input"`` and ``"output"`` keys,
       and ``row["output"]`` is a list. Any failure here returns immediately.
    2. Per annotation: ``aspect`` is in ``ASPECTS``.
    3. Per annotation: ``polarity`` is in ``POLARITIES``.
    4. Per annotation: the ``(aspect, polarity)`` pair was not already seen
       in this row.
    5. Per annotation: ``emotion`` is present (it may be omitted only when
       the polarity is ``"neutral"``) and is listed in ``allowed_emotions``
       for that ``(aspect, polarity)`` pair.

    Malformed annotations whose fields are unhashable (e.g. a list or dict
    where a string is expected) are reported as validation errors rather
    than raising ``TypeError``.

    Args:
        row_idx: Identifier of the row. Currently unused; kept so existing
            callers do not need to change.
        row: The row to validate, expected to be a dict.

    Returns:
        A list of human-readable error strings; empty when the row is valid.
    """
    errors = []

    # 1. Structure checks
    if not isinstance(row, dict):
        errors.append("Row is not a dict")
        return errors

    if "input" not in row or "output" not in row:
        errors.append("Missing 'input' or 'output'")
        return errors

    if not isinstance(row["output"], list):
        errors.append("'output' must be a list")
        return errors

    # Track duplicates
    seen_pairs = set()

    # Validate each annotation
    for ann in row["output"]:
        if not isinstance(ann, dict):
            errors.append("Annotation is not a dict")
            continue

        aspect = ann.get("aspect")
        pol = ann.get("polarity")
        emo = ann.get("emotion")

        # 2. Aspect validity
        if aspect not in ASPECTS:
            errors.append(f"Invalid aspect: {aspect}")

        # 3. Polarity validity
        if pol not in POLARITIES:
            errors.append(f"Invalid polarity: {pol}")

        # 4. Duplicate (aspect, polarity)
        key = (aspect, pol)
        key_is_hashable = _is_hashable(key)
        # An unhashable pair cannot go into a set directly; fall back to its
        # repr so repeated malformed pairs are still flagged as duplicates.
        dedup_key = key if key_is_hashable else repr(key)
        if dedup_key in seen_pairs:
            errors.append(f"Duplicate annotation for {aspect}/{pol}")
        else:
            seen_pairs.add(dedup_key)

        # 5. Emotion validation
        if emo is None or emo == "":
            # Neutral emotion may be optional
            if pol != "neutral":
                errors.append(f"Missing emotion for aspect {aspect}")
            continue

        # An unhashable pair can never be a key of the lookup table, so it
        # is reported as an unknown combination instead of raising.
        if key_is_hashable and key in allowed_emotions:
            # Likewise an unhashable emotion can never be an allowed label.
            if not _is_hashable(emo) or emo not in allowed_emotions[key]:
                errors.append(f"Emotion '{emo}' not allowed for {aspect}/{pol}")
        else:
            errors.append(f"Aspect/polarity combo not found: {aspect}/{pol}")

    return errors


# -----------------------------
# Run validation
# -----------------------------
# Maps the index label of each failing row to its list of error messages;
# rows that validate cleanly are not recorded.
validation_results = {}

for row_label, record in df.iterrows():
    problems = validate_row(row_label, record.to_dict())
    if not problems:
        continue
    validation_results[row_label] = problems

# -----------------------------
# Summary report
# -----------------------------
# Headline counts first, then a per-row breakdown when anything failed.
print("====== VALIDATION SUMMARY ======")
print(f"Total rows: {len(df)}")
print(f"Rows with issues: {len(validation_results)}")

if validation_results:
    print("\nDetailed issues:")
    for row_label, messages in validation_results.items():
        print(f"\nRow {row_label}:")
        for message in messages:
            print("  -", message)

Total rows: 51
Rows with issues: 0
