Let's use the MAMS dataset.

In [None]:
import warnings
# Install an "ignore" filter for every warning category, so no Python
# warnings are shown by later cells.
# NOTE(review): this also hides warnings that may matter (e.g. deprecations
# in the plotting cells) -- confirm the blanket filter is intended.
warnings.filterwarnings("ignore")

In [None]:
import os

def resolve_roots(cwd):
    """Return ``(project_root, src_root)`` for a kernel started in ``cwd``.

    Three launch locations are recognised:

    * ``<project>/src/notebooks`` (directory name ends with "notebooks"):
      ``src_root`` is the parent directory.
    * ``<project>/src``: ``src_root`` is ``cwd`` itself. Without this case
      the fallback below would yield ``<project>/src/src``.
    * anything else: ``cwd`` is taken to be the project root.
    """
    cwd = os.path.normpath(cwd)
    leaf = os.path.basename(cwd)

    if leaf.endswith("notebooks"):
        src = os.path.dirname(cwd)
        return os.path.dirname(src), src
    if leaf == "src":
        return os.path.dirname(cwd), cwd
    return cwd, os.path.join(cwd, "src")


try:
    # Running as a normal Python script inside src/
    this_file = os.path.abspath(__file__)
    src_root = os.path.dirname(this_file)           # EMOTION-PRED/src
    project_root = os.path.dirname(src_root)        # EMOTION-PRED/
except NameError:
    # Running inside Jupyter, where __file__ is not defined: infer the
    # layout from the working directory instead.
    cwd = os.getcwd()
    project_root, src_root = resolve_roots(cwd)

# Final unified paths
results_root = os.path.join(src_root, "results")
data_root = os.path.join(src_root, "data", "MAMS-ACSA", "raw", "data_jsonl")
print(f"ðŸ“‚ Project root: {project_root}"
      f"\nðŸ“‚ Source root: {src_root}"
      f"\nðŸ“‚ Results root: {results_root}"
      f"\nðŸ“‚ Data root: {data_root}")

In [None]:
import os
import json
import pandas as pd

# Requires `data_root` from the unified path resolver cell:
# data_root = os.path.join(src_root, "data", "MAMS-ACSA", "raw", "data_jsonl")

dataset_name = "MAMS-ACSA"

# ----------------------------------------------------------
# 1. Paths
# ----------------------------------------------------------
input_path = os.path.join(data_root, "train.jsonl")
output_csv = os.path.join(data_root, f"{dataset_name.lower()}_train_full.csv")

# Column order of the flattened CSV; row_id is derived from the row position.
CSV_COLUMNS = ["row_id", "sentence", "aspect_term", "polarity", "emotion"]


def flatten_jsonl(path):
    """Read a JSONL file and return one record per (sentence, aspect) pair.

    Each non-blank line must be a JSON object with an "input" string and an
    "output" list of dicts. Missing "aspect"/"polarity" keys become "" and a
    missing "emotion" key becomes None.

    Raises:
        KeyError: if a line lacks "input" or "output".
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    flat = []
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                continue  # tolerate blank lines

            row = json.loads(line)
            sentence = row["input"]

            for item in row["output"]:
                flat.append({
                    "sentence": sentence,
                    "aspect_term": item.get("aspect", ""),
                    "polarity": item.get("polarity", ""),
                    "emotion": item.get("emotion", None),
                })
    return flat


# ----------------------------------------------------------
# 2. Reuse the cached CSV when present, otherwise build it
# ----------------------------------------------------------
# An if/else guard is used instead of `raise SystemExit()` so that a cache
# hit does not abort "Run All" (or the rest of the script).
if os.path.exists(output_csv):
    print(f"Using existing file: {output_csv}")
    df = pd.read_csv(output_csv)
else:
    print(f"Building {os.path.basename(output_csv)} ...")

    # 3. Read JSONL -> flatten aspects
    records = flatten_jsonl(input_path)

    # Passing `columns` keeps the schema even when the input has no records.
    df = pd.DataFrame(records, columns=CSV_COLUMNS[1:])

    # 4. create row_id as the first column
    df.insert(0, "row_id", df.index)

    # 5. save
    df.to_csv(output_csv, index=False)
    print(f"âœ” Saved: {output_csv}")

print(df.head())

In [None]:
import sys, os

# Add src/ to the Python path so the pipeline modules can be imported.
# `src_root` comes from the path resolver cell; a cwd-relative ".." would
# only point at src/ when the kernel runs from src/notebooks.
# The membership check keeps re-runs of this cell from adding duplicates.
if src_root not in sys.path:
    sys.path.append(src_root)

In [None]:
import os
import time
from emotion_pipeline_optimized import run_full_emotion_pipeline

# Per-dataset output folder for the emotion pipeline; created on demand.
RESULT_DIR = os.path.join(results_root, f"emotion_{dataset_name}")
os.makedirs(RESULT_DIR, exist_ok=True)

# Any CSV already in the folder counts as a finished run.
csvs = [entry for entry in os.listdir(RESULT_DIR) if entry.endswith(".csv")]

if csvs:
    print("Pipeline skipped (existing CSVs detected).")
    print("Existing CSVs:", csvs)
else:
    t0 = time.time()
    run_full_emotion_pipeline(
        input_csv=output_csv,
        dataset_name=dataset_name,
        results_root=results_root,
    )
    elapsed = round(time.time() - t0, 2)
    print("Pipeline run:", elapsed, "s")

In [None]:
import os
import pandas as pd

RESULT_DIR = os.path.join(results_root, f"emotion_{dataset_name}")  # or full dataset

# os.listdir returns entries in arbitrary order; sorting makes the column
# order of `merged` reproducible across runs and machines.
csv_files = sorted(f for f in os.listdir(RESULT_DIR) if f.endswith(".csv"))

if not csv_files:
    # Fail with an actionable message instead of an AttributeError on None below.
    raise FileNotFoundError(
        f"No prediction CSVs found in {RESULT_DIR}; run the pipeline cell first."
    )

# File-name stem (without "_annotated.csv") -> short column name used downstream.
MODEL_NAME_MAP = {
    "j_hartmann_emotion_english_roberta_large": "roberta_large",
    "nateraw_bert_base_uncased_emotion": "bert_base_emotion",
    "j_hartmann_emotion_english_distilroberta_base": "roberta_emotion",
    "joeddav_distilbert_base_uncased_go_emotions_student": "go_emotions",
    "cardiffnlp_twitter_roberta_base_emotion": "tweet_eval",
    "mrm8488_t5_base_finetuned_emotion": "t5_emotion",
    "SamLowe_roberta_base_go_emotions": "go_emotions_roberta",
}

merged = None

for f in csv_files:
    full_path = os.path.join(RESULT_DIR, f)
    model_df = pd.read_csv(full_path)

    # Derive the model name from the file name; unknown stems are used as-is.
    base = f.replace("_annotated.csv", "")
    colname = MODEL_NAME_MAP.get(base, base)

    # Keep only row_id + this model's prediction, renamed to the short name.
    model_df = model_df[["row_id", "emotion_auto"]].rename(columns={
        "emotion_auto": colname
    })

    # The first file initialises the table; later files are joined on row_id.
    # NOTE(review): the inner join drops any row_id missing from a model's
    # file -- confirm every model annotates the same rows.
    if merged is None:
        merged = model_df
    else:
        merged = merged.merge(model_df, on="row_id", how="inner")

pd.set_option("display.max_colwidth", None)
print(merged.head(5))

In [None]:
import pandas as pd

# 1) columns to use for voting
# These names are the values of MODEL_NAME_MAP in the merge cell, i.e. the
# per-model prediction columns of `merged`. The list order matters: the
# majority vote below resolves ties in favour of the label seen first
# when iterating these columns.
model_cols = [
    "roberta_large",
    "bert_base_emotion",
    "roberta_emotion",
    "go_emotions",
    "tweet_eval",
    "t5_emotion",
    "go_emotions_roberta",
]

# 2) majority vote function
def get_majority_emotion(row, cols=None):
    """Return the most frequent non-null label among the voting columns.

    Args:
        row: mapping-like record (e.g. a DataFrame row) holding one
            predicted label per model column.
        cols: columns that take part in the vote. Defaults to the
            module-level ``model_cols``, so ``merged.apply(..., axis=1)``
            keeps working unchanged.

    Returns:
        The label with the highest vote count, or None when no column holds
        a non-null value. Ties are resolved in favour of the label that
        appears first in column order.
    """
    if cols is None:
        cols = model_cols

    # Columns missing from the row and null predictions do not vote.
    votes = [row[col] for col in cols if col in row and pd.notna(row[col])]
    if not votes:
        return None

    tally = {}
    for emo in votes:
        tally[emo] = tally.get(emo, 0) + 1

    # max() keeps the first maximal key; dicts preserve insertion order,
    # which is what gives the column-order tie-break.
    return max(tally, key=tally.get)

# 3) run the vote row by row and store the winner next to the model columns
consensus_votes = merged.apply(get_majority_emotion, axis=1)
merged["consensus_emotion"] = consensus_votes

# ----------------------------------------------------------
# 4) Add consensus_count = number of model votes
# ----------------------------------------------------------
def get_consensus_count(row, cols=None):
    """Count how many voting columns agree with the row's consensus label.

    Args:
        row: mapping-like record with a "consensus_emotion" entry and one
            entry per model column.
        cols: columns that take part in the vote. Defaults to the
            module-level ``model_cols``.

    Returns:
        The number of columns whose value equals "consensus_emotion", or 0
        when the consensus label is null. Columns absent from the row are
        skipped, matching the guard in ``get_majority_emotion``, instead of
        raising KeyError.
    """
    if cols is None:
        cols = model_cols

    emo = row["consensus_emotion"]
    if pd.isna(emo):
        return 0

    return sum(1 for col in cols if col in row and row[col] == emo)

agreement = merged.apply(get_consensus_count, axis=1)
merged["consensus_count"] = agreement

# 5) quick check: model votes side by side with the consensus columns
preview_cols = ["row_id"] + model_cols + ["consensus_emotion", "consensus_count"]
merged[preview_cols].head(10)

In [None]:
# Absolute frequency of each consensus label.
consensus_counts = merged["consensus_emotion"].value_counts()
print("=== Consensus Emotion Distribution ===")
print(consensus_counts)

# Same distribution as percentages, rounded to two decimals.
consensus_percent = (merged["consensus_emotion"].value_counts(normalize=True) * 100).round(2)
print("=== Consensus Emotion Distribution (%) ===")
print(consensus_percent)

# Combine both views, reusing the series computed above instead of
# recomputing value_counts.
consensus_stats = pd.DataFrame({
    "count": consensus_counts,
    "percent": consensus_percent,
})

print("=== Consensus Statistics ===")
# A bare expression is only rendered when it is the last statement of a
# cell, so the table has to be printed explicitly here.
print(consensus_stats)

# How many rows reached each level of agreement, ordered by vote count.
strength_counts = merged["consensus_count"].value_counts().sort_index()
print("=== Consensus Strength (How many models agreed) ===")
print(strength_counts)

In [None]:
print("=== Per-model emotion counts ===\n")

# One section per model: header, label frequencies, separator line.
divider = "-" * 40
for model_col in model_cols:
    print(f"Model: {model_col}", merged[model_col].value_counts(), divider, sep="\n")

In [None]:
# Combined table: emotions x models
# Each column holds one model's label frequencies; labels a model never
# predicted appear as NaN after alignment and are turned into 0.
stats = {model_col: merged[model_col].value_counts() for model_col in model_cols}

stats_df = pd.DataFrame(stats).fillna(0).astype(int)

print("=== Emotion frequency per model (rows = emotions, columns = models) ===")
print(stats_df)

# Consensus distribution
consensus_distribution = merged["consensus_emotion"].value_counts()
print("\n=== Consensus label distribution ===")
print(consensus_distribution)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Count once and reuse: the bars are drawn in descending-frequency order,
# which is the order value_counts() returns.
emotion_counts = merged["consensus_emotion"].value_counts()
order = emotion_counts.index

fig, ax = plt.subplots(figsize=(10, 6))

sns.barplot(
    x=order,
    y=emotion_counts.values,
    palette="viridis",
    ax=ax,
)

ax.set_title("Consensus Emotion Distribution", fontsize=16)
ax.set_xlabel("Emotion", fontsize=14)
ax.set_ylabel("Count", fontsize=14)
ax.tick_params(axis="x", labelrotation=45)
fig.tight_layout()
plt.show()

In [None]:
plt.figure(figsize=(8, 5))

# Number of rows per agreement level, ordered by vote count.
strength = merged["consensus_count"].value_counts().sort_index()

sns.barplot(
    x=strength.index,
    y=strength.values,
    palette="magma"
)

plt.title("Consensus Strength (Number of Models Agreeing)", fontsize=16)
plt.xlabel("Consensus Count (Votes)", fontsize=14)
plt.ylabel("Number of Sentences", fontsize=14)
# No manual plt.xticks(strength.index) here: barplot draws the bars at
# categorical positions 0..n-1 and already labels each with its vote count.
# Forcing ticks to the vote-count values shifts them off the bars.
plt.tight_layout()
plt.show()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# ----------------------------------------------------
# 1. Work on a copy so the remapping leaves `merged` untouched
# ----------------------------------------------------
merged_final = merged.copy()
for caption, frame in (
    ("Original merged shape:", merged),
    ("New merged_final shape:", merged_final),
):
    print(caption, frame.shape)

# ----------------------------------------------------
# 2. Define final emotion set
# ----------------------------------------------------
FINAL_EMOTIONS = [
    "neutral", "anger", "joy", "sadness", "fear", "surprise", "disgust"
]

# ----------------------------------------------------
# 3. Fine-grained -> Basic emotion mapping
# ----------------------------------------------------
# Covers every GoEmotions label (27 emotions + neutral) plus the labels of
# the coarser models, so no known label falls through to the "neutral"
# fallback by accident.
emotion_mapping = {
    # Joy cluster
    "joy": "joy",
    "amusement": "joy",
    "excitement": "joy",
    "optimism": "joy",
    "gratitude": "joy",
    "pride": "joy",
    "love": "joy",
    "admiration": "joy",
    "approval": "joy",
    "caring": "joy",
    "desire": "joy",
    "relief": "joy",            # GoEmotions label, previously unmapped

    # Anger cluster
    "anger": "anger",
    "annoyance": "anger",
    "disapproval": "anger",
    "disappointment": "anger",  # Could be sadness; restaurant domain fits anger

    # Sadness cluster
    "sadness": "sadness",
    "remorse": "sadness",
    "embarrassment": "sadness",
    "grief": "sadness",         # GoEmotions label, previously unmapped

    # Fear cluster
    "fear": "fear",
    "nervousness": "fear",

    # Surprise cluster
    "surprise": "surprise",
    "realization": "surprise",
    "confusion": "surprise",
    "curiosity": "surprise",    # GoEmotions label, previously unmapped

    # Disgust cluster
    "disgust": "disgust",
    "repulsion": "disgust",

    # Neutral
    "neutral": "neutral",
}

# Invariant: every mapping target must be one of the final classes.
assert set(emotion_mapping.values()) <= set(FINAL_EMOTIONS), "mapping targets outside FINAL_EMOTIONS"

# Safety fallback: unmapped -> neutral
def map_final_label(x, mapping=None, default="neutral"):
    """Map a fine-grained emotion label to its basic-emotion class.

    Args:
        x: label to translate (must be hashable).
        mapping: lookup table from fine-grained to basic labels. Defaults to
            the module-level ``emotion_mapping``.
        default: class returned for labels missing from ``mapping``; this
            includes None and NaN.

    Returns:
        The mapped class, or ``default`` when ``x`` has no entry.
    """
    if mapping is None:
        mapping = emotion_mapping
    return mapping.get(x, default)

# ----------------------------------------------------
# 4. Apply mapping
# ----------------------------------------------------
raw_labels = merged_final["consensus_emotion"]
merged_final["emotion_final"] = raw_labels.apply(map_final_label)

# ----------------------------------------------------
# 5. BEFORE -> AFTER distributions
# ----------------------------------------------------
print("=== BEFORE (Consensus Emotion Distribution) ===")
print(raw_labels.value_counts())

print("\n=== AFTER (Final 7-Class Emotion Distribution) ===")
print(merged_final["emotion_final"].value_counts())

# ----------------------------------------------------
# 6. Check if any labels were not mapped
# ----------------------------------------------------
# Distinct consensus labels with no mapping entry; these were sent to the
# fallback class by map_final_label.
has_mapping = raw_labels.isin(emotion_mapping.keys())
unmapped = raw_labels[~has_mapping].unique()

print("\nUnmapped labels:", unmapped)

# Should output: []


# ----------------------------------------------------
# 7. Visualization of final 7-class distribution
# ----------------------------------------------------
final_counts = merged_final["emotion_final"].value_counts()

plt.figure(figsize=(10, 6))
sns.barplot(
    x=final_counts.index,
    y=final_counts.values,
    palette="coolwarm_r"
)

plt.title("Final 7-Emotion Distribution (After Mapping)", fontsize=16)
plt.xlabel("Emotion", fontsize=14)
plt.ylabel("Count", fontsize=14)
plt.tight_layout()
plt.show()

In [None]:
from emotion_pipeline_new import annotate
# Single-sentence smoke test: one row whose aspect_term cell is a list of
# two aspects.
demo_rows = [
    {
        "sentence": "Service was slow but the staff were very friendly.",
        "aspect_term": ["service", "staff"],
    }
]
df = pd.DataFrame(demo_rows)

preds = annotate(df)
for field, result in preds.items():
    print(f"{field}: {result}")

In [None]:
from transformers import pipeline

# Run one of the ensemble checkpoints directly on aspect-prefixed inputs.
model_name = "cardiffnlp/twitter-roberta-base-emotion"
pipe = pipeline(
    task="text-classification",
    model=model_name,
    tokenizer=model_name,
)

# Same sentence twice, each time prefixed with a different aspect marker.
texts = [
    "[ASPECT] service [SENTENCE] Service was slow but the staff were very friendly.",
    "[ASPECT] staff [SENTENCE] Service was slow but the staff were very friendly.",
]

demo_predictions = pipe(texts)
demo_predictions