Let's use the MAMS dataset.

In [None]:
import warnings
# Install a blanket "ignore" filter so no Python warnings are shown by the
# cells that follow. NOTE(review): this also hides deprecation warnings from
# the libraries used below -- narrow the filter if those become relevant.
warnings.filterwarnings("ignore")

In [None]:
import os


def resolve_roots_from_cwd(cwd):
    """Infer ``(project_root, src_root)`` from a working directory.

    Used when ``__file__`` is unavailable (i.e. inside Jupyter). Three
    layouts are recognised, based on the last path component of ``cwd``:

    * ``.../src/notebooks`` -> src root is the parent directory
    * ``.../src``           -> ``cwd`` itself is the src root
    * anything else         -> ``cwd`` is taken to be the project root

    Parameters
    ----------
    cwd : str
        Directory the kernel is running in.

    Returns
    -------
    tuple[str, str]
        ``(project_root, src_root)``.
    """
    cwd = os.path.normpath(cwd)
    folder = os.path.basename(cwd)

    if folder.endswith("notebooks"):
        # Running inside src/notebooks -> go up one level
        src_root = os.path.abspath(os.path.join(cwd, ".."))
        return os.path.dirname(src_root), src_root

    if folder == "src":
        # Running directly inside src/. Without this case the directory would
        # be mistaken for the project root and src_root would be "src/src".
        return os.path.dirname(cwd), cwd

    # Running from project root directly
    return cwd, os.path.join(cwd, "src")


try:
    # Running as normal Python script inside src/
    this_file = os.path.abspath(__file__)
    src_root = os.path.dirname(this_file)                        # EMOTION-PRED/src
    project_root = os.path.dirname(src_root)                    # EMOTION-PRED/
except NameError:
    # Running inside Jupyter (likely src/notebooks or src/): __file__ is not
    # defined there, so fall back to the kernel's working directory.
    cwd = os.getcwd()
    project_root, src_root = resolve_roots_from_cwd(cwd)

# Final unified paths
results_root = os.path.join(src_root, "results")
data_root = os.path.join(src_root, "data")
print(f"📂 Project root: {project_root}"
      f"\n📂 Source root: {src_root}"
      f"\n📂 Results root: {results_root}"
      f"\n📂 Data root: {data_root}")

In [None]:
from datasets import load_dataset

# Load the MAMS dataset through the `datasets` library, using the
# identifier "NEUDM/mams". The returned object is indexed by split name
# in the next cell (ds["train"]).
ds = load_dataset("NEUDM/mams")

# Print the dataset summary as reported by the library
print(ds)

In [None]:
# Pull out the "train" split and convert it to a pandas DataFrame
train_ds = ds["train"].to_pandas()


# Report the frame's dimensions (rows first, then columns)
row_count, column_count = train_ds.shape
print(f"Number of rows: {row_count}")
print(f"Number of columns: {column_count}")

# Show which columns are available
column_names = train_ds.columns.tolist()
print("Column names:", column_names)

In [None]:
# Subset of columns retained from the training frame; an independent copy is
# taken so later edits do not touch train_ds.
keep_cols = ["input", "output"]
clean_df = train_ds.loc[:, keep_cols].copy()


In [19]:
import os
import ast
import pandas as pd

# -----------------------------------------
# 1. Target CSV location
# -----------------------------------------
input_csv = os.path.join(data_root, "mams_train_full.csv")
# File name without directory or extension. os.path is used instead of manual
# "."/"/" splitting so the result is also correct with Windows separators.
dataset_name = os.path.splitext(os.path.basename(input_csv))[0]  # mams_train_full


def clean_list_string(x):
    """Return the first element of a one-item list cell, e.g. "['text']" -> "text".

    Accepted inputs:

    * a ``str`` holding a Python list literal -> parsed with
      ``ast.literal_eval`` and the first element is returned;
    * an actual ``list``/``tuple``, or any object exposing ``.tolist()``
      (e.g. an array cell) -> the first element is returned directly.

    Anything else (unparseable strings, empty containers, non-list literals,
    other types) is returned unchanged.
    """
    original = x

    # Array-like cells are converted to plain Python containers first.
    # Previously these went straight into ast.literal_eval, which raised and
    # was swallowed by a bare `except`, leaving the brackets in place.
    if not isinstance(x, (str, bytes)) and hasattr(x, "tolist"):
        x = x.tolist()

    if isinstance(x, (list, tuple)):
        return x[0] if len(x) > 0 else original

    if not isinstance(x, str):
        return original

    try:
        parsed = ast.literal_eval(x)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # Not a Python literal -> treat as an already-clean sentence
        return original

    if isinstance(parsed, list) and len(parsed) > 0:
        return parsed[0]
    return original


# -----------------------------------------
# 2. If exists -> reuse it
# -----------------------------------------
if os.path.exists(input_csv):
    # NOTE(review): a file cached by an earlier run is reused as-is. If its
    # `sentence` column still shows bracketed values, delete the file and
    # re-run this cell to rebuild it with the cleaning above.
    print(f"Using existing file: {input_csv}")
    df_preview = pd.read_csv(input_csv)
    print(df_preview.head())
else:
    print("File not found → building mams_train_full.csv ...")

    # ---- A) Copy dataset ----
    df_full = clean_df.copy()

    # ---- B) Stable row ID ----
    df_full["row_id"] = df_full.index

    # ---- C) Clean string format "['text']" -> "text" ----
    df_full["sentence"] = df_full["input"].apply(clean_list_string)

    # ---- D) Empty aspect column (pipeline requirement) ----
    df_full["aspect_term"] = ""

    # ---- E) Save final dataset ----
    # Make sure the target directory exists; to_csv does not create it.
    os.makedirs(os.path.dirname(input_csv) or ".", exist_ok=True)
    df_full[["row_id", "sentence", "aspect_term"]].to_csv(
        input_csv, index=False
    )

    print(f"✔ Built and saved: {input_csv}")
    print(df_full[["row_id", "sentence", "aspect_term"]].head())

Using existing file: /Users/hd/Desktop/EMOTION-PRED/src/data/mams_train_full.csv
   row_id                                           sentence  aspect_term
0       0  ["It might be the best sit down food I've had ...          NaN
1       1  ['Hostess was extremely accommodating when we ...          NaN
2       2  ["We were a couple of minutes late for our res...          NaN
3       3  ['Though the service might be a little slow, t...          NaN
4       4  ['Although we arrived at the restaurant 10 min...          NaN


In [None]:
import sys
import os

# Add src/ to Python path.
# src_root (resolved in the path-setup cell) is used rather than "cwd/..",
# because the parent of the working directory is only src/ when the kernel
# happens to run from src/notebooks.
src_path = os.path.abspath(src_root)
if src_path not in sys.path:
    # Guard keeps sys.path free of duplicates when the cell is re-run
    sys.path.append(src_path)

In [None]:
# Alternative entry point with the same call signature:
#   from emotion_pipeline import run_full_emotion_pipeline

from emotion_pipeline_optimized import run_full_emotion_pipeline
import time

# perf_counter is monotonic, so the measured duration cannot be distorted by
# system clock adjustments during the run.
start = time.perf_counter()

run_full_emotion_pipeline(
    input_csv=input_csv,
    # Pass the name derived from input_csv so it always matches the
    # f"emotion_{dataset_name}" directory the results are read back from.
    dataset_name=dataset_name,
    results_root=results_root,
)

end = time.perf_counter()
print("External total:", end - start)

In [22]:
import os
import pandas as pd

# Directory holding one annotated CSV per model for this dataset
RESULT_DIR = os.path.join(results_root, f"emotion_{dataset_name}")  # or full dataset
# Sorted because os.listdir returns entries in arbitrary order; without it
# the column order of `merged` could differ between runs and machines.
csv_files = sorted(f for f in os.listdir(RESULT_DIR) if f.endswith(".csv"))

if not csv_files:
    # Fail early with a clear message instead of crashing later on `None`
    raise FileNotFoundError(f"No result CSV files found in {RESULT_DIR}")

# File-name stem (without "_annotated.csv") -> short column name
MODEL_NAME_MAP = {
    "j_hartmann_emotion_english_roberta_large": "roberta_large",
    "nateraw_bert_base_uncased_emotion": "bert_base_emotion",
    "j_hartmann_emotion_english_distilroberta_base": "roberta_emotion",
    "joeddav_distilbert_base_uncased_go_emotions_student": "go_emotions",
    "cardiffnlp_twitter_roberta_base_emotion": "tweet_eval",
    "mrm8488_t5_base_finetuned_emotion": "t5_emotion",
    "SamLowe_roberta_base_go_emotions": "go_emotions_roberta",
}

REQUIRED_COLS = ["row_id", "emotion_auto"]

merged = None

for f in csv_files:
    full_path = os.path.join(RESULT_DIR, f)
    df = pd.read_csv(full_path)

    # Name the offending file when a CSV lacks the expected columns
    missing = [c for c in REQUIRED_COLS if c not in df.columns]
    if missing:
        raise KeyError(f"{full_path} is missing required column(s): {missing}")

    # get model name (unknown stems fall back to the stem itself)
    base = f.replace("_annotated.csv", "")
    colname = MODEL_NAME_MAP.get(base, base)

    # keep only row_id + emotion_auto
    df = df[REQUIRED_COLS].rename(columns={
        "emotion_auto": colname
    })

    # first file initializes
    if merged is None:
        merged = df
    else:
        # Inner join: only row_ids present in every model's file are kept
        merged = merged.merge(df, on="row_id", how="inner")

pd.set_option("display.max_colwidth", None)
print(merged.head(5))

   row_id go_emotions_roberta roberta_large bert_base_emotion roberta_emotion  \
0       0             neutral       neutral               joy         neutral   
1       1             neutral       neutral               joy         neutral   
2       2             neutral         anger             anger         neutral   
3       3             neutral       neutral               joy         neutral   
4       4             neutral         anger             anger         neutral   

  t5_emotion  go_emotions tweet_eval  
0        joy       caring        joy  
1        joy       caring      anger  
2      anger  disapproval      anger  
3        joy       caring      anger  
4      anger    annoyance      anger  


In [None]:
import pandas as pd

# 1) model prediction columns that take part in the vote
model_cols = [
    "roberta_large",
    "bert_base_emotion",
    "roberta_emotion",
    "go_emotions",
    "tweet_eval",
    "t5_emotion",
    "go_emotions_roberta",
]

# 2) majority vote function
def get_majority_emotion(row):
    """Return the most frequent non-missing label across ``model_cols``.

    Columns absent from ``row`` and missing values are skipped. When several
    labels share the top count, the one encountered first (following the
    order of ``model_cols``) wins. Returns ``None`` if no vote is available.
    """
    tally = {}

    for model in model_cols:
        if model not in row:
            continue
        label = row[model]
        if pd.isna(label):
            continue
        tally[label] = tally.get(label, 0) + 1

    if not tally:
        return None

    # max() keeps the first key among equal counts (dict insertion order)
    return max(tally, key=tally.get)

# 3) compute the consensus label for every row
merged["consensus_emotion"] = merged.apply(get_majority_emotion, axis=1)

# 4) quick check
preview_cols = ["row_id"] + model_cols + ["consensus_emotion"]
merged[preview_cols].head(10)

In [None]:
print("=== Per-model emotion counts ===\n")

# One frequency listing per model column, separated by a 40-dash rule
divider = "-" * 40
for model_name in model_cols:
    print(f"Model: {model_name}")
    print(merged[model_name].value_counts())
    print(divider)

In [None]:
# Combined table: emotions x models
stats = {model_name: merged[model_name].value_counts() for model_name in model_cols}

# Labels a model never produced come out as NaN; show them as integer 0
counts_table = pd.DataFrame(stats)
stats_df = counts_table.fillna(0).astype(int)

print("=== Emotion frequency per model (rows = emotions, columns = models) ===")
print(stats_df)

# Consensus distribution
consensus_counts = merged["consensus_emotion"].value_counts()
print("\n=== Consensus label distribution ===")
print(consensus_counts)