Lets use MAMS dataset

In [19]:
import warnings
warnings.filterwarnings("ignore")

In [20]:
import os

try:
    # Running as normal Python script inside src/
    this_file = os.path.abspath(__file__)
    src_root = os.path.dirname(this_file)                        # EMOTION-PRED/src
    project_root = os.path.dirname(src_root)                    # EMOTION-PRED/
except NameError:
    # Running inside Jupyter (likely src/notebooks or src/)
    cwd = os.getcwd()

    # If running inside src/notebooks ‚Üí go up one level
    if cwd.endswith("notebooks"):
        src_root = os.path.abspath(os.path.join(cwd, ".."))
        project_root = os.path.dirname(src_root)
    else:
        # Running from project root directly
        project_root = cwd
        src_root = os.path.join(project_root, "src")

# Final unified paths
results_root = os.path.join(src_root, "results")
data_root = os.path.join(src_root, "data")
print(f"üìÇ Project root: {project_root}"
      f"\nüìÇ Source root: {src_root}"
      f"\nüìÇ Results root: {results_root}"
      f"\nüìÇ Data root: {data_root}")

üìÇ Project root: /Users/hd/Desktop/EMOTION-PRED
üìÇ Source root: /Users/hd/Desktop/EMOTION-PRED/src
üìÇ Results root: /Users/hd/Desktop/EMOTION-PRED/src/results
üìÇ Data root: /Users/hd/Desktop/EMOTION-PRED/src/data


In [21]:
from datasets import load_dataset

# load the dataset
ds = load_dataset("NEUDM/mams")

# print dataset info
print(ds)

Repo card metadata block was not found. Setting CardData to empty.


DatasetDict({
    train: Dataset({
        features: ['task_type', 'dataset', 'input', 'output', 'situation', 'label', 'extra', 'instruction'],
        num_rows: 7446
    })
    validation: Dataset({
        features: ['task_type', 'dataset', 'input', 'output', 'situation', 'label', 'extra', 'instruction'],
        num_rows: 900
    })
    test: Dataset({
        features: ['task_type', 'dataset', 'input', 'output', 'situation', 'label', 'extra', 'instruction'],
        num_rows: 900
    })
})


In [22]:
train_ds = ds["train"]
train_ds = train_ds.to_pandas()


# inspect row and column count
print(f"Number of rows: {train_ds.shape[0]}")
print(f"Number of columns: {train_ds.shape[1]}")

# inspect column names
print("Column names:", train_ds.columns.tolist())

Number of rows: 7446
Number of columns: 8
Column names: ['task_type', 'dataset', 'input', 'output', 'situation', 'label', 'extra', 'instruction']


In [23]:
# Columns to keep
keep_cols = ["input", "output"]
clean_df = train_ds[keep_cols].copy()


In [24]:
import ast
import pandas as pd

# Take 50 rows
df_head = clean_df["input"].head(50).to_frame()

# 1) Convert "['text']" ‚Üí "text"
def clean_list_string(x):
    try:
        parsed = ast.literal_eval(x)   # safely parse list-string
        if isinstance(parsed, list) and len(parsed) > 0:
            return parsed[0]
        return x
    except:
        return x

df_head["sentence"] = df_head["input"].apply(clean_list_string)

# 2) Add empty aspect column (required by pipeline)
df_head["aspect_term"] = ""

# 3) Save in the correct 2-column format
input_csv = os.path.join(data_root, "temp_50.csv")
df_head[["sentence", "aspect_term"]].to_csv(input_csv, index=False)


In [25]:
import sys, os

# Add src/ to Python path
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "..")))

In [26]:
from emotion import run_full_emotion_pipeline

run_full_emotion_pipeline(
    input_csv=input_csv,
    dataset_name="sample50",
    results_root=results_root,
)


üöÄ Starting full emotion pipeline

üìÅ Saving outputs to: /Users/hd/Desktop/EMOTION-PRED/src/results/emotion_sample50

üîπ Annotating with: j-hartmann/emotion-english-distilroberta-base
  üîç Model type: roberta | arch=['RobertaForSequenceClassification']
   ‚úÖ Saved ‚Üí /Users/hd/Desktop/EMOTION-PRED/src/results/emotion_sample50/j_hartmann_emotion_english_distilroberta_base_annotated.csv

üîπ Annotating with: j-hartmann/emotion-english-roberta-large
  üîç Model type: roberta | arch=['RobertaForSequenceClassification']
   ‚úÖ Saved ‚Üí /Users/hd/Desktop/EMOTION-PRED/src/results/emotion_sample50/j_hartmann_emotion_english_roberta_large_annotated.csv

üîπ Annotating with: nateraw/bert-base-uncased-emotion
  üîç Model type: bert | arch=['BertForSequenceClassification']
   ‚úÖ Saved ‚Üí /Users/hd/Desktop/EMOTION-PRED/src/results/emotion_sample50/nateraw_bert_base_uncased_emotion_annotated.csv

üîπ Annotating with: joeddav/distilbert-base-uncased-go-emotions-student
  üîç Model

In [27]:
import os
import pandas as pd

RESULT_DIR = os.path.join(results_root, "emotion_sample50")

csv_files = [f for f in os.listdir(RESULT_DIR) if f.endswith(".csv")]

MODEL_NAME_MAP = {
    "j_hartmann_emotion_english_roberta_large": "roberta_large",
    "nateraw_bert_base_uncased_emotion": "bert_base_emotion",
    "j_hartmann_emotion_english_distilroberta_base": "roberta_emotion",
    "joeddav_distilbert_base_uncased_go_emotions_student": "go_emotions",
    "cardiffnlp_twitter_roberta_base_emotion": "tweet_eval",
    "mrm8488_t5_base_finetuned_emotion": "t5_emotion"
}

merged = pd.DataFrame()

for f in csv_files:
    full_path = os.path.join(RESULT_DIR, f)
    df = pd.read_csv(full_path)

    base = f.replace("_annotated.csv", "")
    colname = MODEL_NAME_MAP.get(base, base)   # fallback = raw name if missing

    merged[colname] = df["emotion_auto"]

pd.set_option("display.max_colwidth", None)
print(merged.head(20))

   roberta_large bert_base_emotion roberta_emotion t5_emotion  go_emotions  \
0        neutral               joy         neutral        joy       caring   
1        neutral               joy         neutral        joy       caring   
2          anger             anger         neutral      anger  disapproval   
3        neutral               joy         neutral        joy       caring   
4          anger             anger         neutral      anger    annoyance   
5        neutral               joy         neutral        joy       caring   
6       surprise               joy        surprise        joy     surprise   
7        neutral               joy         neutral      anger       caring   
8          anger             anger            fear      anger  nervousness   
9        neutral             anger         neutral       love       caring   
10       neutral              love         neutral       love       caring   
11       neutral             anger         neutral      anger  r