Let's use the MAMS dataset.

In [None]:
import warnings
warnings.filterwarnings("ignore")

In [None]:
import os

def resolve_roots(cwd):
    """Derive the project and source roots from a working directory.

    Used as the fallback when ``__file__`` is not defined (e.g. inside
    Jupyter), where the only available hint is the current directory.

    Args:
        cwd: Path of the current working directory.

    Returns:
        Tuple ``(project_root, src_root)``.
    """
    cwd = os.path.abspath(cwd)
    leaf = os.path.basename(cwd)

    if leaf.endswith("notebooks"):
        # Running inside src/notebooks -> src/ is one level up
        src_dir = os.path.abspath(os.path.join(cwd, ".."))
        return os.path.dirname(src_dir), src_dir

    if leaf == "src":
        # Running inside src/ itself: without this case src_root would be
        # computed as src/src and the data/results paths would be wrong.
        return os.path.dirname(cwd), cwd

    # Otherwise assume the working directory is the project root
    return cwd, os.path.join(cwd, "src")


try:
    # Running as normal Python script inside src/
    this_file = os.path.abspath(__file__)
    src_root = os.path.dirname(this_file)                        # EMOTION-PRED/src
    project_root = os.path.dirname(src_root)                    # EMOTION-PRED/
except NameError:
    # Running inside Jupyter (likely src/notebooks or src/)
    cwd = os.getcwd()
    project_root, src_root = resolve_roots(cwd)

# Final unified paths
results_root = os.path.join(src_root, "results")
data_root = os.path.join(src_root, "data")
print(f"📂 Project root: {project_root}"
      f"\n📂 Source root: {src_root}"
      f"\n📂 Results root: {results_root}"
      f"\n📂 Data root: {data_root}")

In [None]:
from datasets import load_dataset

# Fetch the dataset by its identifier
dataset_id = "NEUDM/mams"
ds = load_dataset(dataset_id)

# Print a summary of what was loaded
print(ds)

In [None]:
# Take the "train" split and convert it to a pandas DataFrame in one step
train_ds = ds["train"].to_pandas()

# Report the table dimensions
n_rows, n_cols = train_ds.shape
print(f"Number of rows: {n_rows}")
print(f"Number of columns: {n_cols}")

# List the available column names
print("Column names:", train_ds.columns.tolist())

In [None]:
# Narrow the frame to the columns of interest; copy so that later edits
# to clean_df never write back into train_ds.
keep_cols = ["input", "output"]
clean_df = train_ds.loc[:, keep_cols].copy()


In [None]:
import ast
import pandas as pd

# Sample: the first 50 values of the "input" column, as a one-column DataFrame
first_inputs = clean_df["input"].iloc[:50]
df_head = first_inputs.to_frame()

# 1) Convert "['text']" -> "text"
def clean_list_string(x):
    """Unwrap a stringified one-element list such as ``"['text']"``.

    Args:
        x: Value to clean; expected to be the string form of a Python list.

    Returns:
        The first element of the parsed list when ``x`` parses to a
        non-empty list; otherwise ``x`` unchanged (unparseable input,
        empty list, or a literal that is not a list).
    """
    try:
        parsed = ast.literal_eval(x)   # safely parse list-string
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
        # Only the failures literal_eval documents for malformed input are
        # treated as "not a list-string"; KeyboardInterrupt and friends
        # are no longer swallowed.
        return x

    if isinstance(parsed, list) and parsed:
        return parsed[0]
    return x

df_head["sentence"] = df_head["input"].apply(clean_list_string)

# 2) Add empty aspect column (required by pipeline)
df_head["aspect_term"] = ""

# 3) Save in the correct 2-column format
# Create the data directory first: to_csv does not create missing parent
# directories and would fail on a fresh checkout.
os.makedirs(data_root, exist_ok=True)
input_csv = os.path.join(data_root, "temp_50.csv")
df_head[["sentence", "aspect_term"]].to_csv(input_csv, index=False)


In [None]:
import sys, os

# Add src/ to Python path.
# Use the src_root resolved earlier rather than "<cwd>/..", which only points
# at src/ when the kernel happens to be started in src/notebooks. The
# membership check keeps re-running this cell from piling up duplicates.
if src_root not in sys.path:
    sys.path.append(src_root)

In [None]:
from emotion_pipeline import run_full_emotion_pipeline

# Run the pipeline on the sample CSV; its outputs are presumably written
# under results_root (the next cell reads results_root/emotion_sample50).
pipeline_args = {
    "input_csv": input_csv,
    "dataset_name": "sample50",
    "results_root": results_root,
}
run_full_emotion_pipeline(**pipeline_args)

In [None]:
import os
import pandas as pd

RESULT_DIR = os.path.join(results_root, "emotion_sample50")
ANNOTATED_SUFFIX = "_annotated.csv"

# os.listdir() returns entries in arbitrary order; sort so that the column
# order of the merged table is the same on every run.
csv_files = sorted(f for f in os.listdir(RESULT_DIR) if f.endswith(".csv"))

# File-name stem (without the "_annotated.csv" suffix) -> short column label
MODEL_NAME_MAP = {
    "j_hartmann_emotion_english_roberta_large": "roberta_large",
    "nateraw_bert_base_uncased_emotion": "bert_base_emotion",
    "j_hartmann_emotion_english_distilroberta_base": "roberta_emotion",
    "joeddav_distilbert_base_uncased_go_emotions_student": "go_emotions",
    "cardiffnlp_twitter_roberta_base_emotion": "tweet_eval",
    "mrm8488_t5_base_finetuned_emotion": "t5_emotion"
}

# One column per result file, holding that file's "emotion_auto" values
merged = pd.DataFrame()

for f in csv_files:
    full_path = os.path.join(RESULT_DIR, f)
    df = pd.read_csv(full_path)

    if "emotion_auto" not in df.columns:
        # A stray CSV in the results folder should not abort the merge
        print(f"Skipping {f}: no 'emotion_auto' column")
        continue

    # Strip the suffix only when it really is at the end of the name;
    # other CSVs just lose their extension.
    if f.endswith(ANNOTATED_SUFFIX):
        base = f[:-len(ANNOTATED_SUFFIX)]
    else:
        base = os.path.splitext(f)[0]
    colname = MODEL_NAME_MAP.get(base, base)   # fallback = raw name if missing

    merged[colname] = df["emotion_auto"]

pd.set_option("display.max_colwidth", None)
print(merged.head(20))