In [None]:
import os

def resolve_roots(cwd):
    """Infer ``(project_root, src_root)`` from a working directory.

    Used when ``__file__`` is unavailable (e.g. inside Jupyter). Three
    layouts are recognised:

    * cwd ends with ``notebooks`` -> ``src`` is the parent directory;
    * cwd is the ``src`` directory itself -> the project root is its parent;
    * anything else -> cwd is taken to be the project root.

    Args:
        cwd: Absolute path of the current working directory.

    Returns:
        A ``(project_root, src_root)`` tuple of path strings.
    """
    if cwd.endswith("notebooks"):
        src = os.path.abspath(os.path.join(cwd, ".."))
        return os.path.dirname(src), src
    if os.path.basename(cwd) == "src":
        # Without this case, running from src/ would produce ".../src/src".
        return os.path.dirname(cwd), cwd
    return cwd, os.path.join(cwd, "src")


try:
    # Running as normal Python script inside src/
    this_file = os.path.abspath(__file__)
    src_root = os.path.dirname(this_file)                        # EMOTION-PRED/src
    project_root = os.path.dirname(src_root)                    # EMOTION-PRED/
except NameError:
    # Running inside Jupyter (src/notebooks, src/, or the project root)
    cwd = os.getcwd()
    project_root, src_root = resolve_roots(cwd)

# Final unified paths
results_root = os.path.join(src_root, "results")
data_root = os.path.join(src_root, "data","MAMS-ACSA","raw","data_jsonl")
print(f"üìÇ Project root: {project_root}"
      f"\nüìÇ Source root: {src_root}"
      f"\nüìÇ Results root: {results_root}"
      f"\nüìÇ Data root: {data_root}")

In [None]:
# Path of the annotated JSONL file under results_root that the next cells read.
file = os.path.join(
    results_root,
    "emotion_MAMS-ACSA",
    "joeddav_distilbert_base_uncased_go_emotions_student_annotated.jsonl",
)

In [None]:
import json

# Parse the JSONL file into a list with one decoded object per line.
with open(file, "r", encoding="utf-8") as f:
    # A blank line (typically a trailing newline at end of file) is not valid
    # JSON and would make json.loads raise, so such lines are skipped.
    go_data = [json.loads(line) for line in f if line.strip()]


In [None]:
# Display the first loaded record to inspect its structure.
go_data[0]


In [None]:
from collections import Counter

# Length of the "output" list of every record, in load order.
aspect_counts = []
for record in go_data:
    aspect_counts.append(len(record["output"]))

# Frequency of each "output" length across the dataset.
aspect_count_distribution = Counter(aspect_counts)
aspect_count_distribution


In [None]:
# Flatten every record's "output" items into one list of emotion labels,
# keeping record order and item order.
go_emotions_list = [
    prediction["emotion"]
    for record in go_data
    for prediction in record["output"]
]


In [None]:
# Distinct emotion labels that occur, shown in sorted order.
unique_emotions = set(go_emotions_list)
sorted(unique_emotions)

In [None]:
from collections import Counter

# Occurrences of each emotion label.
go_counts = Counter(go_emotions_list)

# Same counts as a plain dict whose keys are in sorted label order.
go_counts_sorted = {label: go_counts[label] for label in sorted(go_counts)}
go_counts_sorted

In [None]:
import pandas as pd

# One (emotion, polarity) tuple per item in every record's "output" list.
pairs = [
    (item["emotion"], item["polarity"])
    for entry in go_data
    for item in entry["output"]
]

df_pairs = pd.DataFrame(pairs, columns=["emotion", "polarity"])

# Rows: emotion, columns: polarity, cells: number of occurrences
# (0 where a combination never occurs).
pivot_options = {
    "index": "emotion",
    "columns": "polarity",
    "aggfunc": "size",
    "fill_value": 0,
}
pivot = df_pairs.pivot_table(**pivot_options)

# Row-wise sum over the polarity columns.
pivot["total"] = pivot.sum(axis=1)
pivot


In [None]:
# Share of each polarity within an emotion: polarity count / row total.
for polarity_col, pct_col in (
    ("negative", "neg_pct"),
    ("neutral", "neu_pct"),
    ("positive", "pos_pct"),
):
    pivot[pct_col] = pivot[polarity_col] / pivot["total"]

pct_columns = ["neg_pct", "neu_pct", "pos_pct"]
pivot[pct_columns].head(30)


In [None]:
# "Purity" of an emotion = its largest polarity share (max of the three
# percentage columns).
pivot["purity"] = pivot[["neg_pct", "neu_pct", "pos_pct"]].max(axis=1)

# Only the last expression of a cell is displayed, so this cell shows just the
# ascending view: the five least pure emotions, i.e. those whose occurrences
# are spread most evenly across the three polarities.
pivot["purity"].sort_values().head()

In [None]:
# Five most pure emotions: the five largest purity values, highest first.
purity_ranked = pivot["purity"].sort_values(ascending=False)
purity_ranked.head()

In [None]:
from sklearn.cluster import KMeans
import pandas as pd

# Feature matrix: one row per emotion, columns are its three polarity shares.
feature_columns = ["neg_pct", "neu_pct", "pos_pct"]
X = pivot[feature_columns].values
emotion_names = pivot.index.tolist()
emotion_names

# Group the emotions into 8 clusters; random_state is fixed for repeatability.
# NOTE(review): n_init is left at the library default, which may differ
# between scikit-learn releases, so cluster ids may not be stable across
# environments -- confirm before relying on specific ids.
kmeans = KMeans(n_clusters=8, random_state=42)
labels = kmeans.fit_predict(X)

# Attach each emotion's cluster id to the pivot table.
pivot["cluster"] = labels
pivot[feature_columns + ["cluster"]]

# Cluster id -> list of the emotions assigned to it.
cluster_groups = pivot.groupby("cluster").apply(
    lambda members: members.index.tolist()
)
cluster_groups




# ⭐ **Cluster 0 — Mild Negative / Complaints**

```
[annoyance, disappointment, disapproval, embarrassment, nervousness, surprise]
```

### Meaning:

* “Something is wrong.”
* Complaints, frustration, discomfort.
* Not extreme negativity, but negative vibes.

---

# ⭐ **Cluster 1 — Clear Positive Feelings**

```
[approval, gratitude, joy, pride]
```

### Meaning:

* Happy, satisfied, thankful.
* Direct positive reactions.

---

# ⭐ **Cluster 2 — Neutral / Thinking / Mixed Emotions**

```
[amusement, curiosity, desire, neutral, realization, remorse]
```

### Meaning:

* Emotions that don’t point strongly to good or bad.
* Cognitive states (thinking, wanting, reflecting).
* Very mixed polarity → unclear sentiment.

---

# ⭐ **Cluster 3 — Warm Positive Emotions**

```
[admiration, love, relief]
```

### Meaning:

* Emotional warmth.
* “I feel good about you/this.”

---

# ⭐ **Cluster 4 — Strong Negative Emotions**

```
[anger, fear]
```

### Meaning:

* High intensity negativity.
* Threat, danger, rage.

---

# ⭐ **Cluster 5 — Soft Emotional Engagement**

```
[caring, excitement, remorse]
```

### Meaning:

* Personal involvement.
* Not clearly positive or negative.
* Emotional activation (care, excitement, regret).

---

# ⭐ **Cluster 6 — Pure Negative / Suffering**

```
[disgust, sadness]
```

### Meaning:

* Deep unpleasant feelings.
* Pain, rejection, emotional hurt.

---

# ⭐ **Cluster 7 — Confusion (Alone)**

```
[confusion]
```

### Meaning:

* Confusion behaves differently from everything else.
* Mixed polarity → unique pattern.
* KMeans isolates it.

---

# 🎯 **SUPER SIMPLE SUMMARY**

| Cluster | Meaning                    |
| ------- | -------------------------- |
| 0       | complaints / irritation    |
| 1       | happy feelings             |
| 2       | thinking / neutral / mixed |
| 3       | warm positive emotions     |
| 4       | strong negative reactions  |
| 5       | soft emotional involvement |
| 6       | deep negative suffering    |
| 7       | confusion alone            |

---

# 🧠 Why this is useful

* 28 emotions reduced to **8 clear groups**
* Each group has **similar polarity behavior**
* You now have **clean categories** for your emotion model
* Much more stable than using all 28 emotions


In [None]:
# Human-readable name for each KMeans cluster id.
# NOTE(review): these ids are only meaningful for the specific clustering run
# they were written against -- re-check the mapping whenever the clustering
# cell is re-run with different data or library versions.
cluster_name_map = {
    0: "complaint",
    1: "positive_appreciation",
    2: "cognitive_neutral",
    3: "warm_positive",
    4: "intense_negative",
    5: "soft_emotional",
    6: "pure_negative",
    7: "confusion"
}

# Emotion label -> cluster name.
# Read the integer "cluster" column directly: iterating rows with iterrows()
# upcasts each mixed-dtype row to float, so the id would arrive as e.g. 3.0
# and the lookup would only succeed through int/float hash equality.
# An id missing from cluster_name_map still raises KeyError.
emotion_to_cluster = {
    emotion: cluster_name_map[int(cluster_id)]
    for emotion, cluster_id in pivot["cluster"].items()
}

emotion_to_cluster


In [None]:
# Hand-written reverse lookup: cluster name -> emotions belonging to it.
# NOTE(review): this table is a literal, not derived from the clustering
# result -- verify it against emotion_to_cluster before relying on it.
cluster_to_emotions = dict(
    complaint=[
        "annoyance", "disappointment", "disapproval",
        "embarrassment", "nervousness",
    ],
    positive_appreciation=["approval", "gratitude", "joy", "pride"],
    cognitive_neutral=[
        "amusement", "curiosity", "desire",
        "neutral", "realization", "surprise",
    ],
    warm_positive=["admiration", "love", "relief"],
    intense_negative=["anger", "fear"],
    soft_emotional=["caring", "excitement", "remorse"],
    pure_negative=["disgust", "sadness"],
    confusion=["confusion"],
)


In [None]:
import os
import json

# paths
data_root = os.path.join(src_root, "data", "MAMS-ACSA", "raw", "data_jsonl")
output_path = os.path.join(data_root, "train_clustered_emotion.jsonl")

# Cluster names already produced by the mapping. After this cell has run once,
# go_data holds these instead of raw emotions; recognising them lets the cell
# be re-run without raising KeyError.
cluster_labels = set(emotion_to_cluster.values())

# 1. Replace emotion with its cluster (in place, as before)
for entry in go_data:
    for item in entry["output"]:
        original_emo = item["emotion"]
        if original_emo in emotion_to_cluster:
            # Raw emotion (or a label like "confusion" that maps to itself).
            item["emotion"] = emotion_to_cluster[original_emo]
        elif original_emo not in cluster_labels:
            # Neither a known emotion nor an already-applied cluster name.
            raise KeyError(f"No cluster mapping for emotion {original_emo!r}")

# 2. Write updated JSONL file (one JSON object per line)
os.makedirs(data_root, exist_ok=True)  # make sure the target directory exists
with open(output_path, "w", encoding="utf-8") as f:
    f.writelines(
        json.dumps(entry, ensure_ascii=False) + "\n" for entry in go_data
    )

print("Saved:", output_path)



### **1️⃣ WHAT:** Loaded the MAMS JSONL dataset.

**WHY:** Needed raw inputs + model-predicted emotions for analysis.
**HOW:** Read each JSONL line into Python and stored as dictionaries.

---

### **2️⃣ WHAT:** Extracted all emotions tied to each aspect.

**WHY:** To understand how emotions are distributed across the dataset.
**HOW:** Iterated through each entry and collected emotion labels into lists.

---

### **3️⃣ WHAT:** Counted emotions by polarity (neg/neu/pos).

**WHY:** Emotions behave differently depending on sentiment; we must quantify this.
**HOW:** Built a pivot table using negative/neutral/positive counts per emotion.

---

### **4️⃣ WHAT:** Computed polarity percentages and emotion “purity.”

**WHY:** To see which emotions strongly lean negative or positive, and which are mixed.
**HOW:** Divided each emotion’s polarity counts by the total number of occurrences.

---

### **5️⃣ WHAT:** Observed high noise and overlap across 28 emotions.

**WHY:** Many emotions don’t map cleanly to positive/negative, making them hard to classify.
**HOW:** Inspected purity results and frequency distributions.

---

### **6️⃣ WHAT:** Applied KMeans clustering to emotion polarity patterns.

**WHY:** To reduce 28 chaotic emotions into stable, interpretable groups.
**HOW:** Clustered emotions using their neg%, neu%, pos% as input features.

---

### **7️⃣ WHAT:** Interpreted 8 natural emotion clusters.

**WHY:** Clusters revealed meaningful emotional themes (frustration, joy-like, mixed, etc.).
**HOW:** Looked at which emotions fell together and assigned intuitive cluster names.

---

### **8️⃣ WHAT:** Created two mappings: cluster → emotions and emotion → cluster.

**WHY:** Needed a clean, error-free way to convert original emotions into our new 8 categories.
**HOW:** Wrote dictionaries for both directions based on clustering output.

---

### **9️⃣ WHAT:** Replaced each original emotion with its cluster label.

**WHY:** This simplifies the dataset and prepares it for model training.
**HOW:** Overwrote “emotion” field in every output object with the cluster name.

---

### **🔟 WHAT:** Saved the new dataset as `train_clustered_emotion.jsonl`.

**WHY:** To produce a clean training file for our model with consistent emotion labels.
**HOW:** Wrote each updated entry back into a new JSONL file in the data_root directory.

---

If you want a **super short 5-bullet version**, a **diagram**, or a **team-friendly slide summary**, just say **next**.
