In [23]:
import torch
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer


class RobertaAnchorScorer:
    def __init__(self, lexicon_csv_path, threshold=0.7):
        # Load lexicon (wide format: word,hypo,hyper,flow)
        df = pd.read_csv(lexicon_csv_path)
        self.lexicon = {}
        for _, row in df.iterrows():
            word = row["word"].lower().strip()
            self.lexicon[word] = {
                "hypo": float(row.get("hypo", 0.0)),
                "hyper": float(row.get("hyper", 0.0)),
                "flow": float(row.get("flow", 0.0))
            }

        # Anchors = full expanded lexicon
        self.anchor_words = list(self.lexicon.keys())

        # Load SentenceTransformer (better semantic embeddings than raw RoBERTa)
        self.model = SentenceTransformer("all-MiniLM-L6-v2")
        self.threshold = threshold

        # Cache for embeddings
        self.cache = {}

        # Precompute embeddings for all anchor words
        self.anchor_embs = self._encode_texts(self.anchor_words)

    def _encode_texts(self, texts):
        new_texts = [t for t in texts if t not in self.cache]
        if new_texts:
            embs = self.model.encode(new_texts, convert_to_numpy=True)
            for t, e in zip(new_texts, embs):
                self.cache[t] = e
        return np.array([self.cache[t] for t in texts])

    def score_sentence(self, sentence):
        STOPWORDS = set(stopwords.words("english"))
        tokens = [t for t in word_tokenize(sentence.lower()) if t.isalpha() and t not in STOPWORDS]

        if not tokens:
            return {"percentages": {"hypo": 0, "hyper": 0, "flow": 0},
                    "dominant": None,
                    "label": "No valid tokens",
                    "matched": []}

        token_embs = self._encode_texts(tokens)

        scores = {"hypo": 0, "hyper": 0, "flow": 0}
        matched = []

        # Compare each token with ALL anchors
        for i, tok in enumerate(tokens):
            sims = cosine_similarity([token_embs[i]], self.anchor_embs)[0]
            for idx, sim in enumerate(sims):
                if sim >= self.threshold:
                    anchor = self.anchor_words[idx]
                    contrib = self.lexicon[anchor]
                    for state, val in contrib.items():
                        scores[state] += val * sim  # weight by similarity
                    matched.append({
                        "token": tok,
                        "matched_anchor": anchor,
                        "similarity": float(sim)
                    })

        # Normalize into percentages
        total = sum(scores.values())
        if total == 0:
            percentages = {s: 0 for s in scores}
            dominant = None
            label = "No relevant keywords found"
        else:
            percentages = {s: round(v / total * 100, 2) for s, v in scores.items()}
            dominant = max(percentages, key=percentages.get)
            label = (f"Mixed: {percentages['hypo']}% hypo, "
                     f"{percentages['hyper']}% hyper, "
                     f"{percentages['flow']}% flow "
                     f"(dominant: {dominant})")

        return {
            "percentages": percentages,
            "dominant": dominant,
            "label": label,
            "matched": matched
        }


In [24]:
scorer = RobertaAnchorScorer("../data/processed/lexicon_for_sentences.csv")

# Example sentences
print(scorer.score_sentence("I feel calm and relaxed today"))
print(scorer.score_sentence("I am very anxious and scared"))
print(scorer.score_sentence("I feel numb and disconnected"))
print(scorer.score_sentence("Just a random sentence with no keywords"))


{'percentages': {'hypo': np.float32(0.0), 'hyper': np.float32(0.0), 'flow': np.float32(100.0)}, 'dominant': 'flow', 'label': 'Mixed: 0.0% hypo, 0.0% hyper, 100.0% flow (dominant: flow)', 'matched': [{'token': 'calm', 'matched_anchor': 'calm', 'similarity': 1.0}, {'token': 'relaxed', 'matched_anchor': 'relaxed', 'similarity': 0.9999999403953552}]}
{'percentages': {'hypo': np.float32(0.0), 'hyper': np.float32(100.0), 'flow': np.float32(0.0)}, 'dominant': 'hyper', 'label': 'Mixed: 0.0% hypo, 100.0% hyper, 0.0% flow (dominant: hyper)', 'matched': [{'token': 'anxious', 'matched_anchor': 'alarmed', 'similarity': 0.7199091911315918}, {'token': 'anxious', 'matched_anchor': 'anxious', 'similarity': 0.9999999403953552}, {'token': 'anxious', 'matched_anchor': 'fearful', 'similarity': 0.7018227577209473}, {'token': 'scared', 'matched_anchor': 'fearful', 'similarity': 0.7645676136016846}]}
{'percentages': {'hypo': np.float32(73.65), 'hyper': np.float32(0.0), 'flow': np.float32(26.35)}, 'dominant': 

In [25]:
scorer = RobertaAnchorScorer("../data/processed/lexicon_for_sentences.csv")

result = scorer.score_sentence("I was mostly calm but got anxious early in the morning.")

# Pretty printing
print("\n=== Emotion State Analysis ===")
print(f"Hypoarousal:   {result['percentages']['hypo']}%")
print(f"Hyperarousal: {result['percentages']['hyper']}%")
print(f"Flow:         {result['percentages']['flow']}%")
print(f"Dominant State: {result['dominant']}")
print("\n--- Matched Words ---")
for m in result["matched"]:
    print(f"Token: {m['token']:>10} | Anchor: {m['matched_anchor']:>12} | Similarity: {m['similarity']:.3f}")



=== Emotion State Analysis ===
Hypoarousal:   0.0%
Hyperarousal: 71.13999938964844%
Flow:         28.860000610351562%
Dominant State: hyper

--- Matched Words ---
Token:       calm | Anchor:         calm | Similarity: 1.000
Token:    anxious | Anchor:      alarmed | Similarity: 0.720
Token:    anxious | Anchor:      anxious | Similarity: 1.000
Token:    anxious | Anchor:      fearful | Similarity: 0.702


In [26]:
scorer = RobertaAnchorScorer("../data/processed/lexicon_for_sentences.csv")

result = scorer.score_sentence("These are new words.. exasperated, genuine, grateful")

# Pretty printing
print("\n=== Emotion State Analysis ===")
print(f"Hypoarousal:   {result['percentages']['hypo']}%")
print(f"Hyperarousal: {result['percentages']['hyper']}%")
print(f"Flow:         {result['percentages']['flow']}%")
print(f"Dominant State: {result['dominant']}")
print("\n--- Matched Words ---")
for m in result["matched"]:
    print(f"Token: {m['token']:>10} | Anchor: {m['matched_anchor']:>12} | Similarity: {m['similarity']:.3f}")


=== Emotion State Analysis ===
Hypoarousal:   0.0%
Hyperarousal: 51.220001220703125%
Flow:         48.779998779296875%
Dominant State: hyper

--- Matched Words ---
Token: exasperated | Anchor:  exasperated | Similarity: 1.000
Token:   grateful | Anchor:     grateful | Similarity: 1.000
