# PulseSearch — Emotion Analysis
Loads the `google-research-datasets/go_emotions` dataset directly, fine-tunes DistilBERT on it, then classifies the emotions in song lyrics and computes a lyric embedding for similarity search.

**Output per song:**
```json
{
  "artist": "The Weeknd",
  "title": "Blinding Lights",
  "lyrics": "...",
  "emotions": [
    { "label": "joy", "score": 0.72 },
    { "label": "desire", "score": 0.41 }
  ],
  "embedding": [0.12, -0.34, ...]
}
```

In [None]:
%pip install datasets

Collecting datasets
  Downloading datasets-4.5.0-py3-none-any.whl (515 kB)
[2K     [91m━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━[0m [32m276.5/515.2 KB[0m [31m357.1 kB/s[0m eta [36m0:00:01[0m

## 1. Load go_emotions dataset

In [None]:
from datasets import load_dataset

# "simplified" is a dataset *configuration* (the second argument), not a split.
# The rest of this notebook relies on its train/validation splits and on its
# `text`, `labels` (list of label indices) and `id` columns.
# NOTE(review): the `raw` config is understood to cover the same 27 emotions
# + neutral but in a different, per-rater layout — confirm against the dataset card.
ds = load_dataset("google-research-datasets/go_emotions", "simplified")

# Show the available splits/columns and one raw training example.
print(ds)
print("\nExample entry:")
print(ds["train"][0])

ModuleNotFoundError: No module named 'datasets'

In [None]:
# Emotion names in label-index order: EMOTION_LABELS[i] is the name of the
# dataset's label index i, so the order of this list must not be changed.
EMOTION_LABELS = [
    "admiration", "amusement", "anger", "annoyance",        # 0-3
    "approval", "caring", "confusion", "curiosity",         # 4-7
    "desire", "disappointment", "disapproval", "disgust",   # 8-11
    "embarrassment", "excitement", "fear", "gratitude",     # 12-15
    "grief", "joy", "love", "nervousness",                  # 16-19
    "optimism", "pride", "realization", "relief",           # 20-23
    "remorse", "sadness", "surprise", "neutral",            # 24-27
]

# Size of the multi-hot target vector and of the classifier head.
NUM_LABELS = len(EMOTION_LABELS)
print(f"{NUM_LABELS} emotion labels:", EMOTION_LABELS)

## 2. Prepare dataset for multi-label classification

In [None]:
import torch
from transformers import AutoTokenizer

# Base checkpoint that gets fine-tuned below. The tokenizer is loaded from the
# same checkpoint name as the model so both share one vocabulary.
MODEL_CHECKPOINT = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)

def encode(batch):
    """Tokenize a batch of texts and attach multi-hot label vectors.

    ``batch["text"]`` is tokenized to a fixed length of 128 (longer texts are
    truncated, shorter ones padded).  ``batch["labels"]`` holds, per example,
    the label indices that apply; each is expanded to a ``NUM_LABELS``-long
    list containing 1.0 at those indices and 0.0 elsewhere.  Indices outside
    ``range(NUM_LABELS)`` are ignored.

    Returns the tokenizer output with its ``"labels"`` entry set to the
    multi-hot vectors.
    """
    features = tokenizer(
        batch["text"],
        truncation=True,
        padding="max_length",
        max_length=128,
    )

    # Floats rather than ints — presumably because the multi-label loss used
    # for fine-tuning expects float targets.
    targets = []
    for active in batch["labels"]:
        targets.append([float(idx in active) for idx in range(NUM_LABELS)])

    features["labels"] = targets
    return features

# Run encode() over the dataset in batches. The raw "text" and "id" columns
# are dropped afterwards; "labels" is overwritten by encode() with the
# multi-hot vectors.
tokenized = ds.map(encode, batched=True, remove_columns=["text", "id"])
# Have the dataset hand back torch tensors instead of Python lists.
tokenized.set_format("torch")

print("Tokenized dataset:", tokenized)

## 3. Fine-tune DistilBERT on go_emotions

In [None]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
import numpy as np

# DistilBERT with a fresh classification head of NUM_LABELS outputs.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_CHECKPOINT,
    num_labels=NUM_LABELS,
    # Several emotions can apply to one text, so labels are treated as
    # independent targets rather than one mutually exclusive class.
    problem_type="multi_label_classification",
    # Store the index <-> name mapping in the model config so predictions
    # come back with emotion names.
    id2label={i: l for i, l in enumerate(EMOTION_LABELS)},
    label2id={l: i for i, l in enumerate(EMOTION_LABELS)},
)

training_args = TrainingArguments(
    # Checkpoints go to the same directory the final model is saved to later.
    output_dir="../go_emotions_model",
    num_train_epochs=3,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    # Evaluate and checkpoint once per epoch; the two strategies are kept
    # identical so every evaluation has a matching checkpoint to reload.
    eval_strategy="epoch",
    save_strategy="epoch",
    # After training, restore the checkpoint with the best (lowest)
    # validation loss rather than keeping the last epoch's weights.
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    logging_steps=100,
    fp16=torch.cuda.is_available(),   # use GPU half-precision if available (Databricks)
)

# No compute_metrics is supplied, so evaluation reports loss only.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["validation"],
)

trainer.train()

In [None]:
# Save the fine-tuned model — written to ml/go_emotions_model/ for use by main.py.
# The path is relative to the notebook's working directory. The tokenizer is
# saved next to the weights so the directory can be loaded on its own, which
# is how the classification pipeline in the next section loads it.
model.save_pretrained("../go_emotions_model")
tokenizer.save_pretrained("../go_emotions_model")
print("Model saved to ml/go_emotions_model")

## 4. Classify lyrics with the fine-tuned model

In [None]:
from transformers import pipeline

# Load from saved checkpoint (re-run from here after training).
# Both model and tokenizer come from the directory written by the save cell.
emotion_classifier = pipeline(
    "text-classification",
    model="../go_emotions_model",
    tokenizer="../go_emotions_model",
    # Return a score for every label, not just the top one; classify_lyrics
    # averages these per-label scores across chunks.
    top_k=None,
    # Cut inputs to the same 128-token length used during fine-tuning.
    truncation=True,
    max_length=128,
)

print("Classifier ready.")

In [None]:
def chunk_lyrics(lyrics: str, max_words: int = 80) -> list[str]:
    """Split lyrics into chunks of at most ``max_words`` words each.

    Lines are packed greedily into a chunk until the next line would push it
    past ``max_words``.  The word budget is a rough stand-in for the
    classifier's 128-token window, so chunks are measured in
    whitespace-separated words, not tokens.

    Blank lines (stanza breaks) are skipped.  A single line longer than
    ``max_words`` — e.g. lyrics that arrive without any newlines — is cut
    into consecutive word windows instead of becoming one oversize chunk
    whose tail would be truncated away by the classifier.

    Args:
        lyrics: Raw lyric text, lines separated by newlines.
        max_words: Maximum number of words per chunk; must be at least 1.

    Returns:
        A non-empty list of chunk strings with words joined by single
        spaces.  Lyrics containing no words yield ``[""]``.

    Raises:
        ValueError: If ``max_words`` is smaller than 1.
    """
    if max_words < 1:
        raise ValueError("max_words must be at least 1")

    chunks: list[str] = []
    current: list[str] = []
    current_len = 0

    for line in lyrics.splitlines():
        words = line.split()
        if not words:
            continue  # blank line: nothing to classify

        # Lines that fit produce a single window; overlong lines produce several.
        for start in range(0, len(words), max_words):
            window = words[start:start + max_words]
            if current_len + len(window) > max_words:
                # current is never empty here: a window alone always fits.
                chunks.append(" ".join(current))
                current, current_len = [], 0
            current.append(" ".join(window))
            current_len += len(window)

    if current:
        chunks.append(" ".join(current))

    # Always hand the classifier at least one (possibly empty) string.
    return chunks or [""]


def classify_lyrics(lyrics: str, min_score: float = 0.10) -> list[dict]:
    """
    Score the emotions expressed in song lyrics with the fine-tuned classifier.

    The lyrics are split into chunks, every chunk is classified, and each
    label's scores are averaged over the chunks (every chunk counts equally).
    Averages are rounded to 4 decimals; labels whose average is below
    min_score are dropped.

    Returns: [{"label": str, "score": float}, ...] sorted by score desc
    """
    per_chunk = emotion_classifier(chunk_lyrics(lyrics))  # one list of label scores per chunk

    # Collect every chunk's score under its label.
    by_label: dict[str, list[float]] = {}
    for prediction in (p for chunk_scores in per_chunk for p in chunk_scores):
        by_label.setdefault(prediction["label"], []).append(prediction["score"])

    ranked = []
    for name, values in by_label.items():
        mean_score = round(float(np.mean(values)), 4)
        if mean_score >= min_score:
            ranked.append({"label": name, "score": mean_score})

    ranked.sort(key=lambda entry: entry["score"], reverse=True)
    return ranked

In [None]:
# --- Test it ---
# Short lyric excerpt used as a smoke test here and reused by the
# full-pipeline demo at the end of the notebook.
sample_lyrics = """
I been running through the jungle, I been crying with the wolves
To get to you, to get to you
I been down the darkest alleys, saw the dark side of the moon
To get to you, to get to you
"""

# Print every emotion that cleared the default min_score, strongest first.
emotions = classify_lyrics(sample_lyrics)
print("Emotions:")
for e in emotions:
    print(f"  {e['label']}: {e['score']}")

## 5. Song similarity matching

In [None]:
from sentence_transformers import SentenceTransformer

# Sentence-embedding model used for lyric similarity; it is separate from the
# fine-tuned emotion classifier above.
embedder = SentenceTransformer("all-MiniLM-L6-v2")

def embed_lyrics(lyrics: str) -> list[float]:
    """Encode lyrics as a normalized 384-dim vector, returned as a plain list.

    Normalization is requested from the encoder so the vectors are ready for
    cosine similarity matching.
    """
    vector = embedder.encode(lyrics, normalize_embeddings=True)
    return vector.tolist()


def cosine_similarity(a: list[float], b: list[float]) -> float:
    """Return the cosine similarity of two equal-length vectors.

    Args:
        a: First vector.
        b: Second vector, same length as ``a``.

    Returns:
        The cosine of the angle between the vectors (nominally in [-1, 1]).
        If either vector has zero norm — including two empty vectors — the
        cosine is undefined and 0.0 is returned.

    Raises:
        ValueError: If the vectors differ in length (raised by ``np.dot``).
    """
    vec_a, vec_b = np.asarray(a), np.asarray(b)
    denom = np.linalg.norm(vec_a) * np.linalg.norm(vec_b)
    if denom == 0:
        # 0/0 would yield NaN, and NaN scores make sorting by similarity
        # unreliable; treat "no direction" as "no similarity" instead.
        return 0.0
    return float(np.dot(vec_a, vec_b) / denom)


def find_similar_songs(query_lyrics: str, song_library: list[dict], top_k: int = 5) -> list[dict]:
    """
    Rank library songs by how close their lyrics are to the query lyrics.

    The query is embedded once and compared by cosine similarity against the
    stored 'embedding' of each song.
    song_library: list of Firestore docs — docs without an 'embedding' field
    are skipped; the others must also carry 'artist' and 'title'.

    Returns the top_k matches (artist, title, emotions, similarity),
    most similar first.
    """
    target = embed_lyrics(query_lyrics)

    matches = []
    for song in song_library:
        if "embedding" not in song:
            continue
        matches.append({
            "artist": song["artist"],
            "title": song["title"],
            "emotions": song.get("emotions", []),
            "similarity": cosine_similarity(target, song["embedding"]),
        })

    matches.sort(key=lambda match: match["similarity"], reverse=True)
    return matches[:top_k]

In [None]:
# --- Full pipeline — returns the object your teammate writes to Firestore ---

def analyze_song(artist: str, title: str, lyrics: str) -> dict:
    """Build the full record for one song.

    The record carries the given artist, title and lyrics together with the
    classified emotions and the lyric embedding.
    """
    emotion_scores = classify_lyrics(lyrics)
    lyric_vector = embed_lyrics(lyrics)

    record = {"artist": artist, "title": title, "lyrics": lyrics}
    record["emotions"] = emotion_scores
    # Ask teammate to store this: find_similar_songs needs it on every doc.
    record["embedding"] = lyric_vector
    return record


# Demo run of the full pipeline on the sample lyrics defined earlier.
# NOTE(review): sample_lyrics does not look like an excerpt of "Believer" —
# confirm the artist/title used here before a record like this is persisted.
result = analyze_song(
    artist="Imagine Dragons",
    title="Believer",
    lyrics=sample_lyrics
)

# Summarize the record: who/what, the five strongest emotions, vector size.
print(f"{result['artist']} — {result['title']}")
print("Top emotions:", result["emotions"][:5])
print("Embedding dims:", len(result["embedding"]))