# Prompt2Song – Text Emotion Encoder

Train an emotion-aware text encoder that powers both prompt understanding and lyric embeddings. This notebook ingests the six-class emotion dataset, fine-tunes a lightweight transformer, and exports reusable helpers for downstream stages.

## Goals
- Inspect the prompt emotion dataset and build consistent label mappings
- Fine-tune a DistilBERT-sized encoder on the six emotion classes
- Export utility functions for prompt/lyric embeddings and persist artifacts for later notebooks

> ⚙️ **Offline-friendly setup**  
Ensure the base model weights (e.g. `distilbert-base-uncased`) are cached locally before running. Set `HF_HOME` or `TRANSFORMERS_CACHE` if you maintain an offline cache.

In [4]:
import os
import json
from pathlib import Path
from dataclasses import dataclass
from typing import Dict, List

import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset

try:
    from transformers import (
        AutoConfig,
        AutoTokenizer,
        AutoModelForSequenceClassification,
        Trainer,
        TrainingArguments,
    )
except ImportError as exc:
    raise ImportError("Install transformers before running this notebook.") from exc

try:
    from sklearn.metrics import accuracy_score, f1_score, classification_report
except ImportError as exc:
    raise ImportError("Install scikit-learn before running this notebook.") from exc


  from .autonotebook import tqdm as notebook_tqdm


Here we resolve project directories, ensure artifact folders exist, and define constants such as the dataset roots, model name, and random seed.

In [5]:
NOTEBOOK_DIR = Path.cwd().resolve()
if (NOTEBOOK_DIR / "datasets").exists():
    PROJECT_ROOT = NOTEBOOK_DIR
else:
    PROJECT_ROOT = NOTEBOOK_DIR.parent

DATASET_ROOT = PROJECT_ROOT / "datasets" / "emotions_NLP"
ARTIFACT_ROOT = PROJECT_ROOT / "artifacts" / "text_encoder"
ARTIFACT_ROOT.mkdir(parents=True, exist_ok=True)

MODEL_NAME = "distilbert-base-uncased"
SEED = 13

print(f"Project root: {PROJECT_ROOT}")
print(f"Saving artifacts to: {ARTIFACT_ROOT}")


Project root: /Users/himanshu/Documents/Github/prompt2song
Saving artifacts to: /Users/himanshu/Documents/Github/prompt2song/artifacts/text_encoder


Utility helper that seeds NumPy and PyTorch for reproducibility, then applies it using the configured seed.

In [6]:
def set_seed(seed: int = 13) -> None:
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    np.random.seed(seed)

set_seed(SEED)


Loads the train/val/test splits from disk, cleans whitespace, filters empty rows, and prints a small preview plus dataset sizes.

In [7]:
def load_split(split: str) -> pd.DataFrame:
    path = DATASET_ROOT / f"{split}.txt"
    if not path.exists():
        raise FileNotFoundError(path)
    df = pd.read_csv(path, sep=";", names=["text", "label"], encoding="utf-8")
    df["text"] = df["text"].astype(str).str.strip()
    df["label"] = df["label"].astype(str).str.strip()
    df = df[df["text"].str.len() > 0].reset_index(drop=True)
    return df

train_df = load_split("train")
val_df = load_split("val")
test_df = load_split("test")

print(train_df.head())
print({split: len(df) for split, df in {"train": train_df, "val": val_df, "test": test_df}.items()})


                                                text    label
0                            i didnt feel humiliated  sadness
1  i can go from feeling so hopeless to so damned...  sadness
2   im grabbing a minute to post i feel greedy wrong    anger
3  i am ever feeling nostalgic about the fireplac...     love
4                               i am feeling grouchy    anger
{'train': 16000, 'val': 2000, 'test': 2000}


Builds label↔id lookup tables, attaches numeric labels to each split, and reports label distribution for sanity checking.

In [None]:
labels = sorted(train_df["label"].unique())
labels_val = sorted(val_df["label"].unique())
labels_test = sorted(test_df["label"].unique())
label2id = {label: idx for idx, label in enumerate(labels)}
id2label = {idx: label for label, idx in label2id.items()}

label2id_val = {label: idx for idx, label in enumerate(labels_val)}
id2label_val = {idx: label for label, idx in label2id_val.items()}

label2id_test = {label: idx for idx, label in enumerate(labels_test)}
id2label_test = {idx: label for label, idx in label2id_test.items()}

for df in (train_df, val_df, test_df):
    df["label_id"] = df["label"].map(label2id)

print("Label mapping:", label2id)
print("Train distribution:", train_df["label"].value_counts())
print("Val distribution:", val_df["label"].value_counts())
print("Test distribution:", test_df["label"].value_counts())


Label mapping: {'anger': 0, 'fear': 1, 'joy': 2, 'love': 3, 'sadness': 4, 'surprise': 5}
Train distribution: label
joy         5362
sadness     4666
anger       2159
fear        1937
love        1304
surprise     572
Name: count, dtype: int64
Val distribution: label
joy         704
sadness     550
anger       275
fear        212
love        178
surprise     81
Name: count, dtype: int64
Test distribution: label
joy         695
sadness     581
anger       275
fear        224
love        159
surprise     66
Name: count, dtype: int64


Initializes the tokenizer and defines an EmotionDataset wrapper that tokenizes text samples on the fly before instantiating dataset objects for each split.

In [9]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

class EmotionDataset(Dataset):
    def __init__(self, dataframe: pd.DataFrame, tokenizer, max_length: int = 128):
        self.dataframe = dataframe.reset_index(drop=True)
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        row = self.dataframe.iloc[idx]
        encoded = self.tokenizer(
            row["text"],
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt",
        )
        item = {key: val.squeeze(0) for key, val in encoded.items()}
        item["labels"] = torch.tensor(row["label_id"], dtype=torch.long)
        return item

train_dataset = EmotionDataset(train_df, tokenizer)
val_dataset = EmotionDataset(val_df, tokenizer)
test_dataset = EmotionDataset(test_df, tokenizer)


Creates a classification configuration for DistilBERT and loads the base model with the updated label metadata.

In [10]:
config = AutoConfig.from_pretrained(
    MODEL_NAME,
    num_labels=len(labels),
    id2label=id2label,
    label2id=label2id,
)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, config=config)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Defines compute_metrics so the Trainer can report accuracy and macro/weighted F1 scores during evaluation.

In [8]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = logits.argmax(axis=-1)
    metrics = {
        "accuracy": accuracy_score(labels, preds),
        "f1_macro": f1_score(labels, preds, average="macro"),
        "f1_weighted": f1_score(labels, preds, average="weighted"),
    }
    return metrics


### Trainer configuration
Adjust `per_device_train_batch_size`, `num_train_epochs`, or `learning_rate` for your hardware budget.

Configures TrainingArguments (batch sizes, epochs, evaluation cadence, etc.) and instantiates the Hugging Face Trainer with datasets and metric function.

In [9]:
training_args = TrainingArguments(
    output_dir=str(ARTIFACT_ROOT / "checkpoints"),
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    num_train_epochs=3,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="f1_macro",
    logging_steps=50,
    report_to="none",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)


Optionally kicks off fine-tuning via trainer.train() and evaluates the pre-trained checkpoint to provide baseline metrics.

In [12]:
# Uncomment to launch full fine-tuning
trainer.train()

# Example: evaluate initial (pre-trained) checkpoint before fine-tuning
pretrain_metrics = trainer.evaluate()
print(pretrain_metrics)


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro,F1 Weighted
1,0.1225,0.185962,0.935,0.908392,0.93579
2,0.076,0.205876,0.9365,0.908339,0.936045
3,0.048,0.218917,0.9415,0.915821,0.941343




{'eval_loss': 0.21891732513904572, 'eval_accuracy': 0.9415, 'eval_f1_macro': 0.9158210253830864, 'eval_f1_weighted': 0.9413434674766833, 'eval_runtime': 8.3612, 'eval_samples_per_second': 239.201, 'eval_steps_per_second': 7.535, 'epoch': 3.0}


Runs final evaluation on the test split, then saves the fine-tuned model, tokenizer, and label mapping artifacts for reuse.

In [None]:
# After calling trainer.train(), run evaluation and save artifacts
eval_metrics = trainer.evaluate(test_dataset)
print("Test metrics:", eval_metrics)

trainer.save_model(ARTIFACT_ROOT / "hf_model")
tokenizer.save_pretrained(ARTIFACT_ROOT / "hf_model")

with open(ARTIFACT_ROOT / "label2id.json", "w", encoding="utf-8") as fp:
    json.dump(label2id, fp, indent=2)




Test metrics: {'eval_loss': 0.25555408000946045, 'eval_accuracy': 0.9265, 'eval_f1_macro': 0.880847681956671, 'eval_f1_weighted': 0.925841848245525, 'eval_runtime': 8.6512, 'eval_samples_per_second': 231.183, 'eval_steps_per_second': 7.282, 'epoch': 3.0}


## Embedding helper
Use the fine-tuned checkpoint to embed both prompts and lyrics via mean pooling of token embeddings.

Implements a lightweight TextEmotionEncoder wrapper that loads the fine-tuned model and exposes a mean-pooled encode() helper.

In [12]:
class TextEmotionEncoder(torch.nn.Module):
    def __init__(self, model_dir: Path, device: str | None = None):
        super().__init__()
        from transformers import AutoModel

        self.device = device or ("cuda" if torch.cuda.is_available() else "cpu")
        self.base_model = AutoModel.from_pretrained(model_dir).to(self.device)
        self.tokenizer = AutoTokenizer.from_pretrained(model_dir)

    @torch.no_grad()
    def encode(self, texts: List[str], batch_size: int = 32) -> np.ndarray:
        embeddings = []
        for start in range(0, len(texts), batch_size):
            batch = texts[start:start+batch_size]
            tokens = self.tokenizer(
                batch,
                padding=True,
                truncation=True,
                max_length=256,
                return_tensors="pt",
            ).to(self.device)
            outputs = self.base_model(**tokens)
            token_embeddings = outputs.last_hidden_state
            attention_mask = tokens.attention_mask.unsqueeze(-1)
            summed = (token_embeddings * attention_mask).sum(dim=1)
            counts = attention_mask.sum(dim=1)
            mean_pooled = summed / counts
            embeddings.append(mean_pooled.cpu().numpy())
        return np.vstack(embeddings)


Demonstrates encoding the training prompts with the helper and persists the resulting embeddings for downstream notebooks.

In [15]:
# Example usage (requires fine-tuned weights saved above)
encoder = TextEmotionEncoder(ARTIFACT_ROOT / "hf_model")
prompt_embeddings = encoder.encode(train_df["text"].tolist())
np.save(ARTIFACT_ROOT / "train_prompt_embeddings.npy", prompt_embeddings)


#### Quick demo
Use a couple of sample prompts to verify the saved encoder produces fixed-size embeddings.

In [None]:
# Quick smoke test for the text encoder
if (ARTIFACT_ROOT / 'hf_model').exists():
    demo_encoder = TextEmotionEncoder(ARTIFACT_ROOT / 'hf_model')
    sample_prompts = [
        'I feel hopeful and excited about tomorrow',
    ]
    demo_embeddings = demo_encoder.encode(sample_prompts)
    print('Embeddings shape:', demo_embeddings.shape)
    print('Sample embeddings:', demo_embeddings[0][0])
else:
    print('⚠️ Fine-tuned weights not found; run trainer.train() and save artifacts first.')


Embeddings shape: (1, 768)
Sample embeddings: 0.81536853


### Next steps
- Run `trainer.train()` and persist the fine-tuned checkpoint once satisfied with metrics.
- Use `TextEmotionEncoder` to embed the prompt dataset (`train`, `val`, `test`) for analysis or curricula.
- Proceed to the audio encoder and multimodal fusion notebook after exporting `artifacts/text_encoder/hf_model`.