# KeyBERT with Sentiment-aware Embedding Fusion

### Setup: Installing and Importing Required Libraries

In [39]:
import subprocess
import sys

# List of required packages
required_packages = [
    "keybert", "sentence-transformers", "transformers", "torch", "numpy", "emoji", "tqdm",
    "vaderSentiment.vaderSentiment"
]

def install_package(package):
    """Installs a package using pip if it's not already installed."""
    try:
        __import__(package)
        print(f"{package} is already installed.")
    except ImportError:
        print(f"Installing {package}...")
        subprocess.check_call([sys.executable, "-m", "pip", "install", package])

# Check and install missing packages
for package in required_packages:
    install_package(package)

keybert is already installed.
Installing sentence-transformers...


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Defaulting to user installation because normal site-packages is not writeable
transformers is already installed.
torch is already installed.
numpy is already installed.
emoji is already installed.
tqdm is already installed.
vaderSentiment.vaderSentiment is already installed.



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49m/Library/Developer/CommandLineTools/usr/bin/python3 -m pip install --upgrade pip[0m


# Class Definition

In [40]:
import torch
import numpy as np

# PyTorch neural network module — used to define the MLP that projects sentiment vectors
import torch.nn as nn

# SentenceTransformer is used to generate dense semantic embeddings for full documents or keywords
from sentence_transformers import SentenceTransformer

# HuggingFace Transformers: 
# - AutoTokenizer tokenizes input text for the sentiment model
# - AutoModelForSequenceClassification runs the sentiment classification model (e.g., RoBERTa)
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# PyTorch's LayerNorm is used for normalizing the output of the MLP
from torch.nn import LayerNorm 

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer


In [41]:
class SentimentEmbedderVADER(nn.Module):
    """
    Embedding model that fuses semantic embeddings with sentiment scores
    computed entirely using VADER for both documents and candidate phrases.

    Parameters:

    base_model : str
        SentenceTransformer model name for semantic embeddings (default "all-MiniLM-L6-v2").

    sentiment_mode : str
        How to handle sentiment scores:
        - "linear": use raw sentiment compound scores (scaled)
        - "nonlinear": project sentiment scores with MLP to semantic space

    combination_mode : str
        How to combine semantic and sentiment embeddings:
        - "concat", "add", "nonlinear"

    beta : float
        Scaling factor for sentiment influence.

    device : str
        Device for computation ("cpu" or "cuda").
    """

    def __init__(self,
                 base_model="all-MiniLM-L6-v2",
                 sentiment_mode="linear",
                 combination_mode="concat",
                 beta=0.5,
                 device="cpu"):
        super().__init__()

        self.device = device
        self.sentiment_mode = sentiment_mode
        self.combination_mode = combination_mode
        self.beta = beta

        self.base = SentenceTransformer(base_model, device=device)
        self.dim = self.base.get_sentence_embedding_dimension()

        self.vader_analyzer = SentimentIntensityAnalyzer()

        if sentiment_mode == "nonlinear":
            self.sent_proj = nn.Sequential(
                nn.Linear(1, 32),
                nn.ReLU(),
                nn.Linear(32, self.dim),
                nn.Tanh(),
                nn.LayerNorm(self.dim)
            ).to(device)

    @torch.no_grad()
    def _get_sentiment_score(self, texts):
        """
        Compute sentiment compound score from VADER for a list of texts.

        Returns tensor shape (batch_size, 1) normalized to [0,1] and scaled by beta.
        """
        scores = [self.vader_analyzer.polarity_scores(t)['compound'] for t in texts]
        norm_scores = [(s + 1) / 2 for s in scores]  # Normalize from [-1,1] to [0,1]
        return torch.tensor(norm_scores, dtype=torch.float32, device=self.device).unsqueeze(1) * self.beta

    def encode(self, texts, candidates=None, **kwargs):
        """
        Encode texts and optionally candidates with semantic + VADER sentiment embeddings.

        Returns:
            - If candidates is None: semantic + sentiment embeddings of texts
            - Else: tuple (text embeddings, candidate embeddings)
        """

        base_emb = self.base.encode(texts, convert_to_tensor=True, **kwargs).to(self.device)
        doc_sent = self._get_sentiment_score(texts)  # shape (batch, 1)

        if self.sentiment_mode == "nonlinear":
            doc_sent_emb = self.sent_proj(doc_sent)
        else:
            doc_sent_emb = doc_sent

        if candidates is None:
            if self.sentiment_mode == "linear" and self.combination_mode == "concat":
                return torch.cat([base_emb, doc_sent_emb], dim=1).cpu().numpy()
            elif self.sentiment_mode == "nonlinear":
                if self.combination_mode == "add":
                    return (base_emb + doc_sent_emb).cpu().numpy()
                elif self.combination_mode == "nonlinear":
                    return (base_emb + doc_sent_emb + base_emb * doc_sent_emb).cpu().numpy()
                elif self.combination_mode == "concat":
                    return torch.cat([base_emb, doc_sent_emb], dim=1).cpu().numpy()
            else:
                return base_emb.cpu().numpy()

        # Compute candidate embeddings + sentiment
        cand_emb = self.base.encode(candidates, convert_to_tensor=True, **kwargs).to(self.device)
        cand_sent = self._get_sentiment_score(candidates)

        if self.sentiment_mode == "nonlinear":
            cand_sent_emb = self.sent_proj(cand_sent)
        else:
            cand_sent_emb = cand_sent

        if self.sentiment_mode == "linear" and self.combination_mode == "concat":
            combined_cand_emb = torch.cat([cand_emb, cand_sent_emb], dim=1)
        elif self.sentiment_mode == "nonlinear":
            if self.combination_mode == "add":
                combined_cand_emb = cand_emb + cand_sent_emb
            elif self.combination_mode == "nonlinear":
                combined_cand_emb = cand_emb + cand_sent_emb + cand_emb * cand_sent_emb
            elif self.combination_mode == "concat":
                combined_cand_emb = torch.cat([cand_emb, cand_sent_emb], dim=1)
        else:
            combined_cand_emb = cand_emb

        return base_emb.cpu().numpy(), combined_cand_emb.cpu().numpy()


# Tests

### Test 1

In [42]:
from keybert import KeyBERT

In [44]:
from keybert import KeyBERT
from sentence_transformers import SentenceTransformer

def compare_keyword_outputs(doc, top_n=5, beta=10.0, device="cpu"):
    """
    Compare keyword extraction outputs between base KeyBERT and all sentiment-aware configurations.
    """

    print("=== DOCUMENT ===")
    print(doc)
    print()

    # Run KeyBERT base (semantic-only)
    print("=== BASE KeyBERT (semantic-only) ===")
    base_model = SentenceTransformer("all-MiniLM-L6-v2", device=device)
    kw_base = KeyBERT(model=base_model)
    keywords = kw_base.extract_keywords(doc, top_n=top_n)
    for kw, score in keywords:
        print(f"{kw:15s} score: {score:.4f}")
    print()

    # Define combinations to test
    sentiment_modes = ["linear", "nonlinear"]
    combination_modes = ["concat", "add", "nonlinear"]

    print("=== SENTIMENT-AWARE KeyBERT ===")
    for sent_mode in sentiment_modes:
        for comb_mode in combination_modes:

            # Skip invalid: linear + add or nonlinear
            if sent_mode == "linear" and comb_mode != "concat":
                continue

            print(f"[sentiment_mode = '{sent_mode}', combination_mode = '{comb_mode}']")

            try:
                # Instantiate SentimentEmbedder with VADER integration
                sent_model = SentimentEmbedderVADER(
                    sentiment_mode=sent_mode,
                    combination_mode=comb_mode,
                    beta=beta,
                    device=device
                )

                kw_model = KeyBERT(model=sent_model)
                keywords = kw_model.extract_keywords(doc, top_n=top_n)

                for kw, score in keywords:
                    print(f"{kw:15s} score: {score:.4f}")

            except Exception as e:
                print(f"ERROR: {type(e).__name__}: {e}")

            print()

# Example usage with your document:
doc = "This film was absolutely amazing. The story was heartfelt, the acting superb, and the visuals breathtaking."
compare_keyword_outputs(doc)


=== DOCUMENT ===
This film was absolutely amazing. The story was heartfelt, the acting superb, and the visuals breathtaking.

=== BASE KeyBERT (semantic-only) ===
breathtaking    score: 0.4218
superb          score: 0.3783
heartfelt       score: 0.3348
film            score: 0.3296
acting          score: 0.2912

=== SENTIMENT-AWARE KeyBERT ===
[sentiment_mode = 'linear', combination_mode = 'concat']
breathtaking    score: 0.5335
amazing         score: 0.4463
superb          score: 0.4017
film            score: 0.3563
story           score: 0.2930

[sentiment_mode = 'nonlinear', combination_mode = 'concat']
breathtaking    score: 0.5335
amazing         score: 0.4463
superb          score: 0.4017
film            score: 0.3563
story           score: 0.2930

[sentiment_mode = 'nonlinear', combination_mode = 'add']
breathtaking    score: 0.5335
amazing         score: 0.4463
superb          score: 0.4017
film            score: 0.3563
story           score: 0.2930

[sentiment_mode = 'nonlinea