<a href="https://colab.research.google.com/github/IsaacFigNewton/DisCoFuzz/blob/main/Compositional_Model_MVP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import and Config

In [6]:
import nltk
nltk.download('wordnet')
from nltk.corpus import wordnet as wn

from sentence_transformers import SentenceTransformer
import numpy as np
import spacy
from typing import Optional, Iterable

[nltk_data] Downloading package wordnet to /root/nltk_data...


In [2]:
# Load the sentence-embedding model once; `build_wordnet_lemma_embeddings` below
# reads this module-level `model` when it encodes the WordNet lemmas.
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

# Helpers

In [30]:
def build_wordnet_lemma_embeddings(
    batch_size: int = 64
):
    """
    Embed every distinct WordNet lemma with the notebook's SentenceTransformer.

    Lemma names are gathered from all synsets, underscores are replaced by
    spaces, duplicates are dropped, and the sorted result is encoded by the
    module-level `model`. Original casing of the lemma names is preserved.

    Args:
        batch_size: Batch size passed to model.encode().

    Returns:
        dict[str, np.ndarray]: mapping from lemma string to embedding vector.
    """
    # Unique, space-normalised lemma names across all synsets
    unique_lemmas = {
        lemma.name().replace("_", " ")
        for synset in wn.all_synsets()
        for lemma in synset.lemmas()
    }
    ordered_lemmas = sorted(unique_lemmas)

    # Single batched encode call over the whole vocabulary
    vectors = model.encode(
        ordered_lemmas,
        batch_size=batch_size,
        show_progress_bar=True,
        convert_to_numpy=True,
    )

    # Pair each lemma with the embedding row at the same position
    return dict(zip(ordered_lemmas, vectors))

In [52]:
def _is_ignored(token):
    """Return True for tokens that contribute no vector of their own."""
    # ignore punctuation and determiners
    return token.is_punct or token.pos_ == "DET"


# Loaded spaCy pipelines keyed by model name, so repeated calls to
# embed_doc_by_dependency do not reload the model from disk every time.
_NLP_CACHE = {}


def _get_spacy_pipeline(model_name: str = "en_core_web_sm"):
    """Load the spaCy pipeline `model_name` on first use and reuse it afterwards."""
    if model_name not in _NLP_CACHE:
        _NLP_CACHE[model_name] = spacy.load(model_name)
    return _NLP_CACHE[model_name]


def _lookup_token_vector(token, keyed_vectors):
    """Return the vector for `token` as a float array, or None if none is available."""
    if keyed_vectors is None:
        # No lemma table supplied: fall back to spaCy's own token vector.
        if token.has_vector:
            return np.asarray(token.vector, dtype=float)
        return None

    vec = keyed_vectors.get(token.lemma_.lower())
    if vec is None:
        # Keys built from WordNet keep their original casing, so a capitalised
        # lemma is only found under its exact spelling.
        vec = keyed_vectors.get(token.lemma_)
    if vec is None:
        return None
    return np.asarray(vec, dtype=float)


def embed_doc_by_dependency(
    text: str,
    keyed_vectors: Optional[dict],
    nlp=None,
    embedding_dim: int = 384,
):
    """
    Produce a single embedding for `text` by:
      - running a spaCy pipeline to get the dependency tree
      - looking up each token's lemma in `keyed_vectors` (lower-cased first, then
        as-is); when `keyed_vectors` is None, using spaCy's token.vector if available
      - ignoring punctuation and determiners
      - recursively computing node embeddings as
        mean([node_vector_if_not_ignored] + children_branch_embeddings)
      - aggregating sentence roots by mean for the final document vector

    Args:
      text: input document string
      keyed_vectors: mapping from lemma string to embedding vector, or None to
        use the spaCy token vectors instead
      nlp: optional callable spaCy pipeline; if None, "en_core_web_sm" is loaded
        once and reused on later calls
      embedding_dim: length of the all-zero vector returned when no token in
        `text` has a vector. Should match the dimensionality of the vectors in
        use; the default of 384 is the size this notebook previously hard-coded.

    Returns:
      np.ndarray of shape (embedding_dim,)
    """
    if nlp is None:
        nlp = _get_spacy_pipeline()

    doc = nlp(text)

    def node_embedding(token):
        """Recursively compute embedding for the branch rooted at `token`."""
        parts = []

        # The token's own vector, unless it is ignored or has no vector
        if not _is_ignored(token):
            vec = _lookup_token_vector(token, keyed_vectors)
            if vec is not None:
                parts.append(vec)

        # Each child contributes the embedding of its whole subtree. Every token
        # has a single head, so each node is visited exactly once and no
        # memoisation is needed.
        for child in token.children:
            child_emb = node_embedding(child)
            if child_emb is not None:
                parts.append(child_emb)

        if not parts:
            return None
        return np.vstack(parts).mean(axis=0)

    # For each sentence, embed the branch under its syntactic root; the
    # document vector is the mean over those root embeddings.
    root_embeddings = []
    for sent in doc.sents:
        emb = node_embedding(sent.root)
        if emb is not None:
            root_embeddings.append(emb)

    if not root_embeddings:
        return np.zeros(embedding_dim)

    return np.vstack(root_embeddings).mean(axis=0)

# Test embedding composition

In [40]:
# Embed the whole WordNet lemma vocabulary up front. This is the slow step of the
# notebook (thousands of encode batches at the default batch size), so avoid
# re-running this cell unnecessarily.
keyed_vectors = build_wordnet_lemma_embeddings()

Batches:   0%|          | 0/2324 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [55]:
# v1 and v2 contain the same words with "fox" and "dog" swapped;
# v3 is an unrelated sentence used as the dissimilar baseline.
v1 = embed_doc_by_dependency("the quick brown fox jumps over the lazy dog.", keyed_vectors)
v2 = embed_doc_by_dependency("the quick brown dog jumps over the lazy fox.", keyed_vectors)
v3 = embed_doc_by_dependency("alice loves bob.", keyed_vectors)

In [56]:
from sklearn.metrics.pairwise import cosine_similarity

# cosine_similarity works on 2-D inputs, so each document vector is wrapped in a
# list; the result is a 1x1 matrix, which is why the printed values appear as [[...]].
print(f"similarity of similar sentences: {cosine_similarity([v1], [v2])}")
print(f"similarity of dissimilar sentences: {cosine_similarity([v1], [v3])}")

similarity of similar sentences: [[0.99892119]]
similarity of dissimilar sentences: [[0.54916776]]
