<a href="https://colab.research.google.com/github/IsaacFigNewton/SMIED/blob/main/BFS_Semantic_Decomposition.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
"""
Semantic decomposition of ("cat", "eats", "mouse") using WordNet + spaCy + depth-limited GBFS.
- Uses spaCy to parse verb synset glosses and detect subject/object dependencies.
- If both subject and object tokens are present, branches directly toward original triple synsets.
- Otherwise falls back to WordNet relations.
"""

import nltk
import spacy
from nltk.corpus import wordnet as wn
from nltk.wsd import lesk
from heapq import heappush, heappop

In [2]:
nltk.download('wordnet')

# Load spaCy English model for dependency parsing
nlp = spacy.load("en_core_web_sm")

[nltk_data] Downloading package wordnet to /root/nltk_data...


In [5]:
from collections import deque
import nltk
from nltk.corpus import wordnet as wn
from nltk.wsd import lesk
import spacy

# Initialize spaCy (assuming you have it loaded)
# nlp = spacy.load("en_core_web_sm")

def path_syn_to_syn(start_synset, end_synset, max_depth=6):
    """
    Find shortest path between synsets of the same POS using bidirectional BFS.
    Returns a list of synsets forming the path, or None if no path found.
    """

    assert start_synset.pos() == end_synset.pos() and start_synset.pos() in {'n', 'v'}

    # Handle the trivial case where start and end are the same
    if start_synset.name() == end_synset.name():
        return [start_synset]

    # Initialize two search frontiers
    # Forward search (from start)
    forward_queue = deque([(start_synset, 0)])  # (synset, depth)
    forward_visited = {start_synset.name(): [start_synset]}  # Maps synset name to path from start

    # Backward search (from end)
    backward_queue = deque([(end_synset, 0)])  # (synset, depth)
    backward_visited = {end_synset.name(): [end_synset]}  # Maps synset name to path from end

    def get_neighbors(synset):
        """Helper function to get all neighbors of a synset."""
        neighbors = []

        # Add hypernyms and hyponyms
        neighbors.extend(synset.hypernyms())
        neighbors.extend(synset.hyponyms())

        # Add POS-specific neighbors
        if synset.pos() == 'n':
            neighbors.extend(get_noun_neighbors(synset))
        else:
            neighbors.extend(get_verb_neighbors(synset))

        return neighbors

    def expand_frontier(queue, visited_from_this_side, visited_from_other_side, is_forward):
        """
        Expand one step of the search frontier.
        Returns the complete path if intersection found, None otherwise.
        """
        if not queue:
            return None

        curr_synset, depth = queue.popleft()

        # Don't expand beyond max_depth/2 for each direction
        if depth >= (max_depth + 1) // 2:
            return None

        # Get the path to current synset
        path_to_current = visited_from_this_side[curr_synset.name()]

        # Explore neighbors
        for neighbor in get_neighbors(curr_synset):
            neighbor_name = neighbor.name()

            # If we've already visited this node from this direction, skip it
            if neighbor_name in visited_from_this_side:
                continue

            # Create the path to this neighbor
            if is_forward:
                new_path = path_to_current + [neighbor]
            else:
                new_path = [neighbor] + path_to_current

            # Check if we've found an intersection with the other search
            if neighbor_name in visited_from_other_side:
                # We found a meeting point! Reconstruct the full path
                other_path = visited_from_other_side[neighbor_name]

                if is_forward:
                    # Forward path + backward path (reversed, excluding the meeting point)
                    full_path = path_to_current + other_path
                else:
                    # Forward path + backward path (excluding the meeting point)
                    full_path = other_path + path_to_current

                return full_path

            # Add to visited and queue
            visited_from_this_side[neighbor_name] = new_path
            queue.append((neighbor, depth + 1))

        return None

    # Alternate between forward and backward search
    while forward_queue or backward_queue:
        # Expand forward frontier
        if forward_queue:
            result = expand_frontier(forward_queue, forward_visited, backward_visited, is_forward=True)
            if result:
                return result

        # Expand backward frontier
        if backward_queue:
            result = expand_frontier(backward_queue, backward_visited, forward_visited, is_forward=False)
            if result:
                return result

    return None  # No path found


def get_noun_neighbors(syn):
    """
    Get neighbors for a noun synset.
    """
    nbrs = set()
    nbrs.update(syn.part_meronyms())
    nbrs.update(syn.substance_meronyms())
    nbrs.update(syn.member_meronyms())
    nbrs.update(syn.part_holonyms())
    nbrs.update(syn.substance_holonyms())
    nbrs.update(syn.member_holonyms())
    return list(nbrs)


def get_verb_neighbors(syn):
    """
    Get neighbors for a verb synset.
    """
    nbrs = set()
    nbrs.update(syn.entailments())
    nbrs.update(syn.causes())
    nbrs.update(syn.also_sees())
    nbrs.update(syn.verb_groups())
    return list(nbrs)


def cross_pos_path(start_synset, end_synset, max_depth=6):
    """
    Find path between synsets of different POS using gloss analysis.
    Returns a list of synsets forming the path, or None if no path found.
    """

    assert start_synset.pos() != end_synset.pos() and start_synset.pos() in {'n', 'v'}

    # If start is a noun and end is a verb (subject -> predicate)
    if start_synset.pos() == 'n':
        # Strategy 1: Look for subject references in the verb's gloss
        pred_gloss_doc = nlp(end_synset.definition())
        subjs = [tok for tok in pred_gloss_doc if tok.dep_ == "nsubj"]

        if subjs:
            try:
                subject_synset = lesk(pred_gloss_doc.text, subjs[0].text, pos='n')
                if subject_synset:
                    # Find path from start to the subject mentioned in verb's gloss
                    path = path_syn_to_syn(start_synset, subject_synset, max_depth=max_depth)
                    if path:
                        # Add the verb at the end to complete the cross-POS path
                        return path + [end_synset]
            except:
                pass

        # Strategy 2: Look for verb references in the noun's gloss
        subj_gloss_doc = nlp(start_synset.definition())
        preds = [tok for tok in subj_gloss_doc if tok.pos_ == "VERB"]

        if preds:
            try:
                subj_pred_synset = lesk(subj_gloss_doc.text, preds[0].text, pos='v')
                if subj_pred_synset:
                    # Find path from verb found in subj gloss to the pred synset
                    path = path_syn_to_syn(subj_pred_synset, end_synset, max_depth=max_depth)
                    if path:
                        # Prepend the noun to complete the cross-POS path
                        return [start_synset] + path
            except:
                pass

    # If start is a verb and end is a noun (predicate -> object)
    elif start_synset.pos() == 'v':
        # Strategy 1: Look for direct objects in the verb's gloss
        pred_gloss_doc = nlp(start_synset.definition())

        # Expand to look for multiple types of objects
        objs = []
        # Direct objects
        objs.extend([tok for tok in pred_gloss_doc if tok.dep_ == "dobj"])
        # Prepositional objects (e.g., "eat at a table")
        objs.extend([tok for tok in pred_gloss_doc if tok.dep_ == "pobj"])
        # Indirect objects (e.g., "give someone something")
        objs.extend([tok for tok in pred_gloss_doc if tok.dep_ == "iobj"])
        # Objects of prepositions that relate to the main verb
        objs.extend([tok for tok in pred_gloss_doc if tok.dep_ == "obj"])

        # Also look for noun chunks that might be objects
        if not objs:
            # Look for nouns that are children of the root verb
            root_verbs = [tok for tok in pred_gloss_doc if tok.dep_ == "ROOT" and tok.pos_ == "VERB"]
            if root_verbs:
                for noun_chunk in pred_gloss_doc.noun_chunks:
                    # Check if this noun chunk is related to the main verb
                    if any(token.head == root_verbs[0] for token in noun_chunk):
                        objs.append(noun_chunk.root)

        if objs:
            # Try multiple object candidates
            for obj in objs[:3]:  # Limit to first 3 to avoid excessive computation
                try:
                    object_synset = lesk(pred_gloss_doc.text, obj.text, pos='n')
                    if object_synset:
                        # Find path from object mentioned in verb's gloss to end
                        path = path_syn_to_syn(object_synset, end_synset, max_depth=max_depth)
                        if path:
                            # Add the verb at the beginning to complete the cross-POS path
                            return [start_synset] + path
                except:
                    continue

        # Strategy 2: Look for verb references in the object's gloss
        obj_gloss_doc = nlp(end_synset.definition())

        # Look for verbs that might describe actions done to/with this object
        verbs = [tok for tok in obj_gloss_doc if tok.pos_ == "VERB"]

        # Also check if the object is described as something that gets verbed
        # (e.g., "food: something that is eaten")
        passive_verbs = [tok for tok in obj_gloss_doc if tok.tag_ in ["VBN", "VBD"] and tok.dep_ in ["acl", "relcl"]]
        verbs.extend(passive_verbs)

        if verbs:
            # Try multiple verb candidates
            for verb in verbs[:3]:  # Limit to first 3
                try:
                    obj_verb_synset = lesk(obj_gloss_doc.text, verb.text, pos='v')
                    if obj_verb_synset:
                        # Find path from start verb to verb found in object's gloss
                        path = path_syn_to_syn(start_synset, obj_verb_synset, max_depth=max_depth)
                        if path:
                            # Append the noun to complete the cross-POS path
                            return path + [end_synset]
                except:
                    continue

        # Strategy 3: Check for "used for" or "used in" relationships in object's gloss
        # This helps with instrumental objects (e.g., "fork" used for "eating")
        if "used" in obj_gloss_doc.text.lower():
            # Look for verbs following "used for/to/in"
            for i, token in enumerate(obj_gloss_doc):
                if token.text.lower() == "used":
                    # Check tokens after "used"
                    for j in range(i+1, min(i+4, len(obj_gloss_doc))):
                        if obj_gloss_doc[j].pos_ == "VERB":
                            try:
                                use_verb_synset = lesk(obj_gloss_doc.text, obj_gloss_doc[j].text, pos='v')
                                if use_verb_synset:
                                    path = path_syn_to_syn(start_synset, use_verb_synset, max_depth=max_depth)
                                    if path:
                                        return path + [end_synset]
                            except:
                                continue

    # If gloss-based approach fails, return None
    return None


def find_shortest_paths(subject_word, predicate_word, object_word, max_depth=10):
    """
    Find shortest paths from subject to predicate and predicate to object.
    """

    # Get synsets for each word
    subject_synsets = wn.synsets(subject_word, pos=wn.NOUN)
    predicate_synsets = wn.synsets(predicate_word, pos=wn.VERB)
    object_synsets = wn.synsets(object_word, pos=wn.NOUN)

    # Find all possible paths from subject to predicate
    subject_paths = []
    for subj in subject_synsets:
        for pred in predicate_synsets:
            path = cross_pos_path(subj, pred, max_depth=max_depth)
            if path:
                subject_paths.append(path)

    # Find all possible paths from predicate to object
    object_paths = []
    for pred in predicate_synsets:
        for obj in object_synsets:
            path = cross_pos_path(pred, obj, max_depth=max_depth)
            if path:
                object_paths.append(path)

    # Get the shortest paths (if any found)
    shortest_subject_path = min(subject_paths, key=len) if subject_paths else None
    shortest_object_path = min(object_paths, key=len) if object_paths else None

    return shortest_subject_path, shortest_object_path


def show_path(label, path):
    """
    Pretty print a path of synsets.
    """
    if path:
        print(f"{label}:")
        print(" -> ".join(f"{s.name()} ({s.definition()})" for s in path))
        print(f"Path length: {len(path)}")
        print()
    else:
        print(f"{label}: No path found")
        print()

In [6]:
nlp = spacy.load("en_core_web_sm")

# Find shortest paths
cat_path, mouse_path = find_shortest_paths("cat", "eat", "mouse", max_depth=10)

# Display results
show_path("Path from 'cat' to 'eat' (subject -> predicate)", cat_path)
show_path("Path from 'eat' to 'mouse' (predicate -> object)", mouse_path)

Path from 'cat' to 'eat' (subject -> predicate):
kat.n.01 (the leaves of the shrub Catha edulis which are chewed like tobacco or used to make tea; has the effect of a euphoric stimulant) -> chew.v.01 (chew (food); to bite and grind with the teeth) -> eat.v.01 (take in solid food)
Path length: 3

Path from 'eat' to 'mouse' (predicate -> object):
corrode.v.01 (cause to deteriorate due to the action of water, air, or an acid) -> natural_process.n.01 (a process existing in or produced by nature (rather than by the intent of human beings)) -> process.n.06 (a sustained phenomenon or one marked by gradual changes through a series of states) -> physical_entity.n.01 (an entity that has physical existence) -> causal_agent.n.01 (any entity that produces an effect or is responsible for events or results) -> person.n.01 (a human being) -> mouse.n.03 (person who is quiet or timid)
Path length: 7

