In [1]:
import os
from pathlib import Path

from conceptnet import load_conceptnet

# Directory holding the ConceptNet data. Defaults to the original hard-coded path;
# set the CONCEPTNET_DIR environment variable to run the notebook elsewhere.
CONCEPTNET_DIR = Path(os.environ.get('CONCEPTNET_DIR', '/home/josh/data/conceptnet'))

conceptnet = load_conceptnet(CONCEPTNET_DIR)

Found filtered file at /home/josh/data/conceptnet/filtered.csv reading from it


100%|██████████| 3368477/3368477 [00:11<00:00, 296662.03it/s]

Loaded 1154522 concepts





In [2]:
# The word under study and its ConceptNet node key. The key is derived from the word
# so the two cannot drift apart when the word is changed.
word = 'cat'
key = f'/c/en/{word}'
# Relation type -> set of related concept URIs for this node (see the cell output).
concepts = conceptnet[key]
concepts

defaultdict(set,
            {'/r/AtLocation': {'/c/en/africa',
              '/c/en/alley',
              '/c/en/alleyway',
              '/c/en/ally',
              '/c/en/american_home',
              '/c/en/animal_rescue_shelter',
              '/c/en/animal_shops',
              '/c/en/apartment',
              '/c/en/appartment',
              '/c/en/appartments',
              '/c/en/arms',
              '/c/en/arms_of_human',
              '/c/en/art',
              '/c/en/attic',
              '/c/en/back_alley',
              '/c/en/back_yard',
              '/c/en/backyard',
              '/c/en/bag',
              '/c/en/barn',
              '/c/en/basement',
              '/c/en/be',
              '/c/en/beam_of_sunlight',
              '/c/en/bed',
              '/c/en/bedroom',
              '/c/en/best_chair_by_fire',
              '/c/en/box',
              '/c/en/bushes',
              '/c/en/cage',
              '/c/en/canada',
              '/c/en/canary_cage',
    

In [3]:
from lexical_similarity import SimilarityScorer

# One scorer instance, reused by the cells further down.
scorer = SimilarityScorer()

# Smoke test: score a few hand-picked words against `word`. The cell displays the
# returned array, which holds one score per candidate.
sanity_candidates = ['dog', 'fish', 'bird']
scorer.score_compounds_for_word(word, sanity_candidates)

array([0.41188788, 0.26615068, 0.32196143], dtype=float32)

In [4]:
def clean_filter_rank_concepts(base_word: str, concepts: list[str], scorer: SimilarityScorer) -> list[tuple[str, float]]:
    """Rank the single-word concepts related to ``base_word`` by similarity score.

    Args:
        base_word: Word that every candidate is scored against.
        concepts: ConceptNet concept URIs such as ``'/c/en/animal'``. The collection
            is only iterated, so any iterable of strings works (callers pass sets).
        scorer: Object supporting ``word in scorer`` and
            ``score_compounds_for_word(base_word, words)``.

    Returns:
        ``(word, score)`` pairs sorted from highest to lowest score. Empty when no
        candidate survives the filtering.
    """
    # The leading '/c/en/' is dropped by length, not by matching, so every URI is
    # assumed to start with a prefix of exactly that length.
    prefix_len = len('/c/en/')
    cleaned_iter = (concept[prefix_len:] for concept in concepts)
    # TODO: there's definitely a better way to handle MWEs long term, if we want them (like averaging the embeddings)
    # right now we're just filtering them out
    filtered_cleaned = [concept for concept in cleaned_iter if '_' not in concept and concept in scorer]
    if not filtered_cleaned:
        # Nothing left to score; skip the scorer call, which may not accept an empty batch.
        return []
    scores = scorer.score_compounds_for_word(base_word, filtered_cleaned)
    return sorted(zip(filtered_cleaned, scores), key=lambda x: x[1], reverse=True)

# Example: rank the '/r/IsA' neighbours of `word`; the cell displays the ranked pairs.
clean_filter_rank_concepts(
    base_word=word,
    concepts=concepts['/r/IsA'],
    scorer=scorer,
)

[('animal', 0.44521), ('talisman', 0.27815747), ('noun', 0.27663252)]

In [5]:
from itertools import combinations
from conceptnet import RELATION_TYPES

# Upper bound on how many of the best-scoring relation-type pairs are kept.
MAX_RELATION_COMBOS = 30
# Total budget of word pairs to emit, shared across all kept relation-type pairs.
MAX_WORD_COMBOS = 100


def generate_diagonals(lhs_size: int, rhs_size: int):
    """
    Lazily yield every index pair ``(i, j)`` with ``0 <= i < lhs_size`` and
    ``0 <= j < rhs_size`` exactly once, ordered by anti-diagonal.

    Pairs are produced in order of increasing ``i + j``. Within one diagonal the
    walk starts at the two ends and moves inwards, yielding ``(i, j)`` and then
    its mirror ``(j, i)`` for ``i = 0, 1, ...``. For example, sizes ``(2, 3)``
    give ``(0, 0), (0, 1), (1, 0), (0, 2), (1, 1), (1, 2)``.

    No record of already-yielded pairs is kept: only the first half of each
    diagonal is visited and the mirrored pair covers the second half, so no pair
    is reached twice and memory use stays constant.
    """
    for d in range(lhs_size + rhs_size - 1):  # d == i + j for every pair on this diagonal
        for i in range(d // 2 + 1):  # i <= j here; the mirror below covers the i > j half
            j = d - i
            if i < lhs_size and j < rhs_size:  # Valid pair
                yield i, j
            # On the centre of an even diagonal the mirror is the same pair; skip it.
            if i != j and j < lhs_size and i < rhs_size:  # Valid symmetric pair
                yield j, i


def normalize_combination_score(score: int) -> float:
    """Rescale ``score`` onto an inverted scale where a lower input gives a higher result.

    The scale is anchored on the smallest and largest values in ``RELATION_TYPES``:
    a ``score`` equal to the smallest value maps to 1.0 and one equal to the largest
    maps to 0.0. Raises ``ZeroDivisionError`` when all values are equal.

    NOTE(review): the visible caller passes the sum of two ``RELATION_TYPES`` values,
    which may fall outside the single-value range and so give results outside
    [0, 1] - confirm this is intended.
    """
    highest = max(RELATION_TYPES.values())
    lowest = min(RELATION_TYPES.values())
    span = highest - lowest
    offset = score - lowest
    return 1 - (offset / span)

# Rank every unordered pair of relation types by its normalized combined score and
# keep the best MAX_RELATION_COMBOS. The result is bound to `relation_combos`, not
# `combinations`, so the itertools function imported above is not shadowed by a list.
relation_combos = sorted(
    (
        (lhs, rhs, normalize_combination_score(RELATION_TYPES[lhs] + RELATION_TYPES[rhs]))
        for lhs, rhs in combinations(RELATION_TYPES.keys(), 2)
    ),
    key=lambda combo: combo[2],
    reverse=True,
)[:MAX_RELATION_COMBOS]

# Sum of the kept scores; a relation pair's share of MAX_WORD_COMBOS is score / total.
total = sum(combo[2] for combo in relation_combos)

# Ranked candidate words per relation, filled lazily so each relation is scored once.
words_for_relation = dict()

used_word_combos = 0
for lhs, rhs, score in relation_combos:
    # A non-positive total would make the proportional split divide by zero or flip
    # its sign, so nothing is allocated in that case.
    share = score / total if total > 0 else 0.0
    allocated_words = min(int(MAX_WORD_COMBOS * share), MAX_WORD_COMBOS - used_word_combos)
    if allocated_words <= 0:
        # Combos are sorted best-first and the remaining budget only shrinks, so no
        # later combo can be allocated more than this one.
        break

    for relation in (lhs, rhs):
        if relation not in words_for_relation:
            words_for_relation[relation] = clean_filter_rank_concepts(word, concepts[relation], scorer)

    lhs_words = words_for_relation[lhs]
    rhs_words = words_for_relation[rhs]

    # Walk the (lhs, rhs) index grid from the top-ranked words outwards. Zipping with
    # range() caps the pairs at this combo's allocation and stops as soon as the
    # grid is exhausted.
    # NOTE(review): the same word pair can be printed for more than one relation
    # combo, and a word can be paired with itself; each still uses up budget.
    pair_indices = generate_diagonals(len(lhs_words), len(rhs_words))
    for pair_number, (lhs_idx, rhs_idx) in zip(range(allocated_words), pair_indices):
        used_word_combos += 1
        print((lhs_words[lhs_idx][0], rhs_words[rhs_idx][0]))


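# Summary of the approach above:
# - take the top 30 relation combos (MAX_RELATION_COMBOS)
# - split the candidate budget across those combos in proportion to their relative score; that share is the MAX for each relation combo
# - keep going until we have 100 candidates (MAX_WORD_COMBOS) or a combo's share rounds down to zero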

('animal', 'pet')
('animal', 'petting')
('talisman', 'pet')
('animal', 'fur')
('talisman', 'fur')
('noun', 'fur')
('pet', 'fur')
('petting', 'fur')
('companionship', 'fur')
('animal', 'pet')
('animal', 'miaow')
('talisman', 'pet')
('animal', 'vet')
('animal', 'cattery')
('talisman', 'vet')
('animal', 'feline')
('animal', 'mammal')
('talisman', 'feline')
('animal', 'cat')
('talisman', 'cat')
('noun', 'cat')
('pet', 'pet')
('pet', 'miaow')
('petting', 'pet')
('pet', 'vet')
('pet', 'cattery')
('petting', 'vet')
('pet', 'feline')
('pet', 'mammal')
('petting', 'feline')
('pet', 'cat')
('petting', 'cat')
('companionship', 'cat')
('pet', 'fur')
('miaow', 'fur')
('meow', 'fur')
('vet', 'fur')
('cattery', 'fur')
('catnip', 'fur')
