In [30]:
import pandas as pd
import nltk
from nltk.corpus import wordnet as wn, stopwords
from nltk.corpus.reader import Synset
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
import spacy
from spacy.tokens import Doc

import numpy as np
from collections import Counter
from typing import Dict, List, Tuple

In [18]:
# Fetch the NLTK corpora this notebook reads: WordNet (synsets and lemmatizer)
# and the English stop-word list used by clean_sentence. Without the second
# download, stopwords.words('english') fails on a machine that lacks the corpus.
nltk.download('wordnet')
nltk.download('stopwords')
# spaCy pipeline used to embed definitions and synset texts
embedder = spacy.load("en_core_web_md")

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\amato\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
# Load the definitions table (tab-separated; one column per word, one row per definition).
definitions_raw = pd.read_csv('resources/definitions.tsv', sep='\t')

# The first column of the file only holds a row index, so keep the word columns alone.
# The raw frame keeps its own name instead of being overwritten.
definitions = definitions_raw.iloc[:, 1:]
definitions.head()

Unnamed: 0,door,ladybug,pain,blurriness
0,"A construction used to divide two rooms, tempo...","small flying insect, typically red with black ...",A feeling of physical or mental distress,sight out of focus
1,"It's an opening, it can be opened or closed.","It is an insect, it has wings, red with black ...","It is a feeling, physical or emotional. It is ...","It is the absence of definite borders, shapele..."
2,"An object that divide two room, closing an hol...",An insect that can fly. It has red or orange c...,A felling that couscious beings can experince ...,A sensation felt when you can't see clearly th...
3,Usable for access from one area to another,Small insect with a red back,Concept that describes a suffering living being,Lack of sharpness
4,Structure that delimits an area and allows acc...,Small round flying insect,Feeling of physical discomfort,Characteristic of lack of clarity or precision


In [4]:
# Map every word (a column name) to the list of definitions stored in its column,
# so later cells can index definitions by word without touching the dataframe.
definitions_dict: Dict[str, List[str]] = {
    word: definitions[word].tolist() for word in definitions.columns
}

In [5]:
# Show each word (upper-cased) followed by the first definition collected for it
for word, word_definitions in definitions_dict.items():
    print(f'- {word.upper()}: \n\t{word_definitions[0]}')

- DOOR: 
	A construction used to divide two rooms, temporarily closing the passage between them
- LADYBUG: 
	small flying insect, typically red with black spots with six legs
- PAIN: 
	A feeling of physical or mental distress
- BLURRINESS: 
	sight out of focus


### Function to calculate the signature of the definitions

For each word, the signature is a list of words that are present in the definitions of the word. The words are lemmatized and the punctuation and stop words are removed.

In [6]:
lemmatizer = WordNetLemmatizer()
# Built once here instead of inside clean_sentence: stopwords.words() was being
# evaluated again for every single token, and membership tests on a list are linear.
_STOP_WORDS = frozenset(stopwords.words('english'))
# Keeps runs of word characters only, which drops punctuation
_WORD_TOKENIZER = RegexpTokenizer(r'\w+')

def clean_sentence(sentence: str) -> List[str]:
    """Return the signature of a sentence: its lower-cased, lemmatized content words.

    Steps, in order:
      1. lower-case the text;
      2. tokenize on runs of word characters (punctuation is discarded);
      3. drop NLTK English stop words;
      4. lemmatize each remaining token (no part of speech is passed to the lemmatizer).

    Args:
        sentence: raw text, e.g. one definition.

    Returns:
        The remaining lemmas in their original order; duplicates are kept.
        An empty list if nothing survives the filtering.
    """
    tokens = _WORD_TOKENIZER.tokenize(sentence.lower())
    return [lemmatizer.lemmatize(token) for token in tokens if token not in _STOP_WORDS]

In [7]:
# extract the first-sense synsets of the nouns found among the most frequent signature words
def get_synsets_for_word(signature: List[str], top_n: int = 10) -> List[Synset]:
    """Return the first WordNet synset of each noun among the most frequent words.

    The `top_n` most frequent words of `signature` are selected first; a word is
    then kept only if WordNet knows it and its first synset is a noun. The result
    can therefore hold fewer than `top_n` synsets.

    Args:
        signature: cleaned words of a definition (as produced by clean_sentence).
        top_n: how many of the most frequent words to consider (default 10).

    Returns:
        The first synset of every retained word, ordered by decreasing frequency.

    Side effects:
        Prints the retained (word, count) pairs.
    """
    word_frequency = Counter(signature)
    most_common_words: List[Tuple[str, int]] = word_frequency.most_common(top_n)

    most_common_nouns: List[Tuple[str, int]] = []
    synsets: List[Synset] = []
    for word, count in most_common_words:
        # Single WordNet lookup per word, reused for both the filter and the result
        word_synsets = wn.synsets(word)
        if word_synsets and word_synsets[0].pos() == 'n':
            most_common_nouns.append((word, count))
            synsets.append(word_synsets[0])

    print(most_common_nouns)

    return synsets

In [8]:
# Try the helpers on the first definition collected for 'door':
# build its signature, print it, then display the synsets returned for it.
door_def_1 = definitions_dict['door'][0]
signature = clean_sentence(door_def_1)
print(signature)
# Last expression of the cell: the returned list of synsets is shown as the cell output
get_synsets_for_word(signature)

['construction', 'used', 'divide', 'two', 'room', 'temporarily', 'closing', 'passage']
[('construction', 1), ('divide', 1), ('two', 1), ('room', 1), ('closing', 1), ('passage', 1)]


[Synset('construction.n.01'),
 Synset('divide.n.01'),
 Synset('two.n.01'),
 Synset('room.n.01'),
 Synset('shutting.n.01'),
 Synset('passage.n.01')]

## TODO

Starting from the synsets of the most common words in the definitions, navigate the WordNet graph to find the synset with maximum overlap between definition signature and synset signature.

Approach 1: find the synset with maximum overlap between definition signature and synset signature

In [9]:
def get_synset_signature(synset: Synset) -> List[str]:
    """Build the signature of a synset from its gloss and its usage examples.

    The gloss and every example are joined into a single text, which is then
    cleaned with clean_sentence.

    Args:
        synset: the WordNet synset to describe.

    Returns:
        The cleaned words of the gloss followed by those of the examples.
    """
    gloss_and_examples = [synset.definition(), *synset.examples()]
    return clean_sentence(" ".join(gloss_and_examples))

def get_max_overlap_synset(definition: str) -> Synset:
    """Find the WordNet synset whose signature shares the most words with a definition.

    Every synset returned by wn.all_synsets() is scanned, so a call is slow.
    The overlap is the number of distinct words common to the cleaned definition
    and the synset signature (gloss + examples).

    Args:
        definition: raw definition text.

    Returns:
        The first synset reaching the maximum overlap, or None when no synset
        shares any word with the definition (including when the definition has
        no content words left after cleaning).
    """
    # Loop-invariant: build the definition's word set once, not once per synset
    signature_words = set(clean_sentence(definition))
    if not signature_words:
        # Nothing can overlap with an empty signature; skip the full WordNet scan
        return None

    max_overlap = 0
    max_overlap_synset = None
    for synset in wn.all_synsets():
        overlap = len(signature_words.intersection(get_synset_signature(synset)))
        # Strict comparison: on ties the earliest synset encountered is kept
        if overlap > max_overlap:
            max_overlap = overlap
            max_overlap_synset = synset
    return max_overlap_synset

In [10]:
# Approach 1 on the first 'door' definition.
# get_max_overlap_synset iterates over every synset in WordNet, so this cell is slow.
door_synset = get_max_overlap_synset(door_def_1)
print(door_synset)

Synset('caterpillar.n.02')


In [11]:
# Show the gloss of the selected synset to judge how well it matches the definition
print(door_synset.definition())

a large tracked vehicle that is propelled by two endless metal belts; frequently used for moving earth in construction and farm work


Approach 2: starting from the most general noun synset (`entity.n.01`), explore the WordNet graph by following, at each step, the hyponym branch with the maximum cosine similarity between the definition signature and the synset.

In [45]:
def embed_sentence(sentence: str) -> Doc:
    """Embed a sentence after reducing it to its cleaned signature.

    Args:
        sentence: raw text to embed.

    Returns:
        The spaCy Doc produced by `embedder` for the space-joined cleaned words.
    """
    cleaned_tokens = clean_sentence(sentence)
    return embedder(" ".join(cleaned_tokens))

Vector for 'A construction used to divide two rooms, temporarily closing the passage between them':
construction used divide two room temporarily closing passage


In [79]:
def get_synset_embedding(synset: Synset) -> Doc:
    """Embed a synset from its lemma names, gloss and usage examples.

    Args:
        synset: the WordNet synset to embed.

    Returns:
        The spaCy Doc produced by `embedder` for the cleaned synset text.
    """
    # Lemma names use underscores for multi-word expressions; turn them into spaces
    lemma_text = ' '.join(lemma.name().replace('_', ' ') for lemma in synset.lemmas())
    gloss_text = synset.definition()
    example_text = ' '.join(synset.examples())

    # Same text as lemmas + ' ' + gloss + ' ' + examples
    raw_text = ' '.join([lemma_text, gloss_text, example_text])

    # Clean the text the same way definitions are cleaned before embedding it
    cleaned_tokens = clean_sentence(raw_text)
    return embedder(" ".join(cleaned_tokens))

In [41]:
# Sanity check of get_synset_embedding on the first synset WordNet lists for 'door'
door_synset = wn.synsets('door')[0]
door_synset_embedding = get_synset_embedding(door_synset)

# similarity between the synset embedding and a related word ('door') versus an unrelated one ('penguin')
# NOTE(review): this cell's execution count (41) is lower than that of the cell defining
# get_synset_embedding (79), so the recorded output may come from an earlier version of
# the function; re-run to confirm.
print(door_synset_embedding.similarity(embedder('door')))
print(door_synset_embedding.similarity(embedder('penguin')))

1.0
0.08536926363462019


In [74]:
def find_best_synset(definition: str, threshold: float = 0.001, verbose: bool = True) -> Synset:
    """Greedily walk down the WordNet hyponym tree towards a definition.

    Starting from 'entity.n.01', each step embeds every hyponym of the current
    synset and moves to the one most similar to the definition, provided it is
    more similar than the best score reached so far. The starting synset itself
    is not scored: the baseline similarity is 0, so hyponyms with a similarity
    of 0 or less are never selected.

    The walk stops when:
      * the current synset has no hyponyms, or
      * no hyponym beats the best similarity so far (the current synset is kept), or
      * the best hyponym improves the similarity by no more than `threshold`
        (the walk still moves to that hyponym, then stops).

    Args:
        definition: raw definition text to match.
        threshold: minimum similarity gain required to keep descending.
        verbose: when True, print every synset visited during the walk.

    Returns:
        The synset where the walk stopped. This is always a synset: if no
        hyponym of the starting synset scores above 0 (for example when the
        definition has no content words), the starting synset is returned
        rather than None.
    """
    # Compute the target definition embedding
    target_doc = embed_sentence(definition)

    # Current node of the walk, initialized to the most general noun synset
    current_synset = wn.synset('entity.n.01')
    # Best similarity reached so far; also the score of current_synset once we have moved
    current_similarity = 0.0

    while True:
        if verbose:
            print(current_synset)

        hyponyms = current_synset.hyponyms()
        if not hyponyms:
            # Leaf reached: nothing more specific to explore
            break

        # A hyponym is a candidate only if it beats the best similarity so far
        best_hyponym = None
        best_similarity = current_similarity
        for hyponym in hyponyms:
            hyponym_similarity = target_doc.similarity(get_synset_embedding(hyponym))
            if hyponym_similarity > best_similarity:
                best_similarity = hyponym_similarity
                best_hyponym = hyponym

        if best_hyponym is None:
            # No hyponym improves on the current position: stay here
            break

        previous_similarity = current_similarity
        current_synset = best_hyponym
        current_similarity = best_similarity

        # Keep descending only while the gain exceeds the threshold
        if not current_similarity > previous_similarity + threshold:
            break

    return current_synset

In [80]:
# Approach 2 on the fourth 'door' definition (index 3).
# find_best_synset prints each synset it visits; the final print shows the synset it returns.
door_synset = find_best_synset(definitions_dict['door'][3])
print(door_synset)

Synset('entity.n.01')
Synset('abstraction.n.06')
Synset('relation.n.01')
Synset('connection.n.01')
Synset('communication.n.03')
Synset('communication.n.03')
