In [128]:
import pandas as pd
import nltk
from nltk.corpus import wordnet as wn, stopwords
from nltk.corpus.reader import Synset
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
import spacy
from spacy.tokens import Doc

import numpy as np
from collections import Counter
from typing import Dict, List, Tuple

In [129]:
nltk.download('wordnet')
# spacy.cli.download("en_core_web_md")
embedder = spacy.load("en_core_web_md")

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Gianl\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [130]:
definitions = pd.read_csv('resources/definitions.tsv', sep='\t')
definitions.head()

# remove index from the dataframe (for each row it is the first element)
definitions = definitions.iloc[:, 1:]
definitions.head()

Unnamed: 0,door,ladybug,pain,blurriness
0,"A construction used to divide two rooms, tempo...","small flying insect, typically red with black ...",A feeling of physical or mental distress,sight out of focus
1,"It's an opening, it can be opened or closed.","It is an insect, it has wings, red with black ...","It is a feeling, physical or emotional. It is ...","It is the absence of definite borders, shapele..."
2,"An object that divide two room, closing an hol...",An insect that can fly. It has red or orange c...,A felling that couscious beings can experince ...,A sensation felt when you can't see clearly th...
3,Usable for access from one area to another,Small insect with a red back,Concept that describes a suffering living being,Lack of sharpness
4,Structure that delimits an area and allows acc...,Small round flying insect,Feeling of physical discomfort,Characteristic of lack of clarity or precision


In [131]:
# convert the dataframe to a dictionary for easier access
definitions_dict: Dict[str, List[str]] = {}
for column in definitions.columns:
    definitions_dict[column] = definitions[column].tolist()

In [132]:
# print every word and one of its definitions
for word in definitions_dict:
    print(f'- {word.upper()}: \n\t{definitions_dict[word][0]}')

- DOOR: 
	A construction used to divide two rooms, temporarily closing the passage between them
- LADYBUG: 
	small flying insect, typically red with black spots with six legs
- PAIN: 
	A feeling of physical or mental distress
- BLURRINESS: 
	sight out of focus


### Function to calculate the signature of the definitions

For each word, the signature is a list of words that are present in the definitions of the word. The words are lemmatized and the punctuation and stop words are removed.

In [133]:
lemmatizer = WordNetLemmatizer()

def clean_sentence(sentence: str) -> List[str]:
    # convert to list of words
    word_list = sentence.split()
    # convert to lower case
    word_list = [word.lower() for word in word_list]
    # tokenize the words and remove punctuation
    tokenizer = RegexpTokenizer(r'\w+')
    word_list = tokenizer.tokenize(' '.join(word_list))
    # remove stop words using the nltk stop words list
    word_list = [word for word in word_list if word not in stopwords.words('english')]
    # lemmatize the words
    word_list = [lemmatizer.lemmatize(word) for word in word_list]
    return word_list

In [134]:
# extract synset of 10 most frequent words in definitions
def get_synsets_for_word(signature: List[str]) -> List[Synset]:
    # calculate words frequency for a definition signature
    word_frequency = Counter(signature)
    most_common_words: List[Tuple] = word_frequency.most_common(10)
    
    # get 10 most common nouns from most common words
    most_common_nouns = [word_tuple for word_tuple in most_common_words if wn.synsets(word_tuple[0]) and wn.synsets(word_tuple[0])[0].pos() == 'n']
    
    print(most_common_nouns)
    
    # get the synsets for the most common nouns
    synsets: List[Synset] = [wn.synsets(word_tuple[0])[0] for word_tuple in most_common_nouns]
    return synsets

In [135]:
door_def_1 = definitions_dict['door'][0]
signature = clean_sentence(door_def_1)
print(signature)
get_synsets_for_word(signature)

['construction', 'used', 'divide', 'two', 'room', 'temporarily', 'closing', 'passage']
[('construction', 1), ('divide', 1), ('two', 1), ('room', 1), ('closing', 1), ('passage', 1)]


[Synset('construction.n.01'),
 Synset('divide.n.01'),
 Synset('two.n.01'),
 Synset('room.n.01'),
 Synset('shutting.n.01'),
 Synset('passage.n.01')]

### Testing method
Takes a callable function and applies it to the word definitions to obtain the initial word

In [136]:
def test_method(method):
    for word in definitions_dict:
        print(f'- {word.upper()}:')
        for definition in definitions_dict[word]:
            print(f'\t- {definition}')
            word_obtained = method(definition)
            print(f'\t\t- {word_obtained}')

Approach 1: find the synset with maximum overlap between definition signature and synset signature

In [137]:
def get_synset_signature(synset: Synset) -> List[str]:
    gloss: List[str] = synset.definition().split()
    examples: List[str] = " ".join(synset.examples()).split()
    synset_signature = clean_sentence(" ".join(gloss + examples))
    return synset_signature

def get_max_overlap_synset(definition: str) -> Synset:
    max_overlap = 0
    max_overlap_synset = None
    signature = clean_sentence(definition)
    # for each synset in wordnet
    for synset in wn.all_synsets():
        # get the signature of the synset
        synset_signature = get_synset_signature(synset)
        # calculate the overlap between the definition signature and the synset signature
        overlap = len(set(signature).intersection(set(synset_signature)))
        # if the overlap is greater than the maximum overlap, update the maximum overlap and the synset
        if overlap > max_overlap:
            max_overlap = overlap
            max_overlap_synset = synset
    return max_overlap_synset

In [138]:
# test_method(get_max_overlap_synset)

Approach 2: starting from the most general name synset explore the graph following the branch with maximum similarity between the definition signature and the synset.

In [139]:
def embed_sentence(sentence: str) -> Doc:
    sentence = " ".join(clean_sentence(sentence))
    return embedder(sentence)

In [140]:
def get_synset_embedding(synset: Synset) -> Doc:    
    # Concatenate the lemma names to form a text representation of the synset
    synset_lemmas = ' '.join(lemma.name().replace('_', ' ') for lemma in synset.lemmas())
    synset_gloss = synset.definition()
    synset_examples = ' '.join(example for example in synset.examples())
    synset_signature = synset_lemmas + ' ' + synset_gloss + ' ' + synset_examples

    # Create a Doc from the synset text
    synset_doc = embedder(" ".join(clean_sentence(synset_signature)))

    return synset_doc

In [141]:
door_synset = wn.synsets('door')[0]
door_synset_embedding = get_synset_embedding(door_synset)

# cosine similarity between the word and the synset
print(door_synset_embedding.similarity(embedder('door')))
print(door_synset_embedding.similarity(embedder('penguin')))

0.8198888470468051
0.1265559035095978


In [142]:
def find_best_synset(definition: str) -> Synset:
    # Compute the target definition embedding
    target_doc = embed_sentence(definition)

    # synset for the current node in the graph, initialized to the most general name synset
    current_synset = wn.synset('entity.n.01')
    # flag to check if the current synset is a leaf
    reached_leaf = False
    # holds the highest similarity between the hyponyms of current synset and the target definition
    highest_similarity = 0
    # similarity between the current synset and the target definition
    previous_similarity = -1
    # similarity between the target definition and the current synset
    current_similarity = 0
    # hyponym with the highest similarity to the target definition
    best_hyponym = None
    
    threshold = 0.00000001

    while not reached_leaf and current_similarity >= previous_similarity + threshold:
        # Get the hyponyms of the current synset
        hyponyms = current_synset.hyponyms()

        # If the current synset has no hyponyms, set the reached_leaf flag to True
        if not hyponyms:
            reached_leaf = True
        else:
            for hyponym in hyponyms:
                # Get the embedding of the hyponym
                hyponym_embedding = get_synset_embedding(hyponym)
        
                # Compute the similarity between the target definition and the current hyponym
                hyponym_similarity = target_doc.similarity(hyponym_embedding)
                
                # If the similarity is greater than the highest similarity, update the highest similarity and the current synset
                if hyponym_similarity > highest_similarity:
                    highest_similarity = hyponym_similarity
                    best_hyponym = hyponym
            # now we found the best hyponym for the current synset
            previous_similarity = current_similarity
            current_synset = best_hyponym
            current_similarity = highest_similarity

    return current_synset

In [143]:
test_method(find_best_synset)

- DOOR:
	- A construction used to divide two rooms, temporarily closing the passage between them
		- Synset('relation.n.01')
	- It's an opening, it can be opened or closed.
		- Synset('diagonal.n.04')
	- An object that divide two room, closing an hole in a wall. You can open the door to let people enter or get out.
		- Synset('change.n.06')
	- Usable for access from one area to another
		- Synset('communication.n.03')
	- Structure that delimits an area and allows access to it
		- Synset('abstraction.n.06')
	- an object that is used to block passage but can be moved to pass
		- Synset('abstraction.n.06')
	- An assembled object, historically made of wood, but also of iron or other materials, used to separate rooms in a building. Sometimes opened by moving a handle, or pushed, or locked and requires some means to unlock. it consists of the main body, the hinges on which it rotates, and a lock.
		- Synset('abstraction.n.06')
	- object used to go through rooms separate by a wall, can be ope

Approach 3: check similarity between word embeddings in SpaCy vocabulary and definition embeddings

In [144]:
def embedding_similarity(definition):
    # Embed the input definition
    definition_doc = embed_sentence(definition)

    # Calculate similarity between the input definition and all words in the vocabulary
    max_similarity = -1
    associated_word = None

    for word_embedding in embedder.vocab:
        # Ignore stopwords and punctuation
        if word_embedding.is_stop or word_embedding.is_punct:
            continue
        
        # Calculate similarity between the input definition and the current word
        similarity = definition_doc.similarity(word_embedding)
        
        # Update associated_word if similarity is higher
        if similarity > max_similarity:
            max_similarity = similarity
            associated_word = word_embedding.text
            
    return associated_word

In [145]:
test_method(embedding_similarity)

- DOOR:
	- A construction used to divide two rooms, temporarily closing the passage between them
		- separate
	- It's an opening, it can be opened or closed.
		- opened
	- An object that divide two room, closing an hole in a wall. You can open the door to let people enter or get out.


  similarity = definition_doc.similarity(word_embedding)


		- inside
	- Usable for access from one area to another
		- usable
	- Structure that delimits an area and allows access to it
		- access
	- an object that is used to block passage but can be moved to pass
		- intended
	- An assembled object, historically made of wood, but also of iron or other materials, used to separate rooms in a building. Sometimes opened by moving a handle, or pushed, or locked and requires some means to unlock. it consists of the main body, the hinges on which it rotates, and a lock.
		- actuating
	- object used to go through rooms separate by a wall, can be opened or closed
		- enclosing
	- something that can be opened, in order to access to another place
		- place
	- the access to a room
		- room
	- an object that allows access to a room
		- allows
	- Enclosing of an entrance that blocks off intruders as well as weather conditions. Can usually be locked with a key of some sort. At times it presents a small opening through which one can see outside.
		- normally

Approach 4:
 - Get the most frequent words in the definition (mfw)
 - Get synsets of mfw, its hyponims and hyperonims (mfw_synsets)
 - Calculate signature of mfw_synsets (mfw_synsets_signature)
 - Get synsets of the mfw_synsets with the best overlap between mfw_synsets_signature and definition signature

In [146]:
def get_mfw(definition: str) -> List[str]:
    # Get the most frequent words in the definition
    definition_words = clean_sentence(definition)
    word_frequency = Counter(definition_words)
    mfw = [word for word, _ in word_frequency.most_common(10)]
    return mfw


def get_hypo_hyper(synset):
    hyponyms = synset.hyponyms()
    hypernyms = synset.hypernyms()
    return hyponyms + hypernyms


def get_synsets_for_mfw(mfw: List[str]) -> List[Synset]:
    synsets: List[Synset] = []
    for word in mfw:
        synsets += wn.synsets(word)
        for synset in wn.synsets(word):
            # get hyponyms and hypernyms of the synset
            synsets += get_hypo_hyper(synset)
    return synsets

Calculate the signature of the synsets of the most frequent words in the definition

In [147]:
def approach4(definition: str):
    # Get the most frequent words in the definition
    mfw = get_mfw(definition)
    
    # Get the synsets for the most frequent words
    mfw_synsets = get_synsets_for_mfw(mfw)
    
    # Get the signature of the synsets and the definition
    synset_signatures = [get_synset_signature(synset) for synset in mfw_synsets]
    definition_signature = clean_sentence(definition)
    
    max_overlap = 0
    best_synset = None
    for synset, synset_signature in zip(mfw_synsets, synset_signatures):
        overlap = len(set(definition_signature).intersection(set(synset_signature)))
        if overlap > max_overlap:
            max_overlap = overlap
            best_synset = synset
    return best_synset

In [148]:
test_method(approach4)

- DOOR:
	- A construction used to divide two rooms, temporarily closing the passage between them
		- Synset('construction.n.05')
	- It's an opening, it can be opened or closed.
		- Synset('open.v.09')
	- An object that divide two room, closing an hole in a wall. You can open the door to let people enter or get out.
		- Synset('doorway.n.01')
	- Usable for access from one area to another
		- Synset('access.v.02')
	- Structure that delimits an area and allows access to it
		- Synset('area.n.05')
	- an object that is used to block passage but can be moved to pass
		- Synset('chock.n.01')
	- An assembled object, historically made of wood, but also of iron or other materials, used to separate rooms in a building. Sometimes opened by moving a handle, or pushed, or locked and requires some means to unlock. it consists of the main body, the hinges on which it rotates, and a lock.
		- Synset('moon.n.02')
	- object used to go through rooms separate by a wall, can be opened or closed
		- Synset('

### Eventually todo: use merged definitions