In [452]:
import pandas as pd
import nltk
from nltk.corpus import wordnet as wn, stopwords
from nltk.corpus.reader import Synset
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from collections import Counter
from typing import Dict, List, Tuple

In [453]:
# Fetch every NLTK resource this notebook relies on so it runs on a fresh
# environment: the stop-word corpus is read by stopwords.words('english') in
# clean_word_list, and WordNet backs the lemmatizer and the synset lookups.
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Gianl\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [454]:
# Read the tab-separated definitions table and drop its first column, which
# only holds the row index written to the file; the remaining columns are
# kept unchanged.
definitions = pd.read_csv('resources/definitions.tsv', sep='\t').iloc[:, 1:]
definitions.head()

Unnamed: 0,door,ladybug,pain,blurriness
0,"A construction used to divide two rooms, tempo...","small flying insect, typically red with black ...",A feeling of physical or mental distress,sight out of focus
1,"It's an opening, it can be opened or closed.","It is an insect, it has wings, red with black ...","It is a feeling, physical or emotional. It is ...","It is the absence of definite borders, shapele..."
2,"An object that divide two room, closing an hol...",An insect that can fly. It has red or orange c...,A felling that couscious beings can experince ...,A sensation felt when you can't see clearly th...
3,Usable for access from one area to another,Small insect with a red back,Concept that describes a suffering living being,Lack of sharpness
4,Structure that delimits an area and allows acc...,Small round flying insect,Feeling of physical discomfort,Characteristic of lack of clarity or precision


In [455]:
# Map every column name of the dataframe to the plain Python list of its
# values, which is easier to iterate over than the dataframe itself.
definitions_dict: Dict[str, List[str]] = {
    column: definitions[column].tolist() for column in definitions.columns
}

In [456]:
# show each word (upper-cased) followed by the first definition in its list
for word, word_definitions in definitions_dict.items():
    print(f'- {word.upper()}: \n\t{word_definitions[0]}')

- DOOR: 
	A construction used to divide two rooms, temporarily closing the passage between them
- LADYBUG: 
	small flying insect, typically red with black spots with six legs
- PAIN: 
	A feeling of physical or mental distress
- BLURRINESS: 
	sight out of focus


### Function to merge the definition strings of each word into a single string

In [457]:
def merge_definitions(definitions: Dict[str, List[str]]):
    """Concatenate the definitions of every word into one string per word.

    Args:
        definitions: Mapping from a word to the list of its definition strings.

    Returns:
        A new dictionary with the same keys, where each value is the word's
        definitions joined by a single space, in their original order.
    """
    return {
        word: ' '.join(word_definitions)
        for word, word_definitions in definitions.items()
    }

In [458]:
# show the single merged string built from all the definitions of 'door'
merged_door_text = merge_definitions(definitions_dict)['door']
print(merged_door_text)

A construction used to divide two rooms, temporarily closing the passage between them It's an opening, it can be opened or closed. An object that divide two room, closing an hole in a wall. You can open the door to let people enter or get out. Usable for access from one area to another Structure that delimits an area and allows access to it an object that is used to block passage but can be moved to pass An assembled object, historically made of wood, but also of iron or other materials, used to separate rooms in a building. Sometimes opened by moving a handle, or pushed, or locked and requires some means to unlock. it consists of the main body, the hinges on which it rotates, and a lock. object used to go through rooms separate by a wall, can be opened or closed something that can be opened, in order to access to another place the access to a room an object that allows access to a room Enclosing of an entrance that blocks off intruders as well as weather conditions. Can usually be loc

In [459]:
lemmatizer = WordNetLemmatizer()

def clean_word_list(word_list: List[str]):
    """Normalise a list of raw words into lemmatized content tokens.

    The words are lower-cased, joined with spaces and re-tokenized on runs of
    word characters (which discards punctuation), English stop words are
    removed, and every remaining token is lemmatized.

    Args:
        word_list: Raw words; they may still carry punctuation or mixed case.

    Returns:
        The cleaned tokens in their original order, duplicates included.
    """
    # Load the stop words once per call and keep them in a set: evaluating
    # stopwords.words('english') inside the filter below would rebuild the
    # whole list for every single token.
    english_stop_words = set(stopwords.words('english'))
    # r'\w+' keeps only runs of word characters, so punctuation is dropped
    # and words such as "it's" are split into "it" and "s".
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(' '.join(word.lower() for word in word_list))
    # Stop words are filtered before lemmatization.
    return [
        lemmatizer.lemmatize(token)
        for token in tokens
        if token not in english_stop_words
    ]

### Function to calculate the signature of the definitions

For each target word, the signature is the list of all the words that occur in its definitions (repeated words are kept). Before being added to the signature, the words are lower-cased and lemmatized, and punctuation and stop words are removed.

In [460]:
def calculate_signature(definitions: Dict[str, List[str]]):
    """Compute the signature of every word from its definitions.

    All the definitions of a word are merged into one string, split on
    whitespace and passed through ``clean_word_list``.

    Args:
        definitions: Mapping from a word to the list of its definition strings.

    Returns:
        A dictionary mapping each word to its list of cleaned tokens.
    """
    return {
        word: clean_word_list(merged_text.split())
        for word, merged_text in merge_definitions(definitions).items()
    }

In [461]:
# build the signatures of all the words, then show the one for 'door'
signatures = calculate_signature(definitions_dict)
door_signature = signatures['door']
print(door_signature)

['construction', 'used', 'divide', 'two', 'room', 'temporarily', 'closing', 'passage', 'opening', 'opened', 'closed', 'object', 'divide', 'two', 'room', 'closing', 'hole', 'wall', 'open', 'door', 'let', 'people', 'enter', 'get', 'usable', 'access', 'one', 'area', 'another', 'structure', 'delimits', 'area', 'allows', 'access', 'object', 'used', 'block', 'passage', 'moved', 'pas', 'assembled', 'object', 'historically', 'made', 'wood', 'also', 'iron', 'material', 'used', 'separate', 'room', 'building', 'sometimes', 'opened', 'moving', 'handle', 'pushed', 'locked', 'requires', 'mean', 'unlock', 'consists', 'main', 'body', 'hinge', 'rotates', 'lock', 'object', 'used', 'go', 'room', 'separate', 'wall', 'opened', 'closed', 'something', 'opened', 'order', 'access', 'another', 'place', 'access', 'room', 'object', 'allows', 'access', 'room', 'enclosing', 'entrance', 'block', 'intruder', 'well', 'weather', 'condition', 'usually', 'locked', 'key', 'sort', 'time', 'present', 'small', 'opening', 'on

In [462]:
# # Get the root synset for the noun hierarchy
# root_synset = wn.synset('entity.n.01')
# 
# # Retrieve synsets related to the root synset (e.g., hyponyms)
# hyponyms = root_synset.hyponyms()
# 
# # Print the synsets
# for synset in hyponyms:
#     print(synset)
#     print(synset.definition())
#     print(synset.examples())

In [463]:
def get_synsets_for_word(word: str, top_n: int = 10) -> List[Synset]:
    """Return WordNet synsets for the most frequent words of a signature.

    The ``top_n`` most frequent tokens of ``signatures[word]`` (the
    module-level signature dictionary) are looked up in WordNet. A token is
    kept only when WordNet knows it and its first synset is a noun, so the
    result can contain fewer than ``top_n`` synsets.

    The function also prints the synset of the single most frequent token
    (when that token is kept) and the list of kept ``(token, count)`` pairs.

    Args:
        word: Key of the module-level ``signatures`` dictionary.
        top_n: How many of the most frequent tokens to consider.

    Returns:
        The first synset of every kept token, ordered by decreasing token
        frequency. Empty when the signature is empty or no token qualifies.

    Raises:
        KeyError: If ``word`` has no entry in ``signatures``.
    """
    # calculate words frequency for the set of definitions of the word
    word_frequency = Counter(signatures[word])
    most_common_words: List[Tuple[str, int]] = word_frequency.most_common(top_n)

    # Query WordNet once per candidate and remember the first synset of
    # those tokens whose first synset is a noun.
    noun_candidates: List[Tuple[Tuple[str, int], Synset]] = []
    for word_tuple in most_common_words:
        candidate_synsets = wn.synsets(word_tuple[0])
        if candidate_synsets and candidate_synsets[0].pos() == 'n':
            noun_candidates.append((word_tuple, candidate_synsets[0]))

    # Show the synset of the overall most frequent token when it was kept;
    # the emptiness check avoids an IndexError on an empty signature.
    if noun_candidates and noun_candidates[0][0] == most_common_words[0]:
        print(noun_candidates[0][1])

    print([word_tuple for word_tuple, _ in noun_candidates])

    return [synset for _, synset in noun_candidates]

In [464]:
# synsets of the most frequent noun-like words used to define 'door'
door_synsets = get_synsets_for_word('door')
door_synsets

Synset('room.n.01')
[('room', 14), ('object', 14), ('access', 11), ('space', 7), ('wall', 5), ('block', 5), ('two', 4)]


[Synset('room.n.01'),
 Synset('object.n.01'),
 Synset('entree.n.02'),
 Synset('space.n.01'),
 Synset('wall.n.01'),
 Synset('block.n.01'),
 Synset('two.n.01')]