In [118]:
import pandas as pd
import nltk
from nltk.corpus import wordnet as wn, stopwords
from nltk.corpus.reader import Synset
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from collections import Counter
from typing import Dict, List, Tuple

In [119]:
# WordNet backs the lemmatizer and the synset lookups; the English stop-word
# list is read by clean_sentence. Fetch both so the notebook runs on a fresh
# environment (each call is a no-op when the corpus is already installed).
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Gianl\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [120]:
# Load the collected definitions; the file is tab-separated.
definitions_raw = pd.read_csv('resources/definitions.tsv', sep='\t')

# The first column of every row is the row index rather than a definition,
# so it is dropped positionally. The raw frame is kept under its own name so
# this cell does not overwrite its own input.
definitions = definitions_raw.iloc[:, 1:]
definitions.head()

Unnamed: 0,door,ladybug,pain,blurriness
0,"A construction used to divide two rooms, tempo...","small flying insect, typically red with black ...",A feeling of physical or mental distress,sight out of focus
1,"It's an opening, it can be opened or closed.","It is an insect, it has wings, red with black ...","It is a feeling, physical or emotional. It is ...","It is the absence of definite borders, shapele..."
2,"An object that divide two room, closing an hol...",An insect that can fly. It has red or orange c...,A felling that couscious beings can experince ...,A sensation felt when you can't see clearly th...
3,Usable for access from one area to another,Small insect with a red back,Concept that describes a suffering living being,Lack of sharpness
4,Structure that delimits an area and allows acc...,Small round flying insect,Feeling of physical discomfort,Characteristic of lack of clarity or precision


In [121]:
# Re-shape the frame into {column name: list of that column's values} so the
# definitions can be looked up by word without going through pandas.
definitions_dict: Dict[str, List[str]] = {
    column: definitions[column].tolist() for column in definitions.columns
}

In [122]:
# Show each word (upper-cased) followed by the first definition in its list.
for word, word_definitions in definitions_dict.items():
    first_definition = word_definitions[0]
    print(f'- {word.upper()}: \n\t{first_definition}')

- DOOR: 
	A construction used to divide two rooms, temporarily closing the passage between them
- LADYBUG: 
	small flying insect, typically red with black spots with six legs
- PAIN: 
	A feeling of physical or mental distress
- BLURRINESS: 
	sight out of focus


### Function to calculate the signature of the definitions

For each word, the signature is the list of words that appear in its definitions after cleaning: the text is lowercased, punctuation and stop words are removed, and the remaining words are lemmatized.

In [123]:
lemmatizer = WordNetLemmatizer()

def clean_sentence(sentence: str) -> List[str]:
    """Turn a free-text sentence into a list of lemmatized content words.

    The sentence is lowercased, split into runs of word characters (which
    discards punctuation), filtered against NLTK's English stop-word list and
    finally lemmatized with the module-level ``lemmatizer`` (called without a
    POS tag).

    Args:
        sentence: The raw text to clean.

    Returns:
        The lemmatized tokens in their original order; duplicates are kept.
    """
    # r'\w+' never matches whitespace or punctuation, so tokenizing the
    # lowercased sentence directly yields the same tokens as splitting on
    # whitespace and re-joining first.
    tokens = RegexpTokenizer(r'\w+').tokenize(sentence.lower())
    if not tokens:
        return []
    # Build the stop-word set once per call: evaluating stopwords.words()
    # inside the comprehension would repeat the lookup (and a linear scan of
    # the resulting list) for every single token.
    stop_words = set(stopwords.words('english'))
    return [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words]

In [124]:
def get_synsets_for_word(signature: List[str], top_n: int = 10) -> List[Synset]:
    """Return the first WordNet synset of the frequent, noun-first signature words.

    The ``top_n`` most frequent words of ``signature`` are taken, and a word
    is kept only when WordNet knows it and its first listed synset is a noun.
    The result can therefore hold fewer than ``top_n`` synsets.

    The kept ``(word, count)`` pairs are printed before returning.

    Args:
        signature: Cleaned words of one or more definitions (repeats count
            towards the frequency).
        top_n: How many of the most frequent words to consider. Defaults to 10.

    Returns:
        The first synset of every kept word, in frequency order.
    """
    # word frequencies within the signature, most frequent first
    word_frequency = Counter(signature)
    most_common_words: List[Tuple[str, int]] = word_frequency.most_common(top_n)

    most_common_nouns: List[Tuple[str, int]] = []
    synsets: List[Synset] = []
    for word, count in most_common_words:
        # Query WordNet once per word and reuse the result for both the
        # noun check and the returned synset.
        word_synsets = wn.synsets(word)
        if word_synsets and word_synsets[0].pos() == 'n':
            most_common_nouns.append((word, count))
            synsets.append(word_synsets[0])

    print(most_common_nouns)

    return synsets

In [126]:
# Try the pipeline on the first definition collected for "door".
door_definitions = definitions_dict['door']
door_def_1 = door_definitions[0]

signature = clean_sentence(door_def_1)
print(signature)

# Bare final expression so the notebook renders the returned synset list.
get_synsets_for_word(signature)

['construction', 'used', 'divide', 'two', 'room', 'temporarily', 'closing', 'passage']
[('construction', 1), ('divide', 1), ('two', 1), ('room', 1), ('closing', 1), ('passage', 1)]


[Synset('construction.n.01'),
 Synset('divide.n.01'),
 Synset('two.n.01'),
 Synset('room.n.01'),
 Synset('shutting.n.01'),
 Synset('passage.n.01')]

## TODO

Starting from the synsets of the most common words in the definitions, navigate the WordNet graph to find the synset with the maximum overlap between the definition signature and the synset signature.