First of all, we read the definitions of each word from the file `resources/definitions.tsv` and store them in a dictionary keyed by word.

In [28]:
import pandas as pd
from nltk.stem import WordNetLemmatizer
from typing import Dict, List

from resources.constants import punctuation, stop_words

In [29]:
# Load the tab-separated table of definitions (one column per word, as the
# conversion below relies on).
definitions = pd.read_csv('resources/definitions.tsv', sep='\t')

# The first column of every row is the row index, not a word: drop it.
definitions = definitions.iloc[:, 1:]

# Convert the dataframe to a dictionary for easier access:
# column header (the word) -> list of the values in that column (its definitions).
definitions_dict: Dict[str, List[str]] = {
    column: definitions[column].tolist() for column in definitions.columns
}

In [30]:
# show each word in upper case, followed by the first of its definitions
for word, word_defs in definitions_dict.items():
    print(f'- {word.upper()}: \n\t{word_defs[0]}')

- DOOR: 
	A construction used to divide two rooms, temporarily closing the passage between them
- LADYBUG: 
	small flying insect, typically red with black spots with six legs
- PAIN: 
	A feeling of physical or mental distress
- BLURRINESS: 
	sight out of focus


In [31]:
lemmatizer = WordNetLemmatizer()

def clean_word_list(word_list: List[str]):
    """Filter and normalise a list of tokens.

    Tokens found in ``punctuation`` or in ``stop_words`` are discarded; every
    remaining token is passed through the WordNet lemmatizer.

    Args:
        word_list: the tokens to clean.

    Returns:
        A new list with the lemmatized tokens that survived the filtering,
        in their original order.
    """
    return [
        lemmatizer.lemmatize(token)
        for token in word_list
        if token not in punctuation and token not in stop_words
    ]

In [32]:
def sentence_similarity(sentence1: str, sentence2: str):
    """Compute the lexical overlap between two sentences.

    Both sentences are split on whitespace and cleaned with
    ``clean_word_list``. The score is the number of distinct words the two
    cleaned sentences share, divided by the number of words in the shorter
    cleaned sentence.

    Args:
        sentence1: first sentence.
        sentence2: second sentence.

    Returns:
        The overlap score as a float; ``0.0`` when either sentence has no
        words left after cleaning.
    """
    # split the sentences into words and clean them
    words1 = clean_word_list(sentence1.split())
    words2 = clean_word_list(sentence2.split())
    # a sentence with no words left cannot share anything with the other one;
    # returning here also avoids a division by zero below
    shortest = min(len(words1), len(words2))
    if shortest == 0:
        return 0.0
    # count the distinct words the two sentences have in common
    shared = len(set(words1) & set(words2))
    # normalise by the length of the smaller sentence
    return shared / shortest

In [33]:
def definition_similarity(word: str) -> float:
    """Average the pairwise similarity of all the definitions of a word.

    Every unordered pair of definitions stored for ``word`` in
    ``definitions_dict`` is scored with ``sentence_similarity`` and the mean
    of those scores is returned.

    Args:
        word: a key of ``definitions_dict``.

    Returns:
        The mean similarity over all pairs of definitions.

    Raises:
        ValueError: if ``word`` is not in the dictionary, or if it has fewer
            than two definitions (there is no pair to compare, so the mean
            is undefined).
    """
    # if the word is not in the dictionary, raise error
    if word not in definitions_dict:
        raise ValueError(f'Word {word} not found in the dictionary')
    # get word definitions
    word_definitions = definitions_dict[word]
    # score each pair exactly once: a definition is only compared with the
    # ones that come after it
    similarities: List[float] = [
        sentence_similarity(def1, def2)
        for i, def1 in enumerate(word_definitions)
        for def2 in word_definitions[i + 1:]
    ]
    # without at least one pair the mean would divide by zero
    if not similarities:
        raise ValueError(
            f'Word {word} needs at least two definitions to compute a similarity'
        )
    return sum(similarities) / len(similarities)

In [34]:
# try on first word
word1 = list(definitions_dict)[0]
# keep this single score under its own name: `similarities` is used below for
# the dictionary holding the score of every word
word1_similarity = definition_similarity(word1)
print(f'Similarities for word {word1}: {word1_similarity}')

Similarities for word door: 0.1457782350286356


In [40]:
# score every word in the dictionary
similarities = {word: definition_similarity(word) for word in definitions_dict}

# report the score obtained by each word
for word, score in similarities.items():
    print(f'- {word.upper()}: \n\t{score}')

- DOOR: 
	0.1457782350286356
- LADYBUG: 
	0.3663837790561933
- PAIN: 
	0.14762269658821392
- BLURRINESS: 
	0.06509760992519609


In [41]:
# average the scores of the concrete concepts...
concrete_words = ['door', 'ladybug']
concrete_similarities = [similarities[name] for name in concrete_words]
concrete_mean = sum(concrete_similarities) / len(concrete_similarities)

# ...and of the abstract concepts
abstract_words = ['pain', 'blurriness']
abstract_similarities = [similarities[name] for name in abstract_words]
abstract_mean = sum(abstract_similarities) / len(abstract_similarities)

print(f'Mean similarity for concrete words: {concrete_mean}')
print(f'Mean similarity for abstract words: {abstract_mean}')

Mean similarity for concrete words: 0.25608100704241443
Mean similarity for abstract words: 0.106360153256705


In [42]:
# average the scores of the specific words...
specific_words = ['ladybug', 'blurriness']
specific_similarities = [similarities[name] for name in specific_words]
specific_mean = sum(specific_similarities) / len(specific_similarities)

# ...and of the general words
general_words = ['door', 'pain']
general_similarities = [similarities[name] for name in general_words]
general_mean = sum(general_similarities) / len(general_similarities)

print(f'Mean similarity for specific words: {specific_mean}')
print(f'Mean similarity for general words: {general_mean}')

Mean similarity for specific words: 0.2157406944906947
Mean similarity for general words: 0.14670046580842477
