In [13]:
import nltk
from tabulate import tabulate
from nltk.corpus import wordnet_ic
import numpy as np
# Make sure the NLTK data packages used below are installed; the log printed
# under this cell reports each one as already up-to-date when it is present.
# 'wordnet' backs the WordNet reader, and 'wordnet_ic' provides the
# information-content files (this notebook later loads 'ic-brown.dat').
nltk.download('wordnet')
nltk.download('wordnet_ic')
# NOTE(review): 'omw-1.4' is presumably needed by the installed NLTK release
# to load WordNet -- confirm. This call is the cell's last expression, so its
# return value (True) is what the notebook displays.
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package wordnet_ic to /root/nltk_data...
[nltk_data]   Package wordnet_ic is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [2]:
from nltk.corpus import wordnet as wn

In [3]:
# (word, tag) pairs of the example sentence, kept in sentence order.
# NOTE(review): 'PR' is not a key of tag_map, so tb_to_wn returns "none" for
# 'with' and 'whilst' and both words are left out of the analysis.
word_pairs = (
  ('the', 'DT'),
  ('man', 'NN'),
  ('swim', 'VB'),
  ('with', 'PR'),
  ('a', 'DT'),
  ('girl', 'NN'),
  ('and', 'CC'),
  ('a', 'DT'),
  ('boy', 'NN'),
  ('whilst', 'PR'),
  ('the', 'DT'),
  ('woman', 'NN'),
  ('walk', 'VB'),
)

In [14]:

# Penn Treebank tag -> WordNet part-of-speech letter.
# "n", "v", "a" and "r" are passed on to wn.synsets() as the POS argument;
# "none" marks tags whose words are skipped when synsets are collected.
tag_map = {
  "CC": "none",    # coordinating conjunction (and, but, or)
  "CD": "n",       # cardinal number (one, two)
  "DT": "none",    # determiner (a, the)
  "EX": "r",       # existential 'there' (there)
  "FW": "none",    # foreign word (mea culpa)
  "IN": "r",       # preposition / subordinating conjunction (of, in, by)
  "JJ": "a",       # adjective (yellow)
  "JJR": "a",      # adjective, comparative (bigger)
  "JJS": "a",      # adjective, superlative (wildest)
  "LS": "none",    # list item marker (1, 2, One)
  "MD": "none",    # modal (can, should)
  "NN": "n",       # noun, singular or mass (llama)
  "NNS": "n",      # noun, plural (llamas)
  "NNP": "n",      # proper noun, singular (IBM)
  "NNPS": "n",     # proper noun, plural (Carolinas)
  "PDT": "a",      # predeterminer (all, both)
  "POS": "none",   # possessive ending ('s)
  "PRP": "none",   # personal pronoun (I, you, he)
  "PRP$": "none",  # possessive pronoun (your, one's)
  "RB": "r",       # adverb (quickly, never)
  "RBR": "r",      # adverb, comparative (faster)
  "RBS": "r",      # adverb, superlative (fastest)
  "RP": "a",       # particle (up, off)
  "SYM": "none",   # symbol (+, %, &)
  "TO": "none",    # "to" (to)
  "UH": "none",    # interjection (ah, oops)
  "VB": "v",       # verb, base form (eat)
  "VBD": "v",      # verb, past tense (ate)
  "VBG": "v",      # verb, gerund (eating)
  "VBN": "v",      # verb, past participle (eaten)
  "VBP": "v",      # verb, non-3sg present (eat)
  "VBZ": "v",      # verb, 3sg present (eats)
  "WDT": "none",   # wh-determiner (which, that)
  "WP": "none",    # wh-pronoun (what, who)
  "WP$": "none",   # possessive wh-pronoun (whose)
  "WRB": "none",   # wh-adverb (how, where)
}

def tb_to_wn(tag):
  """Translate a Treebank POS tag into the WordNet POS code stored in tag_map.

  Args:
    tag: Treebank tag string, e.g. 'NN' or 'VB'.

  Returns:
    The tag_map entry for `tag` ("n", "v", "a", "r" or "none"), or the string
    "none" when `tag` is not a key of tag_map.
  """
  # A single lookup with a default covers both known and unknown tags, so no
  # code path is left unreachable.
  return tag_map.get(tag, "none")

In [16]:
brown_ic = wordnet_ic.ic('ic-brown.dat')

# Placeholder written into a table cell when a measure is unavailable.
MISSING = "-----"
# Lower anchor used to rescale the Leacock-Chodorow score (numerically -ln(18/38)).
# NOTE(review): the derivation of this value is not shown in the notebook -- confirm.
LCH_FLOOR = 0.7472144018302211


def _format_score(score):
  """Return `score` with two decimals, or MISSING when the measure gave None."""
  if score is None:
    return MISSING
  return "{:0.2f}".format(score)


def _scaled_lch(first, second, ceiling):
  """Leacock-Chodorow score of the pair, rescaled so that `ceiling` maps to 1.

  Returns None when the raw score or the ceiling is unavailable.
  """
  raw = first.lch_similarity(second)
  if raw is None or ceiling is None:
    return None
  return (raw - LCH_FLOOR) / (ceiling - LCH_FLOOR)


# Retrieve the first-listed synset of every word whose tag maps to a WordNet POS.
words = []
synsets = []
for word, tag in word_pairs:
  wn_pos = tb_to_wn(tag)
  if wn_pos == "none":
    continue
  candidates = wn.synsets(word, wn_pos)
  if not candidates:
    # Nothing to compare for this word; report it instead of failing on [0].
    print("No WordNet synset for", word, "with POS", wn_pos, "- skipped")
    continue
  words.append(word)
  synsets.append(candidates[0])

# Print the most common synonyms
print("Most common synsets:")
for word, synset in zip(words, synsets):
  print(word, ":", synset.name())

# Both tables carry one extra row and column for the synset-name headers.
labels = [synset.name() for synset in synsets]
size = len(synsets) + 1
similarity_matrix = [[None] * size for _ in range(size)]
LCS_matrix = np.full((size, size), None, dtype=object)

similarity_matrix[0][0] = "Similarity matrix"
LCS_matrix[0, 0] = "Least common subsumer"
for index, label in enumerate(labels, start=1):
  similarity_matrix[0][index] = label
  similarity_matrix[index][0] = label
  LCS_matrix[0, index] = label
  LCS_matrix[index, 0] = label

# Compute the similarities
for row, first_synset in enumerate(synsets, start=1):
  # The synset's LCH score with itself is the upper anchor of the rescaling
  # (a synset compared with itself then scores 1); it is the same for every
  # column, so it is computed once per row.
  lch_ceiling = first_synset.lch_similarity(first_synset)

  for col, second_synset in enumerate(synsets, start=1):
    path_score = first_synset.path_similarity(second_synset)
    wup_score = first_synset.wup_similarity(second_synset)

    # LCH, Lin and the common subsumer are only computed within one part of
    # speech; pos() is compared directly rather than slicing the repr string.
    if first_synset.pos() == second_synset.pos():
      lch_score = _scaled_lch(first_synset, second_synset, lch_ceiling)
      lin_score = first_synset.lin_similarity(second_synset, brown_ic)
      subsumers = first_synset.lowest_common_hypernyms(second_synset)
      LCS_matrix[row, col] = subsumers[0].name() if subsumers else MISSING
    else:
      lch_score = None
      lin_score = None
      LCS_matrix[row, col] = MISSING

    # Pad every entry to the placeholder's width so the '|' separators line up.
    scores = (path_score, lch_score, lin_score, wup_score)
    similarity_matrix[row][col] = " | ".join(
      _format_score(score).ljust(len(MISSING)) for score in scores
    ).rstrip()

print("\n")
print(tabulate(similarity_matrix, headers='firstrow', tablefmt='fancy_grid'))
print("The values in each cell coresponds to: Path Similarity | Leacock-Chodorow Similarity | Lin Similarity | Wu-Palmer Similarity")
print("\n")
print(tabulate(LCS_matrix, headers='firstrow', tablefmt='fancy_grid'))

Most common synsets:
man : man.n.01
swim : swim.v.01
girl : girl.n.01
boy : male_child.n.01
woman : woman.n.01
walk : walk.v.01


╒═════════════════════╤══════════════════════════════╤══════════════════════════════╤══════════════════════════════╤══════════════════════════════╤══════════════════════════════╤══════════════════════════════╕
│ Similarity matrix   │ man.n.01                     │ swim.v.01                    │ girl.n.01                    │ male_child.n.01              │ woman.n.01                   │ walk.v.01                    │
╞═════════════════════╪══════════════════════════════╪══════════════════════════════╪══════════════════════════════╪══════════════════════════════╪══════════════════════════════╪══════════════════════════════╡
│ man.n.01            │ 1.00  | 1.00  | 1.00  | 1.00 │ 0.10  | ----- | ----- | 0.18 │ 0.25  | 0.52  | 0.71  | 0.63 │ 0.33  | 0.62  | 0.73  | 0.67 │ 0.33  | 0.62  | 0.79  | 0.67 │ 0.10  | ----- | ----- | 0.18 │
├─────────────────────┼───────

1.   The similarity of a synset with itself is 1, as expected, since the path distance is 0, while synsets that are far apart, such as a noun and a verb, have a similarity close to 0.
2.   The similarity values can vary considerably between methods. For instance, for man.n.01 compared with girl.n.01 or woman.n.01, the largest gap between two methods is 0.46 (Path vs. Lin similarity). This shows that results from different methods cannot be compared directly, since they use very different formulas.
3.   The similarities are usually symmetric, but the Wu-Palmer similarity is 0.63 for woman - girl, while it is 0.95 for girl - woman. This is unexpected, since the LCS and the respective depths are the same in both directions. Furthermore, this asymmetry is not observed for the other word pairs.
4.   The synsets in the table represent meanings, not just lemmas. Different meanings of the same lemma are distinguished by the sense number. For instance, WordNet gives the example 'a young lady of 18' for girl.n.01, while girl.n.02 is illustrated by 'the baby was a girl'.

Conclusion
According to the results for the provided word set, the preferable method is Lin similarity. Firstly, it is the method whose output is most consistent with the human perception of synonymy. For instance, woman and girl should be more similar than woman and male child.
Furthermore, the equation considers more parameters, like the information content from the frequencies in a corpus, leading to more precise and reliable values.

To finish, it is important to mention that the sample size is not large enough to reach a solid conclusion about the best method, and also that these representations of synonyms and these formulas lack the flexibility to represent everyday human language efficiently.

In [29]:
# Examples of meanings of synsets: print the usage examples WordNet stores for
# two senses of "girl". The statements are active so that a fresh run
# reproduces the output recorded under this cell.
print(wn.synset("girl.n.01").examples())
print(wn.synset("girl.n.02").examples())

['a young lady of 18']
['the baby was a girl', 'the girls were just learning to ride a tricycle']
