In [15]:
import nltk
from nltk.corpus import wordnet_ic
import numpy as np
import nltk.tokenize
from nltk.corpus import wordnet as wn
import scipy.spatial.distance as distance
import gensim


# Fetch the NLTK data packages this notebook reads: the information-content
# files, WordNet itself, and the multilingual lemma data ('omw').
# NOTE(review): later cells call nltk.word_tokenize, which presumably needs the
# 'punkt' tokenizer data; it is not downloaded here — confirm it is installed.
nltk.download('wordnet_ic')
nltk.download('wordnet')
nltk.download('omw')

[nltk_data] Downloading package wordnet_ic to
[nltk_data]     C:\Users\Jukka\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet_ic is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Jukka\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw to
[nltk_data]     C:\Users\Jukka\AppData\Roaming\nltk_data...
[nltk_data]   Package omw is already up-to-date!


True

Tutorial based on http://www.nltk.org/howto/wordnet.html

## Words
We can query words/lemmas with wn.synsets("word")

We can also pass a name of the form <b>"word.pos.nn"</b> (lemma, part of speech, sense number) to wn.synset to query one specific synset

It is also possible to translate meanings to different languages

In [16]:
# Every sense (synset) that the word "cool" can take
print(wn.synsets('cool'))

# Definition of the first noun sense, followed by its English and Finnish lemma names
cool_noun = wn.synset('cool.n.01')
print("============")
print(cool_noun.definition())
for language in ('eng', 'fin'):
    print(cool_noun.lemma_names(language))
print("============")

# Reverse lookup: from a Finnish lemma back to the synsets it belongs to
print(wn.synsets('viileys', lang='fin'))

[Synset('cool.n.01'), Synset('aplomb.n.01'), Synset('cool.v.01'), Synset('cool.v.02'), Synset('cool.v.03'), Synset('cool.a.01'), Synset('cool.s.02'), Synset('cool.a.03'), Synset('cool.a.04'), Synset('cool.s.05'), Synset('cool.s.06')]
the quality of being at a refreshingly low temperature
['cool']
['viileys', 'vilpoisuus']
[Synset('coldness.n.02'), Synset('aplomb.n.01'), Synset('coldness.n.03'), Synset('chilliness.n.01'), Synset('cool.n.01'), Synset('distance.n.04'), Synset('withdrawal.n.04')]


## Synsets

A set of words that share a common meaning

Each synset has one or more lemmas

Each synset also contains
* Gloss - dictionary like definition
* Examples - Examples of word usage

In [17]:
def print_word_synsets(word, l='eng', p=""):
    """Print every WordNet synset of ``word`` with its gloss and usage examples.

    Args:
        word: Word (lemma) to look up.
        l: Language code forwarded to ``wn.synsets`` as ``lang``.
        p: Part-of-speech filter such as 'n' (noun), 'v' (verb) or 'a'
            (adjective). The default "" means "no filter".

    Returns:
        None. The report is written to stdout.
    """
    # wn.synsets only treats pos=None as "all parts of speech"; an empty string
    # matches nothing, so the default p="" used to report zero synsets.
    synsets = wn.synsets(word, lang=l, pos=p or None)
    print('Count of near synonyms: ' + str(len(synsets)))
    for syn in synsets:
        # `syn` already is the Synset object, so no second lookup by name is needed.
        examples = syn.examples()
        print("====================================================")
        print("Synset: " + syn.name())
        print("Gloss: " + syn.definition())
        print("examples: " + str(len(examples)))
        for example in examples:
            print("\t" + example)

In [18]:
# Report only the verb senses of "ice"
print_word_synsets('ice', l='eng', p="v")

Count of near synonyms: 3
Synset: frost.v.01
Gloss: decorate with frosting
examples: 1
	frost a cake
Synset: ice.v.02
Gloss: cause to become ice or icy
examples: 1
	an iced summer drink
Synset: ice.v.03
Gloss: put ice on or put on ice
examples: 1
	Ice your sprained limbs


## Similarities

We can measure word similarities on wordnet

In WordNet, similarity means that one word can always be replaced by the other

Word similarity can mean <b>similarity</b> or <b>words being related </b>
* Similar: tractor <-> car
* Related but not similar: car <-> gasoline

Words are defined to be similar if
* they share meaning(s) 
* are near synonyms 


In [19]:
# First noun senses of "dog" and "cat"; the similarity cells below reuse them
dog = wn.synset('dog.n.01')
cat = wn.synset('cat.n.01')

# Path similarity

Path similarity is calculated by how many edges there are between two words

For $pathlen(a,b)$ we pick the shortest path, i.e. the one with the fewest edges

Every edge is defined to have the same weight

$sim(a,b) = \frac{1}{pathlen(a,b) + 1}$ (NLTK adds 1 to the path length, so a synset compared with itself scores 1)

In [20]:
# Path-based similarity of the two synsets; as the last expression it is displayed by the notebook
dog.path_similarity(cat)

0.2

## Information content

Information content (IC) addresses the weakness of path similarity that every edge has the same weight

We can use information content to determine similarities between words. It is always based on the probability of finding a word in a corpus

In NLTK we can download ready made IC-dictionaries or use our own.


## Resnik similarity

Similarity based on the LCS, the lowest common subsumer, i.e. the most specific concept that is a hypernym of both concepts
* cat and dog share the hypernyms animal and mammal
* mammal is the lower of these two common hypernyms (in WordNet the actual lowest one for cat and dog is carnivore)

$P(c) = \frac{\sum_{w \in words(c)} count(w)}{N}$

$LCS(a, b) = $ the lowest (most specific) node that is a hypernym of both $a$ and $b$

$sim_{resnik}(a,b) = -\log P(LCS(a, b))$


In [21]:
# Load the ready-made information-content tables named 'ic-brown.dat' and 'ic-semcor.dat'
# NOTE(review): semcor_ic is not used in the visible cells — confirm it is still needed.
brown_ic = wordnet_ic.ic('ic-brown.dat')
semcor_ic = wordnet_ic.ic('ic-semcor.dat')
# Resnik similarity of dog and cat using the Brown table (displayed as the cell result)
dog.res_similarity(cat, brown_ic)

7.911666509036577

## Lin-similarity

Another way to calculate similarities based on information contents (based on resniks method)

$sim(a,b) = \frac{2 log P(LCS(a,b))}{log P(a) + log P(b)}$

In [22]:
# Lin similarity of dog and cat using the Brown information-content table
dog.lin_similarity(cat, brown_ic)

0.8768009843733973

## Worthwhile to mention

* Morphy -> we can try to lemmatize the word
* Domains of a word 

In [23]:
# morphy maps an inflected form to a base form ('churches' -> 'church')
print(wn.morphy('churches'))
# Topic domains attached to the third noun sense of "code"
print(wn.synset('code.n.03').topic_domains())

church
[Synset('computer_science.n.01')]


## Lesk algorithm

Tries to decode a word meaning based on the surrounding words.

We loop over all the senses of the word and choose one that shares most words with the target

The simplified Lesk algorithm in the book didn't seem to include hyponyms, but it seems to work better with them (as we get more descriptions for a word)

<b>The similarity is based on glosses</b>

In [24]:
def overlapcontext(sense, sentence):
    """Count the distinct tokens of ``sentence`` found in a sense's signature.

    The signature is the set of tokens in the sense's definition (gloss)
    together with the tokens of all of its usage examples. Tokens are compared
    as-is, so matching is case-sensitive and punctuation tokens count too.

    Args:
        sense: Object exposing ``definition()`` (str) and ``examples()``
            (iterable of str), e.g. a WordNet synset.
        sentence: Query sentence as a plain string.

    Returns:
        int: Number of distinct signature tokens that occur in ``sentence``.
    """
    signature = set(nltk.word_tokenize(sense.definition()))
    for example in sense.examples():
        # set.union() returns a new set and leaves the receiver untouched;
        # update() is what actually adds the example tokens to the signature.
        signature.update(nltk.word_tokenize(example))
    return len(signature.intersection(nltk.word_tokenize(sentence)))

def lesk(word, sentence):
    """Pick the WordNet sense of ``word`` whose texts overlap most with ``sentence``.

    Each sense is scored with ``overlapcontext`` on the sense itself plus the
    sum of ``overlapcontext`` over its direct hyponyms. The first sense with
    the strictly highest score wins.

    Args:
        word: Word to disambiguate; it is lemmatised with ``wn.morphy`` when
            morphy finds a base form.
        sentence: Sentence providing the context.

    Returns:
        The best-scoring synset, or None when no sense scores above zero.
    """
    lemma = wn.morphy(word)
    if lemma is not None:
        word = lemma

    winner, top_score = None, 0
    for candidate in wn.synsets(word):
        # Own overlap first, then the overlap contributed by each subtype.
        score = overlapcontext(candidate, sentence) + sum(
            overlapcontext(subtype, sentence) for subtype in candidate.hyponyms()
        )
        if score > top_score:
            winner, top_score = candidate, score
    return winner

In [25]:
# Disambiguate "cone" in a traffic-cone sentence, then show the chosen sense and its subtypes
sense = lesk("cone", "A traffic cone was tipped over")
print(sense, sense.definition(), sep=" | ")
print(sense.hyponyms())

Synset('cone.n.03') | cone-shaped mass of ovule- or spore-bearing scales or bracts
[Synset('fir_cone.n.01'), Synset('galbulus.n.01'), Synset('pinecone.n.01')]


## Lesk algorithm with W2V similarity measure

Instead of counting the overlap, we can pick the meaning that is most similar to the query sentence

In [12]:
# Load Google's pretrained word2vec vectors from a binary file under ./model
# (the path is relative to the notebook's working directory).
# limit=200000 caps how many word vectors are read from the file.
word2vec = gensim.models.KeyedVectors.load_word2vec_format('./model/GoogleNews-vectors-negative300.bin', binary=True, limit=200000) 

In [13]:
def calculate_w2v_cossim(sentance1, sentance2):
    """Cosine similarity between the mean word2vec vectors of two token lists.

    Tokens missing from the module-level ``word2vec`` vocabulary are skipped.

    Args:
        sentance1: Iterable of tokens (strings) of the first text.
        sentance2: Iterable of tokens (strings) of the second text.

    Returns:
        float: ``1 - cosine distance`` of the two averaged vectors, or 0.0 when
        either text has no token in the vocabulary.
    """
    # Index the KeyedVectors object directly: its `.wv` alias was deprecated
    # and is no longer available in newer gensim releases.
    vectors1 = [word2vec[word] for word in sentance1 if word in word2vec]
    vectors2 = [word2vec[word] for word in sentance2 if word in word2vec]

    # Nothing to average: np.mean([]) would yield nan and break the cosine call.
    if not vectors1 or not vectors2:
        return 0.0

    centroid1 = np.mean(vectors1, axis=0)
    centroid2 = np.mean(vectors2, axis=0)
    return 1 - distance.cosine(centroid1, centroid2)

def overlap_w2v_context(sense, sentence):
    """Best word2vec similarity between ``sentence`` and any text of ``sense``.

    The sentence is compared with the sense's definition and with each of its
    usage examples via ``calculate_w2v_cossim``; the highest score is returned.

    Args:
        sense: Object exposing ``definition()`` and ``examples()``.
        sentence: Query sentence as a plain string.

    Returns:
        The maximum of the computed similarities (via ``np.max``).
    """
    query_tokens = nltk.word_tokenize(sentence)

    # The definition comes first, followed by every usage example.
    texts = [sense.definition()]
    texts.extend(sense.examples())

    scores = [
        calculate_w2v_cossim(query_tokens, nltk.word_tokenize(text))
        for text in texts
    ]
    return np.max(scores)
    
def lesk_w2v(word, sentence):
    """Lesk-style disambiguation scored with word2vec similarity.

    Every sense of ``word`` is scored by the highest ``overlap_w2v_context``
    value among the sense itself and its direct hyponyms. Taking the maximum
    (not the sum) avoids favouring senses that simply have many hyponyms or
    examples.

    Args:
        word: Word to disambiguate; it is lemmatised with ``wn.morphy`` when
            morphy finds a base form.
        sentence: Sentence providing the context.

    Returns:
        The synset with the strictly highest similarity above 0, or None when
        the word has no senses or no sense scores above 0.
    """
    bestsense = None
    maxsimilarity = 0

    lemma = wn.morphy(word)
    if lemma is not None:
        word = lemma

    # Loop over all the possible meanings of the word
    for sense in wn.synsets(word):
        candidates = [sense]
        candidates.extend(sense.hyponyms())
        # Hyponyms must be scored on the same cosine scale as the sense itself.
        # The previous code used the word-count overlap here, so any hyponym
        # with a count of 1 or more outranked every cosine similarity.
        similarity = np.max(
            [overlap_w2v_context(candidate, sentence) for candidate in candidates]
        )
        if similarity > maxsimilarity:
            maxsimilarity = similarity
            bestsense = sense
    return bestsense

In [14]:
# Disambiguate the same word in the same sentence with both variants and compare the results.
# NOTE(review): lesk and lesk_w2v return None when no sense scores above zero, in which
# case the .definition() calls below would raise — confirm that is acceptable here.
sense = lesk_w2v("cook", "i like to cook food")
sense2 = lesk("cook", "i like to cook food")
print(sense)
print(sense.definition())
print("========================")
print(sense2)
print(sense2.definition())

Synset('cook.n.01')
someone who cooks food
Synset('cook.v.03')
transform and make suitable for consumption by heating


  This is separate from the ipykernel package so we can avoid doing imports until
  after removing the cwd from sys.path.
