# WordNet

In [225]:
from nltk.corpus import wordnet as wn
from nltk.corpus import stopwords
from nltk.wsd import lesk
import nltk
import collections
from nltk.tokenize.treebank import TreebankWordDetokenizer
from nltk.tokenize import TreebankWordTokenizer
from nltk.wsd import lesk

In [2]:
dogSyns = wn.synsets('dog')

In [3]:
dogSyns

[Synset('dog.n.01'),
 Synset('frump.n.01'),
 Synset('dog.n.03'),
 Synset('cad.n.01'),
 Synset('frank.n.02'),
 Synset('pawl.n.01'),
 Synset('andiron.n.01'),
 Synset('chase.v.01')]

In [4]:
dog = dogSyns[0]

In [5]:
dog.definition()

'a member of the genus Canis (probably descended from the common wolf) that has been domesticated by man since prehistoric times; occurs in many breeds'

In [6]:
dog.hyponyms()

[Synset('basenji.n.01'),
 Synset('corgi.n.01'),
 Synset('cur.n.01'),
 Synset('dalmatian.n.02'),
 Synset('great_pyrenees.n.01'),
 Synset('griffon.n.02'),
 Synset('hunting_dog.n.01'),
 Synset('lapdog.n.01'),
 Synset('leonberg.n.01'),
 Synset('mexican_hairless.n.01'),
 Synset('newfoundland.n.01'),
 Synset('pooch.n.01'),
 Synset('poodle.n.01'),
 Synset('pug.n.01'),
 Synset('puppy.n.01'),
 Synset('spitz.n.01'),
 Synset('toy_dog.n.01'),
 Synset('working_dog.n.01')]

In [7]:
dog.hypernyms()

[Synset('canine.n.02'), Synset('domestic_animal.n.01')]

In [8]:
dog.hypernyms()[0].hypernyms()

[Synset('carnivore.n.01')]

In [9]:
dog.hypernyms()[0].hypernyms()[0].hypernyms()

[Synset('placental.n.01')]

In [10]:
mammal = dog.hypernyms()[0].hypernyms()[0].hypernyms()[0].hypernyms()[0]

In [11]:
mammal.max_depth(), mammal.min_depth()

(9, 9)

In [12]:
dog.max_depth(), dog.min_depth()

(13, 8)

In [13]:
mammal.hypernyms()

[Synset('vertebrate.n.01')]

In [14]:
def getHypernyms(word):
    print(word.max_depth())
    return word.hypernyms()
mammal.tree(getHypernyms)

9
8
7
6
5
4
3
2
1
0


[Synset('mammal.n.01'),
 [Synset('vertebrate.n.01'),
  [Synset('chordate.n.01'),
   [Synset('animal.n.01'),
    [Synset('organism.n.01'),
     [Synset('living_thing.n.01'),
      [Synset('whole.n.02'),
       [Synset('object.n.01'),
        [Synset('physical_entity.n.01'), [Synset('entity.n.01')]]]]]]]]]]

In [226]:
def getDepthNHypernym(word, n=5): 
    if word.max_depth() > n: 
        hypernyms = word.hypernyms()
        # Check to make sure it's not an empty list, and that it is a list,
        # otherwise indexing with [0] will fail. 
        if type(word.hypernyms()) is list and len(word.hypernyms())>0:
            return getDepthNHypernym(word.hypernyms()[0], n)
    return word.hypernyms()

In [229]:
getDepthNHypernym(mammal, 5)

[Synset('living_thing.n.01')]

In [230]:
getDepthNHypernym(dog)

[Synset('living_thing.n.01')]

In [231]:
house = wn.synsets('house')[0]

In [232]:
getDepthNHypernym(house)

[Synset('artifact.n.01')]

In [36]:
garden = open('/home/jon/Code/course-computational-literary-analysis/Homework/garden.md').read()

In [233]:
gardenTokens = nltk.word_tokenize(garden)

In [234]:
gardenTags = nltk.pos_tag(gardenTokens)

In [235]:
gardenNoStops = [token for token in gardenTags if token[0] not in stopwords.words('english')]

In [236]:
gardenTags[40:50]

[(',', ','),
 ('the', 'DT'),
 ('sky', 'NN'),
 ('without', 'IN'),
 ('a', 'DT'),
 ('cloud', 'NN'),
 ('.', '.'),
 ('Only', 'RB'),
 ('the', 'DT'),
 ('blue', 'NN')]

In [237]:
wn.synsets('cold')[0].definition()

'a mild viral infection involving the nose and respiratory passages (but not the lungs)'

In [238]:
wn.synsets('cold')[3].definition()

'having a low or inadequate temperature or feeling a sensation of coldness or having been made cold by e.g. ice or refrigeration'

In [239]:
wn.synsets('cold', pos='a')

[Synset('cold.a.01'),
 Synset('cold.a.02'),
 Synset('cold.s.03'),
 Synset('cold.s.04'),
 Synset('cold.s.05'),
 Synset('cold.s.06'),
 Synset('cold.s.07'),
 Synset('cold.s.08'),
 Synset('cold.s.09'),
 Synset('cold.s.10'),
 Synset('cold.s.11'),
 Synset('cold.s.12'),
 Synset('cold.s.13')]

In [240]:
wn.synsets('cold', pos='n')

[Synset('cold.n.01'), Synset('coldness.n.03'), Synset('cold.n.03')]

In [241]:
tokenizer = TreebankWordTokenizer()

In [242]:
gardenSents = nltk.sent_tokenize(garden)

In [281]:
spans = list(tokenizer.span_tokenize(garden))
words = [garden[span[0]:span[1]] for span in spans]

In [282]:
categories = []
for sent in gardenSents: 
    tokens = nltk.word_tokenize(sent)
    wordsAndTags = nltk.pos_tag(tokens)
    for wordAndTag in wordsAndTags: 
        word, tag = wordAndTag
        if tag == 'NN' or tag == 'NNS': 
            # Do Word Sense Disambiguation (WSD)
            sense = nltk.wsd.lesk(sent, word, pos='n')
            if sense is not None: 
                cat = getDepthNHypernym(sense, 5)
                if type(cat) is list and len(cat)>0: 
                    categories.append((word, cat[0]))

In [283]:
categories[:10]

[('weather', Synset('natural_phenomenon.n.01')),
 ('day', Synset('cosmic_time.n.01')),
 ('sky', Synset('gas.n.02')),
 ('cloud', Synset('group.n.01')),
 ('blue', Synset('substance.n.01')),
 ('haze', Synset('natural_phenomenon.n.01')),
 ('gold', Synset('substance.n.01')),
 ('summer', Synset('time_period.n.01')),
 ('gardener', Synset('physical_entity.n.01')),
 ('dawn', Synset('time_period.n.01'))]

# An HTML/Javascript explorer for these categories

In [276]:
replacementDict = {}
for wordAndCat in categories: 
    word, cat = wordAndCat
    readableName = cat.name().split('.')[0]
    color = colorMap.get(cat, '')
    wrappedWord = '<span class="{}">{}</span>'.format(readableName, word)
    #print(word, wrappedWord)
    replacementDict[word] = wrappedWord

In [277]:
replaced = []
for token in gardenTokens: 
    if token in replacementDict: 
        replaced.append(replacementDict[token])
    else: 
        replaced.append(token)

In [285]:
detokenized = TreebankWordDetokenizer().detokenize(replaced)
niceCatList = [cat[0].name().split('.')[0] 
               for cat in collections.Counter([cat[1] for cat in categories]).most_common(15)]
catList = ''.join(["""<button onClick="show('{}')">{}</button>""".format(cat, cat) for cat in niceCatList])
out = '<div id="controls"><ul>{}</ul></div>'.format(catList)
text = '<div id="text">{}</div>'.format(detokenized)
text = '<h2>Instructions: click one of the buttons to the right. Reload the page to reset it.</h2>' + text
wrapper = '<main style="display: flex;">{}</main>'.format(text + out)
scripts = """<script>function show(c){document.querySelectorAll('span.' + c).forEach(function(i){i.style.color='red';})}</script>"""
page = '<html><head>{}</head><body>{}</body></html>'.format(scripts, wrapper)
open('categorized.html', 'w').write(page)

61397