In [49]:
import spacy
import pywsd
from nltk.corpus import wordnet as wn
from collections import Counter
import json

In [8]:
# Load spaCy's 'en_core_web_lg' pipeline once; `nlp` is reused below to parse
# the novel and to obtain its sentence boundaries.
nlp = spacy.load('en_core_web_lg')

In [9]:
# Read the whole novel into memory as a single string.
# The encoding is given explicitly so the text decodes the same way on every
# platform instead of depending on the locale default.
# NOTE(review): assumes the file is UTF-8 — confirm against the source file.
with open('training-data/chatterly.txt', encoding='utf-8') as f:
    chatterly = f.read()

In [10]:
# Parse the entire novel in a single call; the resulting Doc is the source of
# the sentence segmentation used in the following cells.
chatterlyDoc = nlp(chatterly)

In [11]:
# NOTE(review): spaCy's Doc.sents yields sentences lazily, so the object bound
# here can be iterated only once — read chatterlyDoc.sents again if a fresh
# pass over the sentences is needed.
chatterlySents = chatterlyDoc.sents

In [12]:
# Materialise the text of every sentence as a plain string.
# Iterate over chatterlyDoc.sents directly: it hands out a fresh iterator on
# each access, so this cell gives the same list every time it is re-run
# (a previously consumed generator would silently produce an empty list).
chatterlySentsStr = [sent.text for sent in chatterlyDoc.sents]

In [13]:
# Sanity check: how many sentences did the segmentation produce?
len(chatterlySentsStr)

13464

In [42]:
def wsd(sents):
    """Run word-sense disambiguation over a sequence of sentences.

    Each sentence string is passed to ``pywsd.disambiguate`` using the
    cosine Lesk algorithm.

    Args:
        sents: iterable of sentence strings.

    Returns:
        A list with one entry per input sentence, in input order; each entry
        is whatever ``pywsd.disambiguate`` returned for that sentence (in this
        notebook's outputs: a list of ``(token, Synset-or-None)`` pairs).
    """
    tagged = []
    for sentence in sents:
        senses = pywsd.disambiguate(sentence, algorithm=pywsd.lesk.cosine_lesk)
        tagged.append(senses)
    return tagged

In [14]:
# Inspect the first sentence. The output shows the chapter-heading markup
# ('## Chapter 1 {#index_split_001...}') is still attached to it, i.e. the
# text is not cleaned before parsing.
chatterlySentsStr[0]

'## Chapter 1 {#index_split_001.html#calibre_toc_2 .calibre4}\n\nOurs is essentially a tragic age, so we refuse to take it tragically.'

In [38]:
# Disambiguate every sentence of the novel.
# NOTE(review): presumably the slowest cell in the notebook — consider caching
# the result so a full re-run stays feasible; TODO confirm the runtime.
disambiguated = wsd(chatterlySentsStr)

In [39]:
# wsd returns one result per input sentence, so this should match len(chatterlySentsStr).
len(disambiguated) 

13464

In [40]:
# Spot-check the second sentence: each entry is a (token, Synset-or-None) pair.
disambiguated[1]

[('The', None),
 ('cataclysm', Synset('catastrophe.n.03')),
 ('has', None),
 ('happened', Synset('happen.v.03')),
 (',', None),
 ('we', None),
 ('are', None),
 ('among', None),
 ('the', None),
 ('ruins', Synset('ruin.n.02')),
 (',', None),
 ('we', None),
 ('start', Synset('start.v.08')),
 ('to', None),
 ('build', Synset('build.v.07')),
 ('up', None),
 ('new', Synset('newfangled.s.01')),
 ('little', Synset('little.a.02')),
 ('habitats', Synset('habitat.n.01')),
 (',', None),
 ('to', None),
 ('have', None),
 ('new', Synset('newfangled.s.01')),
 ('little', Synset('little.a.02')),
 ('hopes', Synset('hope.n.01')),
 ('.', None)]

In [41]:
# Another spot-check further into the text.
# NOTE(review): the output tags 'goes' as start.v.09, so the sense labels look
# noisy — keep that in mind when reading the counts derived from them.
disambiguated[50]

[('They', None),
 ('were', None),
 ('at', None),
 ('once', None),
 ('cosmopolitan', Synset('cosmopolitan.a.01')),
 ('and', None),
 ('provincial', Synset('provincial.a.02')),
 (',', None),
 ('with', None),
 ('the', None),
 ('cosmopolitan', Synset('cosmopolitan.a.01')),
 ('provincialism', Synset('sectionalism.n.01')),
 ('of', None),
 ('art', Synset('art.n.01')),
 ('that', None),
 ('goes', Synset('start.v.09')),
 ('with', None),
 ('pure', Synset('pure.s.04')),
 ('social', Synset('social.s.04')),
 ('ideals', Synset('ideal.n.02')),
 ('.', None)]

In [28]:
# Root synset used by isArtifact: anything that has it among its hypernyms
# counts as a man-made object. Looked up by its explicit name rather than by
# position in wn.synsets('artifact'), so the intended sense is unambiguous.
artifact = wn.synset('artifact.n.01')

In [21]:
def isArtifact(thing):
    """Tell whether a synset is, or descends from, the global ``artifact`` synset.

    Args:
        thing: an object exposing ``hypernym_distances()``, which yields
            ``(synset, distance)`` pairs (the synset itself appears at
            distance 0, as the notebook output for 'hand' shows).

    Returns:
        True if ``artifact`` is among the synsets returned by
        ``thing.hypernym_distances()``, otherwise False.
    """
    lineage = []
    for entry in thing.hypernym_distances():
        # Only the synset matters here; the distance is ignored.
        lineage.append(entry[0])
    return artifact in lineage

In [24]:
def getArtifacts(disambiguated):
    """Collect every word whose assigned sense is an artifact.

    Args:
        disambiguated: list of sentences, each a sequence of
            ``(word, synset_or_None)`` pairs.

    Returns:
        A list of ``(sentence_index, total_sentences, word, synset)`` tuples,
        in reading order, for each pair whose synset is not None and for
        which ``isArtifact(synset)`` is true.
    """
    total = len(disambiguated)
    return [
        (sentIdx, total, token, sense)
        for sentIdx, tagged in enumerate(disambiguated)
        for token, sense in tagged
        # Untagged tokens carry None and are never passed to isArtifact.
        if sense is not None and isArtifact(sense)
    ]

In [34]:
# Smoke-test on a single sentence. It is wrapped in a list because
# getArtifacts expects a list of sentences, so the index and total in the
# result are relative to this one-element list (0 and 1).
getArtifacts([disambiguated[1]])

[(0, 1, 'ruins', Synset('ruin.n.02'))]

In [26]:
# Collect artifact mentions across the whole novel.
artifacts = getArtifacts(disambiguated)

In [27]:
# First hit, as (sentence index, total sentences, word, synset).
artifacts[0]

(1, 13464, 'ruins', Synset('ruin.n.02'))

In [51]:
# Reduce each artifact hit to its synset name, then show the 30 most frequent
# senses together with their counts.
artifactSyns = [sense.name() for _sentIdx, _nSents, _word, sense in artifacts]
Counter(artifactSyns).most_common(30)

[('hand.n.08', 109),
 ('bit.n.11', 94),
 ('chair.n.01', 82),
 ('face.n.04', 75),
 ('room.n.01', 64),
 ('door.n.01', 63),
 ('head.n.29', 62),
 ('hut.n.01', 49),
 ('bungalow.n.01', 48),
 ('home_plate.n.01', 38),
 ('arm.n.04', 38),
 ('car.n.01', 32),
 ('back.n.08', 32),
 ('coat.n.01', 29),
 ('end.n.13', 27),
 ('water_system.n.02', 23),
 ('light.n.02', 22),
 ('hat.n.01', 20),
 ('road.n.01', 19),
 ('window.n.01', 19),
 ('key.n.15', 19),
 ('locomotive.n.01', 18),
 ('dormitory.n.01', 17),
 ('shirt.n.01', 17),
 ('chicken_coop.n.01', 17),
 ('mine.n.02', 16),
 ('blanket.n.01', 16),
 ('gun.n.01', 15),
 ('apparel.n.01', 15),
 ('leg.n.07', 15)]

In [47]:
# Confirm which WordNet sense `artifact` refers to.
artifact.name()

'artifact.n.01'

In [52]:
# Serialise the 30 most frequent artifact senses as a JSON object mapping
# synset name -> count (most frequent first).
json.dumps({name: count for name, count in Counter(artifactSyns).most_common(30)})

'{"hand.n.08": 109, "bit.n.11": 94, "chair.n.01": 82, "face.n.04": 75, "room.n.01": 64, "door.n.01": 63, "head.n.29": 62, "hut.n.01": 49, "bungalow.n.01": 48, "home_plate.n.01": 38, "arm.n.04": 38, "car.n.01": 32, "back.n.08": 32, "coat.n.01": 29, "end.n.13": 27, "water_system.n.02": 23, "light.n.02": 22, "hat.n.01": 20, "road.n.01": 19, "window.n.01": 19, "key.n.15": 19, "locomotive.n.01": 18, "dormitory.n.01": 17, "shirt.n.01": 17, "chicken_coop.n.01": 17, "mine.n.02": 16, "blanket.n.01": 16, "gun.n.01": 15, "apparel.n.01": 15, "leg.n.07": 15}'

[Synset('artifact.n.01')]

In [60]:
# Presumably a follow-up on 'hand.n.08' topping the artifact counts above:
# start from the first listed sense of 'hand' for comparison.
hand = wn.synsets('hand')[0]

In [61]:
# List every hypernym of that sense with its distance. In the output this
# sense (hand.n.01) is a body part and has no artifact ancestor.
hand.hypernym_distances()

{(Synset('body_part.n.01'), 3),
 (Synset('entity.n.01'), 7),
 (Synset('external_body_part.n.01'), 2),
 (Synset('extremity.n.05'), 1),
 (Synset('hand.n.01'), 0),
 (Synset('part.n.03'), 4),
 (Synset('physical_entity.n.01'), 6),
 (Synset('thing.n.12'), 5)}

In [64]:
# Look at the sense at index 7 instead; per the output it is the clock-hand sense.
# NOTE(review): presumably this is the 'hand.n.08' counted above — verify with .name().
wn.synsets('hand')[7].definition()

'a rotating pointer on the face of a timepiece'