# Hanks

In [1]:
from nltk.corpus import brown
from nltk.corpus import wordnet as wn
import spacy

## Lesk Algorithm

In [2]:
def lesk(word, sentence): #bag of words approach at WD
    best_sense = []
    max_overlap = -1
    context = set(sentence.split())
    for s in wn.synsets(word):
        signature = set(s.definition().split())
        for e in s.examples():
            signature.update(e.split())
        overlap = len(context & signature)
        if overlap > max_overlap:
            best_sense = s
            max_overlap = overlap
    return best_sense

## Choose a verb and select sentences from Brown corpus 

In [3]:
parser = spacy.load("en_core_web_sm")
v = ['buy', 'buys', 'bought'] # the verb to be used 
sents = [sent for sent in brown.sents() if v[0] in sent or v[1] in sent or v[2] in sent]

## (subj, obj) supersenses pairs extraction

In [4]:
pairs = []#pairs of subject/object supersenses
for sent in sents:
    subj = ''
    obj = ''
    root = False
    s = ' '.join(sent)
    for token in parser(s):
        if token.dep_ == 'ROOT' and token.text in v:
            root = True          
            print(sent)
        elif token.dep_ == 'nsubj' and token.head.text in v:
            subj = token.text
        elif token.dep_ == 'dobj' and token.head.text in v:
            obj = token.text
    if subj and obj and root:
        print('*', subj, obj)
        subsyn = lesk(subj, s)
        obsyn = lesk(obj, s)
        pairs.append((subsyn.lexname() if subsyn else 'noun.person', obsyn.lexname() if obsyn else 'noun.person'))#append supersenses, noun.person is the default value

['--', 'I', 'bought', '50', 'shares', 'of', 'Diversified', 'Growth', 'Stock', 'Fund', 'on', 'Oct.', '23', ',', '1959', ',', 'and', '50', 'more', 'shares', 'of', 'the', 'same', 'mutual', 'fund', 'on', 'Feb.', '8', ',', '1960', '.']
* I shares
['Ramsey', ',', 'as', "SMU's", 'food', 'wrangler', ',', 'buys', 'enough', 'groceries', 'to', 'serve', '32,000', 'meals', 'a', 'week', '.']
* Ramsey groceries
['I', 'bought', 'a', 'new', 'little', 'foreign', 'bomb', '.']
* I bomb
['Mantle', ',', 'more', 'concerned', 'with', 'dress', ',', 'buys', 'his', 'suits', 'four', 'at', 'a', 'time', 'at', 'Neiman-Marcus', 'in', 'Dallas', 'and', 'pays', 'as', 'much', 'as', '$250', 'each', '.']
* Mantle suits
['He', 'bought', 'up', 'Cezannes', ',', 'Braques', ',', 'Matisses', ',', 'Legers', ',', 'a', 'splendid', 'Picasso', 'series', ',', 'more', 'than', '70', 'Giacometti', 'sculptures', '.']
* He Cezannes
['What', 'to', 'buy', 'out', 'of', 'the', "year's", 'grist', 'of', 'nearly', '15,000', 'book', 'titles', '?',

## Pairs frequency

In [5]:
for pair in set(pairs):
    print(pair, pairs.count(pair), "/", len(pairs))

('noun.person', 'noun.object') 2 / 28
('noun.substance', 'noun.person') 2 / 28
('noun.possession', 'noun.quantity') 1 / 28
('noun.Tops', 'noun.location') 1 / 28
('noun.cognition', 'noun.communication') 1 / 28
('noun.person', 'noun.quantity') 1 / 28
('noun.substance', 'noun.possession') 1 / 28
('noun.substance', 'noun.quantity') 2 / 28
('noun.substance', 'verb.possession') 1 / 28
('noun.substance', 'noun.event') 1 / 28
('noun.person', 'noun.substance') 1 / 28
('noun.person', 'noun.animal') 1 / 28
('noun.communication', 'noun.artifact') 1 / 28
('noun.Tops', 'noun.communication') 1 / 28
('noun.event', 'noun.act') 1 / 28
('noun.substance', 'noun.artifact') 3 / 28
('noun.person', 'noun.cognition') 1 / 28
('noun.person', 'noun.act') 1 / 28
('noun.person', 'noun.artifact') 4 / 28
('noun.person', 'noun.food') 1 / 28
