**Asignment**: Senseval-2 system. Due October 7.
1. Describe your method
* Analyse the errors
* Include data and examples
* Suggest how to improve the algorithm

# Data preparation

In [1]:
import nltk
from nltk.corpus import senseval as se

x = se.instances("hard.pos")
ex = x[20:220]+x[3500:3550]+x[4100:4150]    #subset of 300 examples
s = set()
w = set()
f_text = lambda s : " ".join(next(zip(*s.context)))
for sent in ex:
    sense = sent.senses[0]
    s.add(sense)
    w.add(sent.word)

print(s)
print(w)
print(x[20].senses[0], f_text(x[20]))
print(x[3500].senses[0], f_text(x[3500]))
print(x[4100].senses[0], f_text(x[4100]))
del(s, x, w)

{'HARD1', 'HARD2', 'HARD3'}
{'hard-a'}
HARD1 the slump in business activity and the accompanying `` credit crunch '' have been especially hard on owners of these high-yielding , high-risk securities , whether they owned them directly or through mutual funds .
HARD2 he said tuesday there are no hard feelings .
HARD3 `` it 's very difficult to sell women 's apparel in a store whose reputation is hard goods , '' said monroe greenstein , an analyst at bear , stearns & co .


In [27]:
k = 250
print(ex[k].senses[0], ex[k].word, f_text(ex[k]))

HARD3 hard-a `` it 's very difficult to sell women 's apparel in a store whose reputation is hard goods , '' said monroe greenstein , an analyst at bear , stearns & co .


## Wordnet test

In [15]:
import nltk
from nltk.corpus import wordnet as wd

word = "hard"
w = wd.morphy(word)
if w:
  lemmas = wd.lemmas(w)
  for l in lemmas:
    syn = l.synset()
    print(syn.name())
    print(syn.definition())
    print
else:
  print("WordNet has no base form for", word)

difficult.a.01
not easy; requiring great physical or mental effort to accomplish or comprehend or endure
hard.a.02
dispassionate; 
hard.a.03
resisting weight or pressure
hard.s.04
very strong or vigorous
arduous.s.01
characterized by effort to the point of exhaustion; especially physical effort
unvoiced.a.01
produced without vibration of the vocal cords
hard.a.07
(of light) transmitted directly from a pointed light source
hard.a.08
(of speech sounds); produced with the back of the tongue raised toward or touching the velum
intemperate.s.03
given to excessive indulgence of bodily appetites especially for intoxicating liquors
hard.s.10
being distilled rather than fermented; having a high alcoholic content
hard.s.11
unfortunate or hard to bear
hard.s.12
dried out
hard.r.01
with effort or force or vigor
hard.r.02
with firmness
hard.r.03
earnestly or intently
hard.r.04
causing great damage or hardship
hard.r.05
slowly and with difficulty
heavily.r.07
indulging excessively
hard.r.07
into a s

In [None]:
# syn_key = 'hard.a.07'
# syn_key = 'unvoiced.a.01'
# syn_key = 'hard.s.11'
syn_key = 'hard.a.03'
s = wd.synset(syn_key)
print(s.definition())
print(s.examples())

## Util functions

In [10]:
from nltk.corpus import stopwords

def clear_sent(sentence):
    # Stop words
    unwanted = set(nltk.corpus.stopwords.words("english"))
    # Symbols
    unwanted.update(list('!"#$%&\'()*+,-./:;<=>? @[\\]^_`{|}~£'))
    unwanted.update(["''"])
    unwanted.update(["``"])
    unwanted.update(['""'])

    toks = [w.lower() for w in (sentence if isinstance(sentence, list) \
                    else sentence.split()) if w not in unwanted]
    return toks  

SENSE_MAP = {
    "HARD1": ["difficult.a.01", "hard.s.11"],
    "HARD2": ["hard.a.02", "difficult.a.01"],
    "HARD3": ["hard.a.03"],   
}
def from_eval_to_sense(s):
    if s in SENSE_MAP:
        return SENSE_MAP[s]
    return s

# Unsupervised algortihms evaluation

### Little test of lesk usage

In [16]:
def test(c):
    context = next(zip(*c.context))
    s = " ".join(context)
    print(s)
    print("_real: %s - %s\n with full: %s\n with filtered: %s" % 
      (c.senses, from_eval_to_sense(c.senses[0]),
       lesk(s, 'hard',pos='a'), lesk(clear_sent(s), 'hard',pos='a')))
    
k = 10
for i in range(k,k+5):
    print('-----%d' % i)
    test(ex[i])

-----10
`` i have a hard time with the use of the word . ''
_real: ('HARD1',) - ['difficult.a.01', 'hard.s.11']
 with full: Synset('hard.a.07')
 with filtered: Synset('unvoiced.a.01')
-----11
`` when you have a disaster like the oakland hills fire , it 's hard to remember everything .
_real: ('HARD1',) - ['difficult.a.01', 'hard.s.11']
 with full: Synset('hard.a.07')
 with filtered: Synset('unvoiced.a.01')
-----12
`` it was a unique project , a high-risk project , and it was put in such an unfortunate time slot ( opposite '48 hours ' and 'quantum leap ' ) that it would be hard to get anyone interested in it .
_real: ('HARD1',) - ['difficult.a.01', 'hard.s.11']
 with full: Synset('hard.a.07')
 with filtered: Synset('unvoiced.a.01')
-----13
`` four generations of our family have come here , and every year , it gets harder to get in . ''
_real: ('HARD1',) - ['difficult.a.01', 'hard.s.11']
 with full: Synset('hard.a.07')
 with filtered: Synset('unvoiced.a.01')
-----14
all the easy cuts and

## Lesk improvement

In [4]:
from nltk.tokenize import TreebankWordTokenizer
from nltk.corpus import wordnet as wd

# as more as we could google theme
functionwords = ['about', 'across', 'against', 'along', 'around', 'at',
                 'behind', 'beside', 'besides', 'by', 'despite', 'down',
                 'during', 'for', 'from', 'in', 'inside', 'into', 'near', 'of',
                 'off', 'on', 'onto', 'over', 'through', 'to', 'toward',
                 'with', 'within', 'without', 'anything', 'everything',
                 'anyone', 'everyone', 'ones', 'such', 'it', 'itself',
                 'something', 'nothing', 'someone', 'the', 'some', 'this',
                 'that', 'every', 'all', 'both', 'one', 'first', 'other',
                 'next', 'many', 'much', 'more', 'most', 'several', 'no', 'a',
                 'an', 'any', 'each', 'no', 'half', 'twice', 'two', 'second',
                 'another', 'last', 'few', 'little', 'less', 'least', 'own',
                 'and', 'but', 'after', 'when', 'as', 'because', 'if', 'what',
                 'where', 'which', 'how', 'than', 'or', 'so', 'before', 'since',
                 'while', 'although', 'though', 'who', 'whose', 'can', 'may',
                 'will', 'shall', 'could', 'be', 'do', 'have', 'might', 'would',
                 'should', 'must', 'here', 'there', 'now', 'then', 'always',
                 'never', 'sometimes', 'usually', 'often', 'therefore',
                 'however', 'besides', 'moreover', 'though', 'otherwise',
                 'else', 'instead', 'anyway', 'incidentally', 'meanwhile']

def overlapcontext( synset, sentence ):
    gloss = TreebankWordTokenizer().tokenize(synset.definition())
    gloss = set(clear_sent(gloss))
    for i in synset.examples():
         gloss.union(i)
    gloss = gloss.difference( functionwords )
    if isinstance(sentence, list):
        sentence = set(sentence)
    else:
        return
    sentence = sentence.difference( functionwords )
    return len( gloss.intersection(sentence) )

def lesk2( sentence, word, pos = None):
    bestsense = None
    maxoverlap = 0
    word=wd.morphy(word, pos=pos) if wd.morphy(word) is not None else word
    for sense in wd.synsets(word, pos=pos):
        overlap = overlapcontext(sense,sentence)
        for h in sense.hyponyms():
            overlap += overlapcontext( h, sentence )
        if overlap > maxoverlap:
                maxoverlap = overlap
                bestsense = sense
    return bestsense

### Little test of lesk updated

In [44]:
def test2(c):
    context = next(zip(*c.context))
    s = " ".join(context)
    print(s)
    print("_real: %s - %s\n with full: %s\n with filtered: %s" % 
      (c.senses, from_eval_to_sense(c.senses[0]),
       lesk2('hard',s), lesk2('hard', clear_sent(s))))
    
k = 260
for i in range(k,k+5):
    print('-----%d' % i)
    test2(ex[i])

## Manual check lesk

In [11]:
from ipywidgets import widgets
from IPython.display import display

sentence_input = widgets.Text(description="sentence")
word_input = widgets.Text(description="word")

display(sentence_input)
display(word_input)


btn = widgets.Button(description="submit text inputs")
display(btn)

def on_btn_click(e):
    word = word_input.value
    sentence = sentence_input.value
    
    sentence = TreebankWordTokenizer().tokenize(sentence)
    sentence = clear_sent(sentence)
    print(word, sentence)
    res = lesk2(word, sentence)
    print(res)
    
btn.on_click(on_btn_click)

## Automated test of lesk

In [13]:
from nltk.wsd import lesk

def clarify(x):
    s = f_text(x)
    s = clear_sent(s)
    s = 
#     return lesk2(s, 'hard', pos='a')
    return lesk(s, 'hard', pos='a')
    
res = []
for x in ex:
    y1 = clarify(x)
    y = from_eval_to_sense(x.senses[0])
    if y1:
        r = y1.name() in y
        res.append(r)
#         print(r,y1.name(),y)
#     else:
#         print(y1)

if True:
    t = len(list(filter(lambda x : x, res)))
    f = len(res) - t
    n = len(ex)
    prec = t/len(res)
    rec = t/n
    print("Precision =", prec)
    print("Recall =", rec)
    print("F1-score =", prec*rec/(prec+rec))

Precision = 0.0
Recall = 0.0


ZeroDivisionError: float division by zero

## Results

### Lesk

#### pos=None
- Precision = 0.5066666666666667
- Recall = 0.5066666666666667
- F1-score = 0.25333333333333335

#### pos='a'
- Precision = 0.0033333333333333335
- Recall = 0.0033333333333333335
- F1-score = 0.001666666666666667

### Lesk2

#### pos=None
- Precision = 0.5559701492537313
- Recall = 0.49666666666666665
- F1-score = 0.2623239436619718

#### pos='a'
- Precision = 0.5665399239543726
- Recall = 0.49666666666666665
- F1-score = 0.26465364120781526