In [196]:
import pandas as pd
from lxml import html
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn
from nltk.wsd import lesk
from pymorphy2 import MorphAnalyzer
from string import punctuation
import json, os
from collections import Counter
import numpy as np
from sklearn.metrics import f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.model_selection import KFold
from matplotlib import pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
morph = MorphAnalyzer()
punct = punctuation+'«»—…“”*№–'
stops = set(stopwords.words('russian'))

In [198]:
#сегментируем текст на предложения 
sent_text = [nltk.sent_tokenize(text) for text in open('corpus_eng.txt')]

In [199]:
#выбираем из всех предложений только те, в которых есть слово 'break'
sent_w_target_word = []
for lst in sent_text:
    for st in lst:
        st = st.split()
        if 'break' in st:
            sent_w_target_word.append(st)


In [42]:
#рандомно выбираем 10 разных предложений, содержащих слово 'break'
import random
ten_sent_w_target_word = []
for i in range(10):
    rand_sent = random.choice(sent_w_target_word)
    if rand_sent not in ten_sent_w_target_word:
        ten_sent_w_target_word.append(rand_sent)
        
for c in ten_sent_w_target_word:
    print(c)

['Those', 'who', 'break', 'the', 'law', 'are', 'punished', 'by', 'public', 'caning.']
['For', 'the', 'lucky,', 'Thanksgiving', 'will', 'be', 'a', 'much-needed', 'break', 'from', 'the', 'relentless', 'news', 'cycle.']
['On', 'Monday’s', 'Dancing', 'with', 'the', 'Stars', ',', 'Olympic', 'gymnast', 'Laurie', 'Hernandez', 'took', 'a', 'break', 'from', 'her', 'rehearsal', 'schedule', 'to', 'visit', 'her', 'ailing', 'grandmother.']
['If', 'you', 'treat', 'your', 'lover', 'badly,', "she's", 'going', 'to', 'break', 'up', 'with', 'you."']
['Taking', 'advantage', 'of', 'this', 'break', 'from', 'campaigning,', 'Knox', 'returned', 'to', 'Massachusetts', 'with', 'the', 'goal', 'of', 'improving', 'weapons', 'production.']
['"It', 'is', 'always', 'difficult', 'to', 'play', 'after', 'a', 'Europa', '[League]', 'game', 'because', 'of', 'the', '[short]', 'break', 'you', 'get', 'after', 'the', 'game,"', 'he', 'said.']
['About', '31,000', 'Tumblekins', 'toys', 'are', 'being', 'recalled', 'by', 'Internatio

In [29]:
for synset in wn.synsets('break'):
    print('--', synset, '--', synset.definition(), synset.examples())

-- Synset('interruption.n.02') -- some abrupt occurrence that interrupts an ongoing activity ['the telephone is an annoying interruption', 'there was a break in the action when a player was hurt']
-- Synset('break.n.02') -- an unexpected piece of good luck ['he finally got his big break']
-- Synset('fault.n.04') -- (geology) a crack in the earth's crust resulting from the displacement of one side with respect to the other ['they built it right over a geological fault', "he studied the faulting of the earth's crust"]
-- Synset('rupture.n.02') -- a personal or social separation (as between opposing factions) ['they hoped to avoid a break in relations']
-- Synset('respite.n.02') -- a pause from doing something (as work) ['we took a 10-minute break', 'he took time out to recuperate']
-- Synset('breakage.n.03') -- the act of breaking something ['the breakage was unavoidable']
-- Synset('pause.n.01') -- a time interval during which there is a temporary cessation of something []
-- Synset('fr

In [22]:
for i, c in enumerate(ten_sent_w_target_word):
    print(i+1,' '.join(c))

1 Those who break the law are punished by public caning.
2 For the lucky, Thanksgiving will be a much-needed break from the relentless news cycle.
3 On Monday’s Dancing with the Stars , Olympic gymnast Laurie Hernandez took a break from her rehearsal schedule to visit her ailing grandmother.
4 If you treat your lover badly, she's going to break up with you."
5 Taking advantage of this break from campaigning, Knox returned to Massachusetts with the goal of improving weapons production.
6 "It is always difficult to play after a Europa [League] game because of the [short] break you get after the game," he said.
7 About 31,000 Tumblekins toys are being recalled by International Playthings as the toy can break into small pieces with sharp edges, posing a laceration hazard. More » ©Landscape Structures
8 Every year, workers on both sides of the camera are maimed, burned, break bones and even die striving to deliver entertainment that packs multiplexes and commands top TV ratings.
9 Around 7 

Попытаемся определить верные значения (создадим подобие золотого стандарта):
    1. Synset('transgress.v.01')
    2. Synset('pause.n.01')
    3. Synset('respite.n.02')
    4. Synset('separate.v.08')
    5. Synset('pause.n.01'), Synset('interruption.n.02')
    6. Synset('pause.n.01'), Synset('respite.n.02')
    7. Synset('break.v.02')
    8. Synset('fracture.v.06') 
    9. Synset('separate.v.08')
    10. Synset('pause.n.01'), Synset('respite.n.02')

Для начала определим стоп-слова, от которых будем очищать предложения и определения-примеры

In [31]:
stopwords = ['a', 'an', 'any', 'is', 'and', 'about', 'across', 'against', 'along', 'around', 'at','behind', 'beside', 'besides', 'by', 'despite', 'down',
            'ourselves', 'during', 'finally', 'for', 'from', 'in', 'inside', 'into', 'near', 'of', 'off', 'on', 'onto', 'over', 'through', 'to', 'toward',
            'with', 'within', 'without', 'anything', 'everything', 'anyone', 'everyone', 'ones', 'such', 'it', 'itself', 'nor'
            'something', 'nothing', 'some', 'someone', 'the', 'this', 'that', 'every', 'all', 'both', 'one', 'first', 'other',
            'next', 'many', 'much', 'more', 'most', 'several', 'no', 'each', 'half', 'twice', 'two', 'second',
            'another', 'last', 'few', 'little', 'less', 'least', 'own', 'but', 'after', 'when', 'as', 'because', 'if', 'what',
            'we','where', 'which', 'how', 'than', 'or', 'so', 'before', 'since', 'while', 'although', 'though', 'who', 'whose', 'can', 'may',
            'will', 'shall', 'could', 'be', 'do', 'have', 'might', 'would', 'should', 'must', 'here', 'there', 'now', 'then', 'always',
            'never', 'sometimes', 'usually', 'often', 'therefore', 'however', 'besides', 'moreover', 'though', 'otherwise',
            'else', 'instead', 'anyway', 'incidentally', 'meanwhile','i']


Если честно, алгоритм Леска, который был предложен на паре, работал из рук вон плохо: во время тестирования он не определил верно значение слова 'break' ни в одном предложении -- я решила его переписать.

### Эксперименты:

- пробую исключить стоп-слова
- в качестве контекста я использую не заданное окно, а всё предложение 
- сравниваю контекст слова не только с определениями синсетов, но и с примерами их употребления
- включаю контексты не только значений слова, но и определение и примеры употреблений гипонимов каждого из значений
- подключаю лемматизацию
- пробую сравнивать контексты с учётом части речи

### Лучший результат: 4(8)/10

### Проблемы, с которыми связано использование алгоритма Леска для слова 'break':

- слово многозначно (75 определений в WordNet!)
- to break - фразовый глагол (значение меняется в засимости от разнообразных предлогов), а в WordNet'е многие из его значений не представлены
- употребляется в самых разных контекстах, в том числе специфических - от геологии до спорта
- в WordNet'е содержатся весьма короткие определения, примеры употребления тоже коротки и их немного (у некоторых значений и вовсе нет) 
- невелика вероятность того, что контексты слова из случайной выборки совпадут с определениями и ограниченным кругом примеров употребления из WordNet
- нет возможности выделить pos-тег из леммы слова, а в определениях и примерах само слово может даже не употребляться - это ведёт к затруднениям при использования части речи в качестве ограничивающего признака 

## Алгоритм Леска

### Контекст - всё предложение, исключение стоп-слов, контекст слова сравнивается с определениями и примерами употребления каждого из значений

In [35]:
def context_intersection(synset, sentence):
    cont_def = set(word_tokenize(synset.definition()))
    cont_def = cont_def.difference(stopwords)
    cont_ex = set(word_tokenize(' '.join(synset.examples())))
    cont_ex = cont_ex.difference(stopwords)
    cont_synset = cont_ex.union(cont_def)
    
    sentence = set(sentence)
    sentence = sentence.difference(stopwords)
    return len(cont_synset.intersection(sentence))

def lesk(word, sentence):
    bestsense = None
    maxoverlap = 0
    word = wn.morphy(word) if wn.morphy(word) is not None else word
    for meaning in wn.synsets(word):
        overlap = context_intersection(meaning,sentence)
        #for hyp in meaning.hyponyms():
            #overlap += context_intersection(hyp, sentence)
        if overlap > maxoverlap:
                maxoverlap = overlap
                bestsense = meaning
    return bestsense
    
    
word = 'break'

for c in ten_sent_w_target_word:
    a = lesk(word, c)
    print('--', a, '--', ' '.join(c))

-- Synset('transgress.v.01') -- Those who break the law are punished by public caning.
-- Synset('interruption.n.02') -- For the lucky, Thanksgiving will be a much-needed break from the relentless news cycle.
-- Synset('respite.n.02') -- On Monday’s Dancing with the Stars , Olympic gymnast Laurie Hernandez took a break from her rehearsal schedule to visit her ailing grandmother.
-- Synset('break.n.12') -- If you treat your lover badly, she's going to break up with you."
-- Synset('interruption.n.02') -- Taking advantage of this break from campaigning, Knox returned to Massachusetts with the goal of improving weapons production.
-- Synset('break.n.02') -- "It is always difficult to play after a Europa [League] game because of the [short] break you get after the game," he said.
-- Synset('interruption.n.02') -- About 31,000 Tumblekins toys are being recalled by International Playthings as the toy can break into small pieces with sharp edges, posing a laceration hazard. More » ©Landscape 

1. **идеальное попадание**
2. **очень близко по смыслу**
3. **идеальное попадание**
4. мимо
5. **идеальное попадание**
6. **очень близко по смыслу** 
7. мимо
8. мимо
9. мимо
10. **идеальное попадание**

**Итого 4(6)/10**

Значение 'interruption.n.02' весьма популярно, потому что у него присутствует сам глагол 'break' в определениях; 
4, 7, 8, 9 - в этих предложениях есть слова, не имеющие отношения к получившимся значениям слова break, но встречающиеся в определениях/примерах.
Контексты гораздо более специфичные и разнообразные, чем примеры в WordNet (а у некоторых значений примеров вообще нет!)

### + включаю контексты не только значений слова, но и определение и примеры употреблений гипонимов каждого из значений

In [36]:
def context_intersection(synset, sentence):
    cont_def = set(word_tokenize(synset.definition()))
    cont_def = cont_def.difference(stopwords)
    cont_ex = set(word_tokenize(' '.join(synset.examples())))
    cont_ex = cont_ex.difference(stopwords)
    cont_synset = cont_ex.union(cont_def)
    
    sentence = set(sentence)
    sentence = sentence.difference(stopwords)
    return len(cont_synset.intersection(sentence))

def lesk(word, sentence):
    bestsense = None
    maxoverlap = 0
    word = wn.morphy(word) if wn.morphy(word) is not None else word
    for meaning in wn.synsets(word):
        overlap = context_intersection(meaning,sentence)
        for hyp in meaning.hyponyms():
            overlap += context_intersection(hyp, sentence)
        if overlap > maxoverlap:
                maxoverlap = overlap
                bestsense = meaning
    return bestsense
    

word = 'break'

for c in ten_sent_w_target_word:
    a = lesk(word, c)
    print('--', a, '--', ' '.join(c))

-- Synset('transgress.v.01') -- Those who break the law are punished by public caning.
-- Synset('break.v.02') -- For the lucky, Thanksgiving will be a much-needed break from the relentless news cycle.
-- Synset('break.v.02') -- On Monday’s Dancing with the Stars , Olympic gymnast Laurie Hernandez took a break from her rehearsal schedule to visit her ailing grandmother.
-- Synset('separate.v.08') -- If you treat your lover badly, she's going to break up with you."
-- Synset('break.v.02') -- Taking advantage of this break from campaigning, Knox returned to Massachusetts with the goal of improving weapons production.
-- Synset('pause.n.01') -- "It is always difficult to play after a Europa [League] game because of the [short] break you get after the game," he said.
-- Synset('break.v.05') -- About 31,000 Tumblekins toys are being recalled by International Playthings as the toy can break into small pieces with sharp edges, posing a laceration hazard. More » ©Landscape Structures
-- Synset

1. **идеальное попадание**
2. мимо
3. мимо
4. **идеальное попадание**
5. мимо
6. **идеальное попадание**
7. **очень близко по смыслу** 
8. **очень близко по смыслу** (не та часть речи)
9. **идеальное попадание**
10. **очень близко по смыслу** (не та часть речи)

**Итого 4(7)/10**

Очень любопытный результат: с учётом гипонимов алгоритм исправил предыдущие ошибки, но 'слетел' на том, что было неплохо определено до этого. Если бы губы Никанора Ивановича да приставить к носу Ивана Кузьмича... Видно, что сейчас превосходно определяется значение 'separate.v.08' (то есть связанное с расставанием, разрывом отношений): оба случая пойманы. Все грубые ошибки связаны с выдачей значения 'break.v.02'(у него довольно много гипонимов, у которых, в свою очередь есть несколько примеров употребления) - алгоритм цепляется за другие слова. 8, 10 - результат, очень близкий к правде (в 8 должно быть 'fracture.v.06' - получили 'fracture.n.01'; в 10 должно быть 'pause.n.01' - получили 'pause.v.02'), лишь часть речи определена неверно (возможно, поможет pos-теггинг). Стоит еще ввести лемматизацию, так как в некоторых примерах употребление есть 'broke' - возможно, приведя всё к одной форме, мы выровняем баланс значений.

### Пробую подключить лемматизацию

In [None]:
#создаём функцию для лемматизации
from nltk.stem import WordNetLemmatizer
wl = WordNetLemmatizer()

def normalize(text):
    
    words = [word.strip(punct).lower() for word in text]
    w = [wl.lemmatize(word) for word in words if word]

    return w

In [44]:
def context_intersection(synset, sentence):
    cont_def = normalize(word_tokenize(synset.definition()))
    cont_def = set(cont_def)
    cont_def = cont_def.difference(stopwords)
    cont_ex = normalize(word_tokenize(' '.join(synset.examples())))
    cont_ex = set(cont_ex)
    cont_ex = cont_ex.difference(stopwords)
    cont_synset = cont_ex.union(cont_def)
    
    sentence = normalize(sentence)
    sentence = set(sentence)
    sentence = sentence.difference(stopwords)
    return len(cont_synset.intersection(sentence))

def lesk(word, sentence):
    bestsense = None
    maxoverlap = 0
    word = wn.morphy(word) if wn.morphy(word) is not None else word
    for meaning in wn.synsets(word):
        overlap = context_intersection(meaning,sentence)
        for hyp in meaning.hyponyms():
            overlap += context_intersection(hyp, sentence)
        if overlap > maxoverlap:
                maxoverlap = overlap
                bestsense = meaning
    return bestsense
    

word = 'break'

for c in ten_sent_w_target_word:
    a = lesk(word, c)
    print('--', a, '--', ' '.join(c))

-- Synset('transgress.v.01') -- Those who break the law are punished by public caning.
-- Synset('break.v.02') -- For the lucky, Thanksgiving will be a much-needed break from the relentless news cycle.
-- Synset('pause.v.02') -- On Monday’s Dancing with the Stars , Olympic gymnast Laurie Hernandez took a break from her rehearsal schedule to visit her ailing grandmother.
-- Synset('separate.v.08') -- If you treat your lover badly, she's going to break up with you."
-- Synset('break.v.02') -- Taking advantage of this break from campaigning, Knox returned to Massachusetts with the goal of improving weapons production.
-- Synset('break.v.05') -- "It is always difficult to play after a Europa [League] game because of the [short] break you get after the game," he said.
-- Synset('break.v.05') -- About 31,000 Tumblekins toys are being recalled by International Playthings as the toy can break into small pieces with sharp edges, posing a laceration hazard. More » ©Landscape Structures
-- Synset

1. **идеальное попадание**
2. мимо
3. **очень близко по смыслу** (не та часть речи)
4. **идеальное попадание**
5. мимо
6. мимо
7. **очень близко по смыслу** 
8. **очень близко по смыслу** (не та часть речи)
9. **идеальное попадание**
10. **очень близко по смыслу** (не та часть речи)

**Итого 3(7)/10**

2, 5 - снова ошибка, связанная с выдачей значения 'break.v.02'(у него довольно много гипонимов, у которых, в свою очередь есть несколько примеров употребления) - алгоритм цепляется за другие слова. 6, 7 - снова алгоритм цепляется за пересекающиеся слова, которые, тем не менее не соответствуют значению 'break' (вновь упираемся в ограниченность примеров и определений WordNet'а): 'difficult', 'pieces', 'sharp', 'game' действительно есть в 'break.v.05' и его гиперонимах - увы, значение определить они не помогают. В остальных случаях ошибки связаны, в основном, с неверно определённой частью речи.

### С лемматизацией, но без включения гипонимов

In [45]:
def context_intersection(synset, sentence):
    cont_def = normalize(word_tokenize(synset.definition()))
    cont_def = set(cont_def)
    cont_def = cont_def.difference(stopwords)
    cont_ex = normalize(word_tokenize(' '.join(synset.examples())))
    cont_ex = set(cont_ex)
    cont_ex = cont_ex.difference(stopwords)
    cont_synset = cont_ex.union(cont_def)
    
    sentence = normalize(sentence)
    sentence = set(sentence)
    sentence = sentence.difference(stopwords)
    return len(cont_synset.intersection(sentence))

def lesk(word, sentence):
    bestsense = None
    maxoverlap = 0
    word = wn.morphy(word) if wn.morphy(word) is not None else word
    for meaning in wn.synsets(word):
        overlap = context_intersection(meaning,sentence)
        #for hyp in meaning.hyponyms():
            #overlap += context_intersection(hyp, sentence)
        if overlap > maxoverlap:
                maxoverlap = overlap
                bestsense = meaning
    return bestsense
    

word = 'break'

for c in ten_sent_w_target_word:
    a = lesk(word, c)
    print('--', a, '--', ' '.join(c))

-- Synset('transgress.v.01') -- Those who break the law are punished by public caning.
-- Synset('interrupt.v.04') -- For the lucky, Thanksgiving will be a much-needed break from the relentless news cycle.
-- Synset('respite.n.02') -- On Monday’s Dancing with the Stars , Olympic gymnast Laurie Hernandez took a break from her rehearsal schedule to visit her ailing grandmother.
-- Synset('break.n.12') -- If you treat your lover badly, she's going to break up with you."
-- Synset('interruption.n.02') -- Taking advantage of this break from campaigning, Knox returned to Massachusetts with the goal of improving weapons production.
-- Synset('break.n.12') -- "It is always difficult to play after a Europa [League] game because of the [short] break you get after the game," he said.
-- Synset('break.n.02') -- About 31,000 Tumblekins toys are being recalled by International Playthings as the toy can break into small pieces with sharp edges, posing a laceration hazard. More » ©Landscape Structures

1. **идеальное попадание**
2. мимо
3. **идеальное попадание**
4. мимо
5. **идеальное попадание**
6. мимо
7. мимо
8. **очень близко по смыслу** (не та часть речи)
9. мимо
10. **идеальное попадание**

**Итого 4(5)/10**

И, вроде бы, точных попаданий стало на 1 больше, однако улавливание общего смысла (если не учитывать часть речи) ухудшилось. 

### Пробую учитывать часть речи 

In [62]:
import nltk
nltk.download('averaged_perceptron_tagger')
from nltk import pos_tag

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/dariabakshandaeva/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


In [189]:
#тегируем наши 10 предложений
sent_pos = []
for i in ten_sent_w_target_word:
    i_pos = nltk.pos_tag(i)
    sent_pos.append(i_pos)

In [191]:
def break_pos_tag(s):

    sent_break_pos = []

    for word in s:
        if word[0].lower() == 'break':
            sent_break_pos.append(word[0].lower() +'_'+ word[1])
        else:
            sent_break_pos.append(word[0].lower())
    return sent_break_pos

#оставляем теги только у слова 'break'
ten_sent_break_pos = []
for i in sent_pos:
    i_pos = break_pos_tag(i)
    ten_sent_break_pos.append(i_pos)
    
ten_sent_break_pos


[['those',
  'who',
  'break_VBP',
  'the',
  'law',
  'are',
  'punished',
  'by',
  'public',
  'caning.'],
 ['for',
  'the',
  'lucky,',
  'thanksgiving',
  'will',
  'be',
  'a',
  'much-needed',
  'break_NN',
  'from',
  'the',
  'relentless',
  'news',
  'cycle.'],
 ['on',
  'monday’s',
  'dancing',
  'with',
  'the',
  'stars',
  ',',
  'olympic',
  'gymnast',
  'laurie',
  'hernandez',
  'took',
  'a',
  'break_NN',
  'from',
  'her',
  'rehearsal',
  'schedule',
  'to',
  'visit',
  'her',
  'ailing',
  'grandmother.'],
 ['if',
  'you',
  'treat',
  'your',
  'lover',
  'badly,',
  "she's",
  'going',
  'to',
  'break_VB',
  'up',
  'with',
  'you."'],
 ['taking',
  'advantage',
  'of',
  'this',
  'break_NN',
  'from',
  'campaigning,',
  'knox',
  'returned',
  'to',
  'massachusetts',
  'with',
  'the',
  'goal',
  'of',
  'improving',
  'weapons',
  'production.'],
 ['"it',
  'is',
  'always',
  'difficult',
  'to',
  'play',
  'after',
  'a',
  'europa',
  '[league]',
  '

In [192]:
#тегируем слово 'break' в определениях и примерах употребления синсетов, потом применяем алгоритм Леска 
def context_intersection(synset, sentence):
    cont_def = normalize(word_tokenize(synset.definition()))
    cont_def = break_tag(nltk.pos_tag(cont_def))
    cont_def = set(cont_def)
    cont_def = cont_def.difference(stopwords)
    cont_ex = normalize(word_tokenize(' '.join(synset.examples())))
    cont_ex = break_tag(nltk.pos_tag(cont_ex))
    cont_ex = set(cont_ex)
    cont_ex = cont_ex.difference(stopwords)
    cont_synset = cont_ex.union(cont_def)
    
    sentence = normalize(sentence)
    sentence = set(sentence)
    sentence = sentence.difference(stopwords)
    return len(cont_synset.intersection(sentence))

def lesk(word, sentence):
    bestsense = None
    maxoverlap = 0
    word = wn.morphy(word) if wn.morphy(word) is not None else word
    for meaning in wn.synsets(word):
        overlap = context_intersection(meaning,sentence)
        for hyp in meaning.hyponyms():
            overlap += context_intersection(hyp, sentence)
        if overlap > maxoverlap:
                maxoverlap = overlap
                bestsense = meaning
    return bestsense
    

word = 'break'

for c in ten_sent_break_pos:
    a = lesk(word, c)
    print('--', a, '--', ' '.join(c))

-- Synset('transgress.v.01') -- those who break_VBP the law are punished by public caning.
-- Synset('unwrap.v.02') -- for the lucky, thanksgiving will be a much-needed break_NN from the relentless news cycle.
-- Synset('pause.v.02') -- on monday’s dancing with the stars , olympic gymnast laurie hernandez took a break_NN from her rehearsal schedule to visit her ailing grandmother.
-- Synset('separate.v.08') -- if you treat your lover badly, she's going to break_VB up with you."
-- Synset('break.n.14') -- taking advantage of this break_NN from campaigning, knox returned to massachusetts with the goal of improving weapons production.
-- Synset('pause.n.01') -- "it is always difficult to play after a europa [league] game because of the [short] break_NN you get after the game," he said.
-- Synset('break.v.05') -- about 31,000 tumblekins toys are being recalled by international playthings as the toy can break_VB into small pieces with sharp edges, posing a laceration hazard. more » ©landsca

1. **идеальное попадание**
2. мимо
3. **очень близко по смыслу** (не та часть речи)
4. **идеальное попадание**
5. мимо
6. **идеальное попадание**
7. **очень близко по смыслу**
8. **очень близко по смыслу** (не та часть речи - pos-тэггер неверно определил, в том числе) 
9. **идеальное попадание**
10. **очень близко по смыслу** (не та часть речи)

**Итого 4(8)/10**

В WordNet нет возможности выделить pos-тег из леммы слова, а в определениях и примерах само слово может даже не употребляться - это ведёт к затруднениям при использования части речи в качестве ограничивающего признака. Теггер NLTK, к тому же, не идеален - мы имеем возможность наблюдать его ошибку в случае с 'break bones'. В общем, с проблемой несоответствия части речи это нам мало помогло. 

### Учитываю часть речи, не исключаю стоп-слова, учитываю гипонимы

In [193]:
#тегируем слово 'break' в определениях и примерах употребления синсетов, потом применяем алгоритм Леска 
def context_intersection(synset, sentence):
    cont_def = normalize(word_tokenize(synset.definition()))
    cont_def = break_tag(nltk.pos_tag(cont_def))
    cont_def = set(cont_def)
    #cont_def = cont_def.difference(stopwords)
    cont_ex = normalize(word_tokenize(' '.join(synset.examples())))
    cont_ex = break_tag(nltk.pos_tag(cont_ex))
    cont_ex = set(cont_ex)
    #cont_ex = cont_ex.difference(stopwords)
    cont_synset = cont_ex.union(cont_def)
    
    sentence = normalize(sentence)
    sentence = set(sentence)
    #sentence = sentence.difference(stopwords)
    return len(cont_synset.intersection(sentence))

def lesk(word, sentence):
    bestsense = None
    maxoverlap = 0
    word = wn.morphy(word) if wn.morphy(word) is not None else word
    for meaning in wn.synsets(word):
        overlap = context_intersection(meaning,sentence)
        for hyp in meaning.hyponyms():
            overlap += context_intersection(hyp, sentence)
        if overlap > maxoverlap:
                maxoverlap = overlap
                bestsense = meaning
    return bestsense
    

word = 'break'

for c in ten_sent_break_pos:
    a = lesk(word, c)
    print('--', a, '--', ' '.join(c))

-- Synset('fracture.n.01') -- those who break_VBP the law are punished by public caning.
-- Synset('pause.n.01') -- for the lucky, thanksgiving will be a much-needed break_NN from the relentless news cycle.
-- Synset('fracture.n.01') -- on monday’s dancing with the stars , olympic gymnast laurie hernandez took a break_NN from her rehearsal schedule to visit her ailing grandmother.
-- Synset('separate.v.08') -- if you treat your lover badly, she's going to break_VB up with you."
-- Synset('fracture.n.01') -- taking advantage of this break_NN from campaigning, knox returned to massachusetts with the goal of improving weapons production.
-- Synset('pause.n.01') -- "it is always difficult to play after a europa [league] game because of the [short] break_NN you get after the game," he said.
-- Synset('break.v.02') -- about 31,000 tumblekins toys are being recalled by international playthings as the toy can break_VB into small pieces with sharp edges, posing a laceration hazard. more » ©land

### Учитываю часть речи, не исключаю стоп-слова, не учитываю гипонимы

In [194]:
#тегируем слово 'break' в определениях и примерах употребления синсетов, потом применяем алгоритм Леска 
def context_intersection(synset, sentence):
    cont_def = normalize(word_tokenize(synset.definition()))
    cont_def = break_tag(nltk.pos_tag(cont_def))
    cont_def = set(cont_def)
    #cont_def = cont_def.difference(stopwords)
    cont_ex = normalize(word_tokenize(' '.join(synset.examples())))
    cont_ex = break_tag(nltk.pos_tag(cont_ex))
    cont_ex = set(cont_ex)
    #cont_ex = cont_ex.difference(stopwords)
    cont_synset = cont_ex.union(cont_def)
    
    sentence = normalize(sentence)
    sentence = set(sentence)
    #sentence = sentence.difference(stopwords)
    return len(cont_synset.intersection(sentence))

def lesk(word, sentence):
    bestsense = None
    maxoverlap = 0
    word = wn.morphy(word) if wn.morphy(word) is not None else word
    for meaning in wn.synsets(word):
        overlap = context_intersection(meaning,sentence)
        #for hyp in meaning.hyponyms():
            #overlap += context_intersection(hyp, sentence)
        if overlap > maxoverlap:
                maxoverlap = overlap
                bestsense = meaning
    return bestsense
    

word = 'break'

for c in ten_sent_break_pos:
    a = lesk(word, c)
    print('--', a, '--', ' '.join(c))

-- Synset('fracture.n.01') -- those who break_VBP the law are punished by public caning.
-- Synset('interrupt.v.04') -- for the lucky, thanksgiving will be a much-needed break_NN from the relentless news cycle.
-- Synset('fault.n.04') -- on monday’s dancing with the stars , olympic gymnast laurie hernandez took a break_NN from her rehearsal schedule to visit her ailing grandmother.
-- Synset('fault.n.04') -- if you treat your lover badly, she's going to break_VB up with you."
-- Synset('fault.n.04') -- taking advantage of this break_NN from campaigning, knox returned to massachusetts with the goal of improving weapons production.
-- Synset('fault.n.04') -- "it is always difficult to play after a europa [league] game because of the [short] break_NN you get after the game," he said.
-- Synset('break.v.05') -- about 31,000 tumblekins toys are being recalled by international playthings as the toy can break_VB into small pieces with sharp edges, posing a laceration hazard. more » ©landscape

Почти всё захватило значение 'fault.n.04' (по всей видимости, с ним помогали справиться стоп-слова). Кажется, пора остановиться, пока не провалились в 'a crack in the earth's crust'.