In [27]:
import adagram
import gensim
import pandas as pd
from lxml import html
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn
from nltk.wsd import lesk
from pymorphy2 import MorphAnalyzer
from string import punctuation
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
morph = MorphAnalyzer()
punct = punctuation+'«»—…“”*№–'
stops = set(stopwords.words('english'))

In [253]:
def normalize(text):
    
    words = [word.strip(punct) for word in text.lower().split() if word and word not in stops]
    words = [morph.parse(word)[0].normal_form for word in words if word]

    return words

def tokenize(text):
    
    words = [word.strip(punct) for word in text.lower().split() if word and word not in stops]
    words = [word for word in words if word]

    return words

In [5]:
with open("corpus_eng.txt", "r", encoding="utf8") as file:
    corpus = file.read()

In [22]:
def split_into_sentences(text):
    text = text.replace("!!!","!<stop>")
    text = text.replace("?!","!<stop>")
    text = text.replace(". ",".<stop>")
    text = text.replace("?","?<stop>")
    text = text.replace("!","!<stop>")
    text = text.replace("\n"," <stop>")
    sentences = text.split("<stop>")
    sentences = [s.strip() for s in sentences]
    return sentences

In [29]:
sentences = split_into_sentences(corpus)

In [35]:
sentences_with_break = []
for sentence in sentences:
    words = sentence.split(" ")
    for word in words:
        if word == "break":
            sentences_with_break.append(sentence)

In [37]:
len(sentences_with_break)

352

In [80]:
# определения и примеры (нумерация с 1)
word = 'break'
num = 1
for synset in wn.synsets(word):
    print(str(num) + " " + word + ' - ' + synset.definition() + ' - ' + ' | '.join(synset.examples()))
    num += 1

1 break - some abrupt occurrence that interrupts an ongoing activity - the telephone is an annoying interruption | there was a break in the action when a player was hurt
2 break - an unexpected piece of good luck - he finally got his big break
3 break - (geology) a crack in the earth's crust resulting from the displacement of one side with respect to the other - they built it right over a geological fault | he studied the faulting of the earth's crust
4 break - a personal or social separation (as between opposing factions) - they hoped to avoid a break in relations
5 break - a pause from doing something (as work) - we took a 10-minute break | he took time out to recuperate
6 break - the act of breaking something - the breakage was unavoidable
7 break - a time interval during which there is a temporary cessation of something - 
8 break - breaking of hard tissue such as bone - it was a nasty fracture | the break seems to have been caused by a fall
9 break - the occurrence of breaking - t

In [68]:
from random import sample
chosen_sentences = sample(sentences_with_break, k = 10)
my_sentences = []
for sentence in chosen_sentences:
    sentence = sentence.split(" ")
    my_sentences.append(sentence)

In [73]:
for sentence in my_sentences:
    print(" ".join(sentence))
    print()

Sometimes the old Dell computers used by the officers would break down, and I'd just have to sit there, on a metal chair, staring at George Bush and Dick Cheney's pictures hanging on the wall and fuming.

The race was also delayed on the 21st stage for almost a half hour when Eric Camilli rolled his Ford onto its roof, causing a fire to break out in the car.

When that set amount of time is over, take a 20-30 minute break to keep your mind from getting tired.

Win a fabulous UK break for two at Cliveden House Hotel Brought to you by 7 November 2016 • 7:30am Enter the prize draw before Friday 2 December and you and a friend could soon be enjoying a luxurious break at Cliveden House Hotel.

Of course, if you remember "Bizarre Love Triangle" for one thing, it's that wallop of a chorus, hopping up and down the octave: "Every time I see you falling / I get down on my knees and pray." The quasi-religious wording gives it echoes of Al Green, and the melody is classic Motown, but the imagery o

In [84]:
# пропишу ручками, какое значение в каждом предложении
my_answer = [33, 25, 5, 5, 37, 22, 13, 5, 8, 4]

### Попытка 1. При таком алгоритме Леска (пересечение предложений и определений) результат 0/10, потому что в определениях нет слов, которые встречаются в предложениях.

In [254]:
# версия с объединением дефиниций и предложений
def lesk(word, sentence):
    bestsense = 0
    maxoverlap = 0
    
    for i, synset in enumerate(wn.synsets(word)):
        definition = tokenize(synset.definition())
        definition = set(definition)
        sentence = set(sentence)
        overlap = len(definition & sentence)
        if overlap > maxoverlap:
            maxoverlap = overlap
            bestsense = i + 1 # так как я размечала по смыслам с нумерацией с 1
    return bestsense

In [255]:
index = 0
for sentence in my_sentences:
    print(" ".join(sentence), lesk("break", sentence), my_answer[index])
    index += 1
    print()

Sometimes the old Dell computers used by the officers would break down, and I'd just have to sit there, on a metal chair, staring at George Bush and Dick Cheney's pictures hanging on the wall and fuming. 52 33

The race was also delayed on the 21st stage for almost a half hour when Eric Camilli rolled his Ford onto its roof, causing a fire to break out in the car. 52 25

When that set amount of time is over, take a 20-30 minute break to keep your mind from getting tired. 7 5

Win a fabulous UK break for two at Cliveden House Hotel Brought to you by 7 November 2016 • 7:30am Enter the prize draw before Friday 2 December and you and a friend could soon be enjoying a luxurious break at Cliveden House Hotel. 52 5

Of course, if you remember "Bizarre Love Triangle" for one thing, it's that wallop of a chorus, hopping up and down the octave: "Every time I see you falling / I get down on my knees and pray." The quasi-religious wording gives it echoes of Al Green, and the melody is classic Moto

### Попытка 2. При пересечении предложений и примеров результат 2/10, хотя в примерах есть полное пересечение фразы commercial break, которое не попало в удачные - очень странно

In [256]:
# версия с объединением дефиниций и предложений
def lesk(word, sentence):
    bestsense = 0
    maxoverlap = 0
    
    for i, synset in enumerate(wn.synsets(word)):
        examples = tokenize(" ".join(synset.examples()))
        examples = set(examples)
        sentence = set(sentence)
        overlap = len(examples & sentence)
        if overlap > maxoverlap:
            maxoverlap = overlap
            bestsense = i + 1
    return bestsense

In [257]:
index = 0
for sentence in my_sentences:
    print(" ".join(sentence), lesk("break", sentence), my_answer[index])
    index += 1
    print()

Sometimes the old Dell computers used by the officers would break down, and I'd just have to sit there, on a metal chair, staring at George Bush and Dick Cheney's pictures hanging on the wall and fuming. 31 33

The race was also delayed on the 21st stage for almost a half hour when Eric Camilli rolled his Ford onto its roof, causing a fire to break out in the car. 1 25

When that set amount of time is over, take a 20-30 minute break to keep your mind from getting tired. 5 5

Win a fabulous UK break for two at Cliveden House Hotel Brought to you by 7 November 2016 • 7:30am Enter the prize draw before Friday 2 December and you and a friend could soon be enjoying a luxurious break at Cliveden House Hotel. 1 5

Of course, if you remember "Bizarre Love Triangle" for one thing, it's that wallop of a chorus, hopping up and down the octave: "Every time I see you falling / I get down on my knees and pray." The quasi-religious wording gives it echoes of Al Green, and the melody is classic Motown

### Попытка 3. Пробую нормализовать все слова в предложении и примерах, а потом применить версию Леска с пересечением предложений и примеров - результат 2/10.  При нормализации выкидываю стоп-слова. Как оказалось, функция нормализации, которую я брала из ноутбука-примера, была для русского языка - то есть нет различий с предыдущей попыткой. Ниже пробую с нормализацией для английского.

In [258]:
# версия с объединением примеров и предложений
def lesk(word, sentence):
    bestsense = 0
    maxoverlap = 0
    
    for i, synset in enumerate(wn.synsets(word)):
        examples = normalize(" ".join(synset.examples()))
        examples = set(examples)
        sentence = set(normalize(" ".join(sentence)))
        overlap = len(examples & sentence)
        if overlap > maxoverlap:
            maxoverlap = overlap
            bestsense = i + 1
    return bestsense

In [259]:
index = 0
for sentence in my_sentences:
    print(" ".join(sentence), lesk("break", sentence), my_answer[index])
    index += 1
    print()

Sometimes the old Dell computers used by the officers would break down, and I'd just have to sit there, on a metal chair, staring at George Bush and Dick Cheney's pictures hanging on the wall and fuming. 31 33

The race was also delayed on the 21st stage for almost a half hour when Eric Camilli rolled his Ford onto its roof, causing a fire to break out in the car. 1 25

When that set amount of time is over, take a 20-30 minute break to keep your mind from getting tired. 5 5

Win a fabulous UK break for two at Cliveden House Hotel Brought to you by 7 November 2016 • 7:30am Enter the prize draw before Friday 2 December and you and a friend could soon be enjoying a luxurious break at Cliveden House Hotel. 1 5

Of course, if you remember "Bizarre Love Triangle" for one thing, it's that wallop of a chorus, hopping up and down the octave: "Every time I see you falling / I get down on my knees and pray." The quasi-religious wording gives it echoes of Al Green, and the melody is classic Motown

### Попытка 4. Пересеку предложения с определениями и примерами, нормализовав слова - результат 2/10. Нормализация все так же на русском - мой промах, который я заметила только потом.

In [260]:
# версия с объединением примеров, определений и предложений
def lesk(word, sentence):
    bestsense = 0
    maxoverlap = 0
    
    for i, synset in enumerate(wn.synsets(word)):
        definition = normalize(synset.definition())
        definition = set(definition)
        examples = normalize(" ".join(synset.examples()))
        examples = set(examples)
        sentence = set(normalize(" ".join(sentence)))
        overlap = len(sentence & (examples | definition))
        if overlap > maxoverlap:
            maxoverlap = overlap
            bestsense = i + 1
    return bestsense

In [261]:
index = 0
for sentence in my_sentences:
    print(" ".join(sentence), lesk("break", sentence), my_answer[index])
    index += 1
    print()

Sometimes the old Dell computers used by the officers would break down, and I'd just have to sit there, on a metal chair, staring at George Bush and Dick Cheney's pictures hanging on the wall and fuming. 31 33

The race was also delayed on the 21st stage for almost a half hour when Eric Camilli rolled his Ford onto its roof, causing a fire to break out in the car. 52 25

When that set amount of time is over, take a 20-30 minute break to keep your mind from getting tired. 5 5

Win a fabulous UK break for two at Cliveden House Hotel Brought to you by 7 November 2016 • 7:30am Enter the prize draw before Friday 2 December and you and a friend could soon be enjoying a luxurious break at Cliveden House Hotel. 1 5

Of course, if you remember "Bizarre Love Triangle" for one thing, it's that wallop of a chorus, hopping up and down the octave: "Every time I see you falling / I get down on my knees and pray." The quasi-religious wording gives it echoes of Al Green, and the melody is classic Motow

### Попытка 5. Надо посмотреть по окну размера 3 в предложениях и примерах. Результат 1/10 (но тот, который раньше неправильно определялся) - в окне совпал контекст break up

In [262]:
def get_words_in_context(sentence, window):
    words_in_context = []
    if type(sentence) == list:
        words = sentence
    else:
        words = sentence.split(" ")
    for index, word in enumerate(words):
        if word == "break":
            words_in_context.append(word)
            for word in words[max(0, index - window): index] + words[index + 1: index + window + 1]:
                words_in_context.append(word)
    return words_in_context

In [263]:
def lesk(word, sentence):
    bestsense = 0
    maxoverlap = 0
    
    for i, synset in enumerate(wn.synsets(word)):
        examples = " ".join(synset.examples()).split()
        examples_context = set(get_words_in_context(examples, 3))
        sentence_context = set(get_words_in_context(sentence, 3))
        overlap = len(sentence_context & examples_context)
        if overlap > maxoverlap:
            maxoverlap = overlap
            bestsense = i + 1
    return bestsense

In [264]:
index = 0
for sentence in my_sentences:
    print(" ".join(sentence), lesk("break", sentence), my_answer[index])
    index += 1
    print()

Sometimes the old Dell computers used by the officers would break down, and I'd just have to sit there, on a metal chair, staring at George Bush and Dick Cheney's pictures hanging on the wall and fuming. 55 33

The race was also delayed on the 21st stage for almost a half hour when Eric Camilli rolled his Ford onto its roof, causing a fire to break out in the car. 1 25

When that set amount of time is over, take a 20-30 minute break to keep your mind from getting tired. 4 5

Win a fabulous UK break for two at Cliveden House Hotel Brought to you by 7 November 2016 • 7:30am Enter the prize draw before Friday 2 December and you and a friend could soon be enjoying a luxurious break at Cliveden House Hotel. 14 5

Of course, if you remember "Bizarre Love Triangle" for one thing, it's that wallop of a chorus, hopping up and down the octave: "Every time I see you falling / I get down on my knees and pray." The quasi-religious wording gives it echoes of Al Green, and the melody is classic Motow

### Попытка 6. Окно размера 4 - результат 1/10. Все тот же break up из примера.

In [265]:
def lesk(word, sentence):
    bestsense = 0
    maxoverlap = 0
    
    for i, synset in enumerate(wn.synsets(word)):
        examples = " ".join(synset.examples())
        examples_context = set(get_words_in_context(examples, 4))
        sentence_context = set(get_words_in_context(sentence, 4))
        overlap = len(sentence_context & examples_context)
        if overlap > maxoverlap:
            maxoverlap = overlap
            bestsense = i + 1
    return bestsense

In [266]:
index = 0
for sentence in my_sentences:
    print(" ".join(sentence), lesk("break", sentence), my_answer[index])
    index += 1
    print()

Sometimes the old Dell computers used by the officers would break down, and I'd just have to sit there, on a metal chair, staring at George Bush and Dick Cheney's pictures hanging on the wall and fuming. 55 33

The race was also delayed on the 21st stage for almost a half hour when Eric Camilli rolled his Ford onto its roof, causing a fire to break out in the car. 1 25

When that set amount of time is over, take a 20-30 minute break to keep your mind from getting tired. 4 5

Win a fabulous UK break for two at Cliveden House Hotel Brought to you by 7 November 2016 • 7:30am Enter the prize draw before Friday 2 December and you and a friend could soon be enjoying a luxurious break at Cliveden House Hotel. 14 5

Of course, if you remember "Bizarre Love Triangle" for one thing, it's that wallop of a chorus, hopping up and down the octave: "Every time I see you falling / I get down on my knees and pray." The quasi-religious wording gives it echoes of Al Green, and the melody is classic Motow

### Попытка 7. Не буду в нормализации выкидывать стоп-слова (я все еще не заметила, что нормализатор для русского). Окно размера 4. Результат 1/10

In [300]:
# поправлю нормализацию - не буду выкидывать стоп-слова
def normalise(words):
    words = [morph.parse(word)[0].normal_form for word in words]

    return words

In [336]:
morph.parse('finished')

[Parse(word='finished', tag=OpencorporaTag('LATN'), normal_form='finished', score=1.0, methods_stack=((<LatinAnalyzer>, 'finished'),))]

In [343]:
def lesk(word, sentence):
    bestsense = 0
    maxoverlap = 0
    
    for i, synset in enumerate(wn.synsets(word)):
        examples = " ".join(synset.examples())
        examples_context = set(normalise(get_words_in_context(examples, 4)))
        sentence_context = set(normalise(get_words_in_context(sentence, 4)))
        overlap = len(sentence_context & examples_context)
        if overlap > maxoverlap:
            maxoverlap = overlap
            bestsense = i + 1
    return bestsense

In [344]:
index = 0
for sentence in my_sentences:
    print(" ".join(sentence), lesk("break", sentence), my_answer[index])
    index += 1
    print()

Sometimes the old Dell computers used by the officers would break down, and I'd just have to sit there, on a metal chair, staring at George Bush and Dick Cheney's pictures hanging on the wall and fuming. 55 33

The race was also delayed on the 21st stage for almost a half hour when Eric Camilli rolled his Ford onto its roof, causing a fire to break out in the car. 1 25

When that set amount of time is over, take a 20-30 minute break to keep your mind from getting tired. 4 5

Win a fabulous UK break for two at Cliveden House Hotel Brought to you by 7 November 2016 • 7:30am Enter the prize draw before Friday 2 December and you and a friend could soon be enjoying a luxurious break at Cliveden House Hotel. 14 5

Of course, if you remember "Bizarre Love Triangle" for one thing, it's that wallop of a chorus, hopping up and down the octave: "Every time I see you falling / I get down on my knees and pray." The quasi-religious wording gives it echoes of Al Green, and the melody is classic Motow

### Попытка 8. Вариант с коэффициентами. Уже используется нормализатор для английского (чуть ниже написан). Результат 1/10

In [294]:
from collections import Counter

In [361]:
word_counter = Counter()
for el in wn.synsets('break'):
    word_counter += Counter(normaliser(el.definition().split()))
    for ex in el.examples():
        word_counter += Counter(normaliser(ex.split()))
#word_counter['break'] = 0
#word_counter += Counter()

In [362]:
word_counter['break']

84

In [363]:
def lesk(word, sentence):
    bestsense = 0
    maxoverlap = 0
    overlap = 0
    for i, synset in enumerate(wn.synsets(word)):
        overlap = 0
        examples = " ".join(synset.examples())
        examples_context = set(normalise(get_words_in_context(examples, 4)))
        
        sentence_context = set(normalise(get_words_in_context(sentence, 4)))
        
        for el in sentence_context:
            if el != 'break':
                if el in examples_context:
                    try:
                        if word_counter[el] > 1:
                            overlap += 1 / word_counter[el]
                    except:
                        print(el)
        # overlap = len(sentence_context & examples_context)
        
        if overlap > maxoverlap:
            maxoverlap = overlap
            bestsense = i + 1
    return bestsense

In [364]:
index = 0
for sentence in my_sentences:
    print(" ".join(sentence), lesk("break", sentence), my_answer[index])
    index += 1
    print()

Sometimes the old Dell computers used by the officers would break down, and I'd just have to sit there, on a metal chair, staring at George Bush and Dick Cheney's pictures hanging on the wall and fuming. 55 33

The race was also delayed on the 21st stage for almost a half hour when Eric Camilli rolled his Ford onto its roof, causing a fire to break out in the car. 5 25

When that set amount of time is over, take a 20-30 minute break to keep your mind from getting tired. 4 5

Win a fabulous UK break for two at Cliveden House Hotel Brought to you by 7 November 2016 • 7:30am Enter the prize draw before Friday 2 December and you and a friend could soon be enjoying a luxurious break at Cliveden House Hotel. 14 5

Of course, if you remember "Bizarre Love Triangle" for one thing, it's that wallop of a chorus, hopping up and down the octave: "Every time I see you falling / I get down on my knees and pray." The quasi-religious wording gives it echoes of Al Green, and the melody is classic Motow

### Вот он, нормализатор для английского!

In [367]:
import spacy
nlp = spacy.load('en')

In [368]:
def normaliser(words):
    words = [token.lemma_ for token in nlp(" ".join(words))]
    return words

In [398]:
normaliser(["finished,", "minutes", '23'])

['finish', ',', 'minute', '23']

In [399]:
word_counter

Counter({'some': 2,
         'abrupt': 2,
         'occurrence': 2,
         'that': 5,
         'interrupt': 6,
         'an': 11,
         'ongoing': 1,
         'activity': 2,
         'the': 116,
         'telephone': 1,
         'be': 34,
         'annoying': 1,
         'interruption': 1,
         'there': 4,
         'a': 51,
         'break': 84,
         'in': 37,
         'action': 3,
         'when': 5,
         'player': 1,
         'hurt': 1,
         'unexpected': 1,
         'piece': 5,
         'of': 35,
         'good': 1,
         'luck': 1,
         '-PRON-': 68,
         'finally': 8,
         'get': 1,
         'big': 1,
         '(': 6,
         'geology': 1,
         ')': 6,
         'crack': 4,
         'earth': 2,
         "'s": 6,
         'crust': 2,
         'result': 1,
         'from': 11,
         'displacement': 1,
         'one': 3,
         'side': 1,
         'with': 4,
         'respect': 1,
         'to': 30,
         'other': 1,
         'build': 1

### Попытка 9. Нормализация для английского, окно по предложениям размера 6, а для пересечения - примеры и определения. Результат - 2/10, наконец-то определился правильно пример с commercial break.

In [456]:
def lesk(word, sentence):
    bestsense = 0
    maxoverlap = 0
    overlap = 0
    secmax = 0
    secmaxi = 0
    for i, synset in enumerate(wn.synsets(word)):
        overlap = 0
        
        examples = " ".join(synset.examples())
        #examples_context = set(normaliser(get_words_in_context(examples, 4)))
        
        examples_context = set(normaliser(examples.split()) + normaliser(synset.definition().split()))
        sentence_context = set(normaliser(get_words_in_context(sentence, 6)))

        for el in sentence_context:
            if el != 'break' and el != '-PRON-':
                if el in examples_context:
                    try:
                        if word_counter[el] > 0 :
                            overlap += 1 / word_counter[el]
                          #  print(el, end=' ')
                    except:
                        print(el)

    
        if overlap > maxoverlap:
            secmax = overlap
            secmaxi = bestsense
            
            maxoverlap = overlap
            bestsense = i + 1
            
        elif overlap > secmax:
            secmax = overlap
            secmaxi = i + 1
            
    print(secmaxi)
            
    return bestsense

In [457]:
index = 0
for sentence in my_sentences:
    print(" ".join(sentence), lesk("break", sentence), my_answer[index])
    index += 1
    print()

31
Sometimes the old Dell computers used by the officers would break down, and I'd just have to sit there, on a metal chair, staring at George Bush and Dick Cheney's pictures hanging on the wall and fuming. 54 33

5
The race was also delayed on the 21st stage for almost a half hour when Eric Camilli rolled his Ford onto its roof, causing a fire to break out in the car. 8 25

2
When that set amount of time is over, take a 20-30 minute break to keep your mind from getting tired. 31 5

10
Win a fabulous UK break for two at Cliveden House Hotel Brought to you by 7 November 2016 • 7:30am Enter the prize draw before Friday 2 December and you and a friend could soon be enjoying a luxurious break at Cliveden House Hotel. 12 5

5
Of course, if you remember "Bizarre Love Triangle" for one thing, it's that wallop of a chorus, hopping up and down the octave: "Every time I see you falling / I get down on my knees and pray." The quasi-religious wording gives it echoes of Al Green, and the melody is 

### Итого в разных комбинациях были правильно определены предложения 3, 7, 8 и 10. Странно, что ни разу не определилось правильно предложение 6 - там есть break promises, что упоминается в определении.