In [1]:
from nltk.corpus import wordnet

In [19]:
def get_synset(word):
    return wordnet.synsets(word)[0]

def get_synset_list(words):
    synsets = []
    for word in words:
        synsets += [get_synset(word)]
    return synsets

# 1. Original Lesk

In [3]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string

stop_words = set(stopwords.words('english'))

def process_gloss(gloss: str):
    gloss_list = word_tokenize(gloss)
    gloss_processed = [word.lower() for word in gloss_list 
                       if word.lower() not in stop_words and word[0] not in string.punctuation]
    return gloss_processed

In [4]:
def gloss_score(synset1: str, synset2: str):
    gloss1 = list(set(process_gloss(synset1)))
    gloss2 = list(set(process_gloss(synset2)))
    score = 0

    for word in gloss1:
        if word in gloss2:
            score += 1

    return score

In [5]:
def best_score_glosses(synset1: str, synsets2: [str], gloss_score_func = gloss_score):
    best_score = 0

    for synset2 in synsets2:
        score = gloss_score_func(synset1, synset2)

        if score > best_score:
            best_score = score

    return best_score

In [6]:
from nltk import pos_tag

def process_sentence(sentence):
    sentence_pos = pos_tag(sentence)
    return [wordnet.synsets(word[0].lower(), 'a' if word[1][0].lower() == 'j' else word[1][0].lower()) 
            for word in sentence_pos if word[0].lower() not in stop_words and word[0][0] not in string.punctuation]

In [7]:
def original_lesk(sentence: str, word: str, pos: str):
    target_synsets = wordnet.synsets(word, pos)
    sentence_processed = process_sentence(sentence)

    best_score = (-1, '')
    for synset in target_synsets:
        score = 0

        for word in sentence_processed:
            glosses = [synset.definition() for synset in word]
            score += best_score_glosses(synset.definition(), glosses)

        if score > best_score[0]:
            best_score = (score, synset)

    return best_score

In [8]:
from nltk.wsd import lesk

sentence = word_tokenize('Students enjoy going to school, studying and reading books')

simplified_lesk_sense = lesk(sentence, 'school', 'n')
print('Simplified lesk definition:', simplified_lesk_sense.definition(), sep='\n')

original_lesk_sense = original_lesk(sentence, 'school', 'n')
print('Original lesk definition:', original_lesk_sense[1].definition(), sep='\n')

Simplified lesk definition:
an educational institution's faculty and students
Original lesk definition:
a body of creative artists or writers or thinkers linked by a similar style or by similar teachers


# 2. Extended Lesk

In [9]:
# noun, verb
def hypernyms(synset):
    return synset.hypernyms()

def hyponyms(synset):
    return synset.hyponyms()

# noun
def meronyms(synset):
    return synset.substance_meronyms() + synset.part_meronyms() + synset.member_meronyms()

def holonyms(synset):
    return synset.substance_holonyms() + synset.part_holonyms() + synset.member_holonyms()

# noun, adjective
def attributes(synset):
    return synset.attributes()

# adjective
def similar_tos(synset):
    return synset.similar_tos()

In [10]:
def feature_glosses(synsets, func):
    return [feature.definition() for synset in synsets for feature in func(synset)]

In [11]:
def complex_gloss_score(synset1: str, synset2: str):
    gloss1 = process_gloss(synset1)
    gloss2 = process_gloss(synset2)

    n1 = len(gloss1)
    n2 = len(gloss2)
    score = 0

    for i in range(n1):
        max_len = 0

        for j in range(n2):
            k = 0
            while i + k < n1 and j + k < n2 and gloss1[i + k] == gloss2[j + k]:
                k += 1

            if k > max_len:
                max_len = k

        score += max_len ^ 2
        i += max(0, max_len - 1)

    return score

In [37]:
def compute_complex_score(target_gloss: str, synsets: [], feature_func: []):
    score = 0

    glosses = [synset.definition() for synset in synsets]
    score += best_score_glosses(target_gloss, glosses, complex_gloss_score)

    for func in feature_func:
        glosses = feature_glosses(synsets, func)
        score += best_score_glosses(target_gloss, glosses, complex_gloss_score)

    return score

In [54]:
def extended_lesk(sentence: str, word: str, pos: str, feature_func: []):
    target_synsets = wordnet.synsets(word, pos)
    sentence_processed = process_sentence(sentence)

    best_score = (-1, '')
    for target_synset in target_synsets:
        score = 0
        
        for synsets in sentence_processed:
            score += compute_complex_score(target_synset.definition(), synsets, feature_func)

        if score > best_score[0]:
            best_score = (score, target_synset)

    return best_score

### Print the measure for each pair of synsets with five different sets of relations taken into acount in measuring the score

In [25]:
import itertools

synset_list = get_synset_list(word_tokenize('Students enjoy going school studying reading books'))
combinations = list(itertools.combinations(synset_list, 2))

relations = [
    [hypernyms, hyponyms, meronyms, holonyms, attributes, similar_tos],
    [hypernyms, hyponyms, meronyms, holonyms],
    [hyponyms, meronyms],
    [hypernyms, holonyms],
    [hyponyms, meronyms, similar_tos]
]

In [53]:
from tabulate import tabulate

print_table = []
for synsets in combinations:
    relation_scores = [synsets[0].name() + ' - ' + synsets[1].name()]

    for relation in relations:
        relation_scores.append(compute_complex_score(synsets[0].definition(), [synsets[1]], relation))

    print_table.append(relation_scores)

print(tabulate(print_table, 
               headers=['synsets', 'all', 'hyper+hypo+mero+holo', 'hypo+mero', 'hyper+holo', 'hypo+mero+similar'], 
               tablefmt='fancy_grid'))

╒═══════════════════════════════╤═══════╤════════════════════════╤═════════════╤══════════════╤═════════════════════╕
│ synsets                       │   all │   hyper+hypo+mero+holo │   hypo+mero │   hyper+holo │   hypo+mero+similar │
╞═══════════════════════════════╪═══════╪════════════════════════╪═════════════╪══════════════╪═════════════════════╡
│ student.n.01 - enjoy.v.01     │    16 │                     16 │          16 │            8 │                  16 │
├───────────────────────────────┼───────┼────────────────────────┼─────────────┼──────────────┼─────────────────────┤
│ student.n.01 - departure.n.01 │    24 │                     24 │          16 │           16 │                  16 │
├───────────────────────────────┼───────┼────────────────────────┼─────────────┼──────────────┼─────────────────────┤
│ student.n.01 - school.n.01    │    32 │                     32 │          23 │           16 │                  23 │
├───────────────────────────────┼───────┼───────────────

### Obtain the word sense for the given text and word and print its definition

In [59]:
feature_func = [hypernyms, hyponyms, meronyms, holonyms, attributes, similar_tos]

extended_lesk_sense = extended_lesk(sentence, 'school', 'n', feature_func)
print('Extended lesk definition:', extended_lesk_sense[1].definition(), sep='\n')

Extended lesk definition:
a body of creative artists or writers or thinkers linked by a similar style or by similar teachers
