## Choose a set of words

In [1]:
from nltk.corpus import wordnet

# Vocabulary whose WordNet senses are mapped to Wikipedia pages below.
words = [
    'laptop', 'laptops', 'school', 'student', 'NLP', 'going',
    'studying', 'reading', 'book', 'books', 'programming',
    'linguistics', 'maths', 'enjoy', 'experiment', 'learning',
    'psychology', 'statistics', 'model', 'academic', 'AI',
    'research', 'computer', 'mechanics', 'robot', 'NASA',
    'philosophy', 'agent', 'strategy', 'computation', 'logic',
]

## For each synset of each word, find the corresponding Wikipedia page and print the mapping

### Get word context

In [2]:
import itertools
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string

# English stop words, stored as a set; parse_sentence drops any token found here.
stop_words = set(stopwords.words('english'))


def lists_to_list(lists):
    """Flatten one level of nesting: concatenate the inner iterables into one list."""
    flattened = []
    for inner in lists:
        flattened.extend(inner)
    return flattened


def parse_sentence(sentence):
    """Tokenize ``sentence`` and return its lower-cased content words.

    A token is dropped when its lower-cased form is an English stop word or
    when its first character is a punctuation mark.
    """
    kept = []
    for token in word_tokenize(sentence):
        lowered = token.lower()
        if lowered in stop_words:
            continue
        if token[0] in string.punctuation:
            continue
        kept.append(lowered)
    return kept

In [3]:
def synonyms(synset):
    """Return the lemma names attached to ``synset``."""
    lemma_names = synset.lemma_names()
    return lemma_names


def hypernyms(synset):
    """Return, as one flat list, the lemma names of every direct hypernym of ``synset``."""
    names_per_parent = []
    for parent in synset.hypernyms():
        names_per_parent.append(synonyms(parent))
    return lists_to_list(names_per_parent)


def hyponyms(synset):
    """Return, as one flat list, the lemma names of every direct hyponym of ``synset``."""
    names_per_child = list(map(synonyms, synset.hyponyms()))
    return lists_to_list(names_per_child)


def sisters(synset):
    """Return the lemma names of the synsets that share a parent with ``synset``.

    Only the first hypernym of ``synset`` is considered; its other hyponyms
    (excluding ``synset`` itself) are the "sisters". A synset without any
    hypernym yields an empty list.
    """
    parents = synset.hypernyms()
    if not parents:
        return []

    names_per_sister = [
        synonyms(candidate)
        for candidate in parents[0].hyponyms()
        if candidate != synset
    ]
    return lists_to_list(names_per_sister)


def gloss(synset):
    """Return the content words of the definition text of ``synset``."""
    definition_text = synset.definition()
    return parse_sentence(definition_text)

### Get Wikipedia info

In [4]:
import requests

# One HTTP session shared by every Wikipedia API request made through wiki_ctx.
r_session = requests.Session()

# MediaWiki API endpoint of the English Wikipedia.
URL = 'https://en.wikipedia.org/w/api.php'
# Base query parameters; 'titles' and 'prop' are placeholders that wiki_ctx
# fills in for each request.
PARAMS = {
    'action': 'query',
    'titles': '',
    'prop': '',
    'format': 'json'
}

In [5]:
def wiki_ctx(word, feature, timeout=10):
    """Fetch one property of the Wikipedia page titled ``word``.

    Args:
        word: Page title to look up.
        feature: MediaWiki ``prop`` to request: 'redirects', 'links' or
            'categories'.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        The list stored under ``feature`` in the page record when the API
        returned one; otherwise a one-element list holding the page record
        itself. An empty list when the response contains no page at all.

    Raises:
        requests.HTTPError: if the API answers with an error status code.
    """
    # Build the query on a copy: writing into the shared PARAMS dict would leak
    # the previous call's title/prop into any other reader of that dict.
    params = {**PARAMS, 'titles': word, 'prop': feature}

    query_response = r_session.get(url=URL, params=params, timeout=timeout)
    query_response.raise_for_status()
    json_data = query_response.json()

    # The 'query'/'pages' keys are absent when the API has nothing to report
    # (for example for an empty title), so do not index them blindly.
    pages = json_data.get('query', {}).get('pages', {})
    if not pages:
        return []

    # A single title was requested, so only the first page record is relevant.
    wikipedia_page = next(iter(pages.values()))

    if feature in wikipedia_page:
        return wikipedia_page[feature]
    return [wikipedia_page]

In [6]:
def parse_titles(features):
    """Return the distinct content words found in the 'title' of each record in ``features``."""
    unique_tokens = set()
    for record in features:
        unique_tokens.update(parse_sentence(record['title']))
    return list(unique_tokens)

## Mapping

In [7]:
def ctx_s(synset):
    """Build the bag-of-words context of a WordNet synset.

    The context gathers the lemma names of the synset, of its hypernyms,
    hyponyms and sisters, plus the content words of its gloss.

    Lemma names are lower-cased and multiword lemmas (joined by '_' in
    WordNet) are split into their individual words. The gloss tokens and the
    Wikipedia tokens this set is intersected with are lower-cased single
    words, so raw lemma names such as 'laptop_computer' or 'NASA' could never
    match them.

    Returns:
        A set of lower-cased word strings.
    """
    lemma_names = synonyms(synset) + hypernyms(synset) + hyponyms(synset) + sisters(synset)
    lemma_tokens = [
        token
        for name in lemma_names
        for token in name.lower().split('_')
        if token
    ]
    return set(lemma_tokens + gloss(synset))


def ctx_w(word):
    """Build the bag-of-words context of the Wikipedia page titled ``word``.

    The context is the set of content words appearing in the titles of the
    page's links and of its categories.
    """
    link_tokens = parse_titles(wiki_ctx(word, 'links'))
    category_tokens = parse_titles(wiki_ctx(word, 'categories'))
    return set(link_tokens) | set(category_tokens)


def score(synset, word):
    """Score how well the Wikipedia page ``word`` matches ``synset``.

    The score is the number of tokens of the synset context that also occur
    in the page context, plus one (so it is always at least 1).
    """
    sense_context = ctx_s(synset)
    page_context = ctx_w(word)

    shared = sum(1 for token in sense_context if token in page_context)
    return shared + 1

In [8]:
def print_word(word, synset, page):
    """Append one "word: definition - URL" line to ``output.txt``.

    Args:
        word: The word being mapped.
        synset: WordNet synset whose definition is written.
        page: Wikipedia page name, appended to the article base URL.
    """
    # The context manager closes the file even if the write fails. An explicit
    # UTF-8 encoding keeps non-ASCII page titles from depending on the
    # platform's default encoding.
    with open('output.txt', 'a', encoding='utf-8') as f:
        f.write(f'{word}: {synset.definition()} - https://en.wikipedia.org/wiki/{page}\n')

In [9]:
def step1(ctx, words):
    """Map the unambiguous words: one WordNet synset and one Wikipedia candidate.

    For each such word, ``ctx[word]`` is set to a one-element list holding the
    ``(synset, page)`` pair, where ``page`` is the candidate's title with
    whitespace replaced by underscores. Other words are left untouched.

    Args:
        ctx: Dict updated in place, word -> list of (synset, page) pairs.
        words: Iterable of words to examine.
    """
    for word in words:
        synsets = wordnet.synsets(word)
        if len(synsets) != 1:
            # Not exactly one sense: the word cannot qualify, so skip it
            # before paying for a network request.
            continue

        wikipedia_pages = wiki_ctx(word, 'redirects')
        if len(wikipedia_pages) != 1:
            continue

        title = wikipedia_pages[0].get('title')
        if not title:
            # A record without a title cannot be turned into a page name.
            continue

        page = '_'.join(title.split())
        ctx[word] = [(synsets[0], page)]

In [10]:
def step2(ctx, word, wikipedia_pages):
    """Try to map ``word`` by reusing a page already assigned in ``ctx``.

    Looks for a record in ``wikipedia_pages`` whose title names a page that is
    already mapped in ``ctx`` to a synset that is also a synset of ``word``.
    On the first match the pair is stored under ``ctx[word]``, written out
    with ``print_word``, and the search stops.

    Args:
        ctx: Dict updated in place, word -> list of (synset, page) pairs.
        word: The word to map.
        wikipedia_pages: List of dict records, each expected to hold a 'title'.

    Returns:
        True when a mapping was found and recorded, False otherwise.
    """
    for redirect in wikipedia_pages:
        title = redirect.get('title')
        if title is None:
            # A record without a title cannot match any page; skip it.
            continue

        # Pages are stored in ctx with whitespace replaced by underscores, so
        # the title must be normalised the same way before comparing.
        # Otherwise multiword titles never match.
        redirect_page = '_'.join(title.split())

        for synsets in ctx.values():
            for synset, page in synsets:
                if page == redirect_page and synset in wordnet.synsets(word):
                    # Returning right after the assignment keeps the dict
                    # from being resized while it is still being iterated.
                    ctx[word] = [(synset, page)]
                    print_word(word, synset, page)
                    return True
    return False

In [11]:
def step3(ctx, word, synsets, wikipedia_pages):
    """Map every synset of ``word`` to its best-scoring Wikipedia candidate.

    For each synset, the candidate with the highest ``score`` wins; on a tie
    the earliest candidate is kept. Each resulting ``(synset, page)`` pair is
    appended to ``ctx[word]`` and written out with ``print_word``. When there
    is no usable candidate the page is the empty string.

    Args:
        ctx: Dict updated in place; ``ctx[word]`` is reset to a fresh list.
        word: The word to map.
        synsets: Synsets of ``word`` to map.
        wikipedia_pages: List of dict records, each expected to hold a 'title'.
    """
    ctx[word] = []
    for synset in synsets:
        best_score, best_title = 0, ''
        for candidate in wikipedia_pages:
            title = candidate.get('title')
            if title is None:
                # A record without a title cannot be scored or linked.
                continue

            # NOTE: score() rebuilds both contexts on every call, so each
            # (synset, candidate) pair triggers new Wikipedia requests.
            candidate_score = score(synset, title)
            if candidate_score > best_score:
                best_score, best_title = candidate_score, title

        page = '_'.join(best_title.split())
        ctx[word].append((synset, page))
        print_word(word, synset, page)

In [12]:
def mapping(words):
    """Map each word's WordNet synsets to Wikipedia pages.

    ``step1`` first resolves the unambiguous words. Then, for every word in
    order: an already resolved word has its first pair written out; any other
    word is passed to ``step2`` and, if that finds nothing, to ``step3``.

    Returns:
        Dict mapping each word to a list of (synset, page) pairs.
    """
    ctx = {}
    step1(ctx, words)

    for word in words:
        if word in ctx:
            first_pair = ctx[word][0]
            print_word(word, first_pair[0], first_pair[1])
        else:
            word_synsets = wordnet.synsets(word)
            candidates = wiki_ctx(word, 'redirects')

            already_mapped = step2(ctx, word, candidates)
            if not already_mapped:
                step3(ctx, word, word_synsets, candidates)

    return ctx


# Run the whole mapping. This issues Wikipedia API requests, and print_word
# opens output.txt in append mode, so re-running this cell adds the same lines
# to the file again.
ctx = mapping(words)