In [1]:
import requests
from bs4 import BeautifulSoup
import time

In [2]:
# Start page for the crawl: the Russian Wiktionary category
# "Категория:Лингвистические термины/ru" (linguistic terms), percent-encoded.
url = "https://ru.wiktionary.org/wiki/%D0%9A%D0%B0%D1%82%D0%B5%D0%B3%D0%BE%D1%80%D0%B8%D1%8F:%D0%9B%D0%B8%D0%BD%D0%B3%D0%B2%D0%B8%D1%81%D1%82%D0%B8%D1%87%D0%B5%D1%81%D0%BA%D0%B8%D0%B5_%D1%82%D0%B5%D1%80%D0%BC%D0%B8%D0%BD%D1%8B/ru"

In [3]:
def _first_definition(term_soup):
    """Return the preferred definition text from a parsed term page, or None.

    The first ``div`` with classes ``mw-heading mw-heading4`` is assumed to be
    the "Значение" (meaning) heading, and the first ``<ol>`` after it the list
    of meanings. Each list item is cut at the '◆' marker (everything after it
    is dropped) and stripped. Meanings tagged with 'лингв.' are preferred; for
    those only the text after the tag is kept.
    """
    meaning_section = term_soup.find("div", {"class": "mw-heading mw-heading4"})
    if meaning_section is None:
        # The page has no level-4 heading at all, so there is nothing to read.
        return None

    meaning_list = meaning_section.find_next("ol")
    if meaning_list is None:
        return None

    meanings = [
        item.get_text().strip().split('◆')[0].strip()
        for item in meaning_list.find_all("li")
    ]
    meanings = [text for text in meanings if text]

    # Prefer senses explicitly marked as linguistic; drop the tag itself.
    ling_meanings = [
        text.split('лингв.')[-1].strip() for text in meanings if 'лингв.' in text
    ]
    ling_meanings = [text for text in ling_meanings if text]

    candidates = ling_meanings or meanings
    return candidates[0] if candidates else None


def get_terms_and_definitions(soup_obj, term_definitions):
    """Fetch every term linked from a category page and record its definition.

    Args:
        soup_obj: parsed category page (BeautifulSoup object).
        term_definitions: dict mapping term title -> definition text; it is
            updated in place.

    Returns:
        The same ``term_definitions`` dict that was passed in.

    Terms whose page cannot be fetched (network error or non-200 status), or
    on which no non-empty definition can be found, are skipped. A one-second
    pause follows every requested term page to avoid overloading the server.
    """
    category_groups = soup_obj.find_all("div", class_="mw-category-group")
    if not category_groups:
        # No listing on this page: nothing to add.
        return term_definitions

    # NOTE(review): only the first "mw-category-group" block is scanned, as in
    # the original code - confirm that the remaining groups are meant to be
    # ignored.
    term_links = category_groups[0].find_all("a")

    for link in term_links:
        href = link.get('href')
        if not href:
            # Anchor without a target; there is no page to fetch.
            continue

        term_url = "https://ru.wiktionary.org" + href
        term_title = link.get_text()

        try:
            term_response = requests.get(term_url, timeout=30)
        except requests.RequestException as exc:
            # One unreachable page must not abort the whole crawl.
            print('failed to fetch ' + term_url + ': ' + str(exc))
            time.sleep(1)
            continue

        if term_response.status_code == 200:
            term_soup = BeautifulSoup(term_response.content, 'html.parser')
            definition_text = _first_definition(term_soup)
            if definition_text:
                term_definitions[term_title] = definition_text

        # Pause between requests to avoid overloading the server.
        time.sleep(1)

    return term_definitions

In [4]:
def get_next_link(soup_page):
    """Return the absolute URL of the next category page, or 'last page'.

    Looks inside the first ``div`` with id ``mw-pages`` for an anchor whose
    text contains 'Следующая страница' ("next page") and that carries a
    non-empty ``href``.

    Args:
        soup_page: parsed category page (BeautifulSoup object).

    Returns:
        "https://ru.wiktionary.org" + href of the next-page anchor, or the
        sentinel string 'last page' when the page has no ``mw-pages`` section
        or no usable next-page link.
    """
    pages_sections = soup_page.find_all("div", id="mw-pages")
    if not pages_sections:
        # No member listing at all, so there is nowhere further to go.
        return 'last page'

    for anchor in pages_sections[0].find_all("a"):
        if 'Следующая страница' not in anchor.get_text():
            continue
        href = anchor.get('href')
        if href:
            return "https://ru.wiktionary.org" + href

    return 'last page'

In [5]:
def collect_wiki_data(start_url):
    """Walk a paginated category listing and collect term definitions.

    Starting at ``start_url``, each category page is downloaded, its terms are
    gathered with ``get_terms_and_definitions``, and the next page is located
    with ``get_next_link`` until that function returns 'last page'.

    The crawl stops early, keeping whatever has been collected so far, when a
    page cannot be fetched (network error or non-200 status) or when the
    pagination points back to a page that was already visited.

    Args:
        start_url: URL of the first category page.

    Returns:
        dict mapping term title -> definition text.
    """
    term_definitions = {}
    visited = set()
    next_link = start_url

    while next_link != 'last page':
        if next_link in visited:
            # Pagination loop; continuing would never terminate.
            print('stopping: already visited ' + next_link)
            break
        visited.add(next_link)

        print('crauling ' + next_link)
        try:
            response = requests.get(next_link, timeout=30)
        except requests.RequestException as exc:
            print('stopping: failed to fetch ' + next_link + ': ' + str(exc))
            break

        if response.status_code != 200:
            # Without this exit the same URL would be re-requested forever.
            print('stopping: HTTP ' + str(response.status_code) + ' for ' + next_link)
            break

        soup = BeautifulSoup(response.content, 'html.parser')
        get_terms_and_definitions(soup, term_definitions)
        next_link = get_next_link(soup)

    return term_definitions

In [6]:
# Run the crawl from the category start page and keep the resulting
# {term title: definition text} mapping. This performs live HTTP requests and
# pauses one second after each term page, so the cell is slow to execute.
term_definitions = collect_wiki_data(url)

crauling https://ru.wiktionary.org/wiki/%D0%9A%D0%B0%D1%82%D0%B5%D0%B3%D0%BE%D1%80%D0%B8%D1%8F:%D0%9B%D0%B8%D0%BD%D0%B3%D0%B2%D0%B8%D1%81%D1%82%D0%B8%D1%87%D0%B5%D1%81%D0%BA%D0%B8%D0%B5_%D1%82%D0%B5%D1%80%D0%BC%D0%B8%D0%BD%D1%8B/ru
crauling https://ru.wiktionary.org/w/index.php?title=%D0%9A%D0%B0%D1%82%D0%B5%D0%B3%D0%BE%D1%80%D0%B8%D1%8F:%D0%9B%D0%B8%D0%BD%D0%B3%D0%B2%D0%B8%D1%81%D1%82%D0%B8%D1%87%D0%B5%D1%81%D0%BA%D0%B8%D0%B5_%D1%82%D0%B5%D1%80%D0%BC%D0%B8%D0%BD%D1%8B/ru&pagefrom=%D0%B0%D1%81%D0%BF%D0%B8%D1%80%D0%B0%D1%82%0A%D0%B0%D1%81%D0%BF%D0%B8%D1%80%D0%B0%D1%82#mw-pages
crauling https://ru.wiktionary.org/w/index.php?title=%D0%9A%D0%B0%D1%82%D0%B5%D0%B3%D0%BE%D1%80%D0%B8%D1%8F:%D0%9B%D0%B8%D0%BD%D0%B3%D0%B2%D0%B8%D1%81%D1%82%D0%B8%D1%87%D0%B5%D1%81%D0%BA%D0%B8%D0%B5_%D1%82%D0%B5%D1%80%D0%BC%D0%B8%D0%BD%D1%8B/ru&pagefrom=%D0%B3%D0%B8%D0%B1%D1%80%D0%B8%D0%B4%0A%D0%B3%D0%B8%D0%B1%D1%80%D0%B8%D0%B4#mw-pages
crauling https://ru.wiktionary.org/w/index.php?title=%D0%9A%D0%B0%D1%82%D0%B5%

In [7]:
# Append one "term: ..., definition: ..." record per collected entry.
# The file is opened in append mode, so re-running this cell adds the
# records again instead of replacing the file.
with open('definitions.txt', 'a', encoding='utf-8') as out_file:
    out_file.writelines(
        ''.join(('term: ', term, ', definition: ', definition, '\n'))
        for term, definition in term_definitions.items()
    )