In [1]:
import nltk, re, requests
from urllib import request
from urllib.request import urlopen
from nltk import word_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from bs4 import BeautifulSoup
import pymorphy2

In [2]:
urls = ["https://ru.wikiquote.org/wiki/Русские_пословицы", "https://en.wikipedia.org/wiki/List_of_proverbial_phrases"]
list_of_proverbs = ''

for url in urls:
    response = requests.get(url).content
    soup = BeautifulSoup(response, "html.parser")

    for script in soup(["script", "style"]):
        script.extract()

    text = soup.get_text()

    lines = (line.strip() for line in text.splitlines())
    chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
    text = '\n'.join(chunk for chunk in chunks if chunk)
    list_of_proverbs += text + '\n'

In [3]:
def parse_headline(headline):
    stop_words_ru = stopwords.words("russian")
    stop_words_en = stopwords.words("english")
    pymorph = pymorphy2.MorphAnalyzer()
    ps = PorterStemmer()
    words = []

    for word in headline.split(u' '):
        if not re.match(u'([^а-я¨e]+)', word) and word not in stop_words_ru:
            word = pymorph.parse(word)[0].normal_form
            words.append(word)
        if not re.match(u'([^a-z]+)', word) and word not in stop_words_en:
            word = ps.stem(word)
            words.append(word)
    return words

In [4]:
def get_proverb(word):
    pymorph = pymorphy2.MorphAnalyzer()
    word = pymorph.parse(word)[0].normal_form
    proverbs = []
    lines = (line.strip() for line in list_of_proverbs.splitlines())

    for line in lines:
        word = pymorph.parse(word)[0].normal_form
        if word in line:    
            proverbs.append(line)
        
    return(proverbs)

In [5]:
def proverbs(headline):
    proverbs_list = []
    chunked_headline = parse_headline(headline)
    for c in chunked_headline:
        proverbs = get_proverb(c)
        for p in range(len(proverbs)):
            proverbs[p] = re.sub(r'\xa0—', ' -', proverbs[p])
            proverbs[p] = re.sub(r'\[.*\]', '', proverbs[p]) 
            proverbs_list.append(proverbs[p])
    return proverbs_list

In [6]:
headline = 'Появилось видео задержания устроившего стрельбу под Воронежем солдата'
for item in proverbs(headline):
    print(item)

Бояться пульки - нейти в солдаты.
Где коза прошла, там и солдат пройдёт.
Дурак любит красно, солдат любит ясно.
Знай, солдат, честь: погрелся, да и вон!
Когда солдат палки не боится, ни в строй, ни в дело не годится.
Один солдат - не полк.
Поп попа родит, солдат солдата.


In [7]:
headline = "Biden tells Americans: I implore you, wear a mask"
for item in proverbs(headline):
    print(item)

Dead men tell no tales
Every picture tells a story
Never tell tales out of school
Tell me who your friends are, and I'll tell you who you are
(Only) time will tell
You can never/never can tell
Laugh and the world laughs with you, weep and you weep alone
Better wear out than rust out.
If the shoe fits, wear it
The cobbler always wears the worst shoes
There's no need to wear a hair shirt
Uneasy lies the head that wears a crown
^ "Definition of uneasy lies the head that wears a crown | Dictionary.com". www.dictionary.com. Retrieved 28 December 2019.
