In [1]:
import pymorphy2
import nltk
import ujson as json
import matplotlib.pyplot as plt
import numpy as np
import itertools
import gzip
from nltk.stem.snowball import RussianStemmer
from datetime import datetime
from gensim.test.utils import common_texts
from gensim.corpora.dictionary import Dictionary
from gensim.models import ldamulticore
from gensim.models import LdaModel
from scipy.spatial.distance import cosine
from gensim.models import word2vec

from collections import Counter,OrderedDict

In [2]:
class Document:
    def __init__(self, init_dict):
        self.title = init_dict.get('title', '')
        self.description = init_dict.get('description', '')
        self.url = init_dict.get('url', '')
        self.site = init_dict.get('site', '')
        self.ts = datetime.fromtimestamp(init_dict['ts']) if 'ts' in init_dict else -1
    
    def __str__(self):
        res = ''
        res += 'url : %s\n' % self.url
        res += 'date : %s\n' % self.ts
        res += 'title : %s\n' % self.title
        res += 'description : %s\n' % self.description
        res += 'site : %s\n' % self.site
        return res

In [3]:
fin = gzip.open('dataset_mai.jsonl.gz')
for line in itertools.islice(fin, 10):
    data = json.loads(line.strip())
    print(Document(data))

url : http://bloknot-volzhsky.ru/news/volzhane-mogut-podat-zayavlenie-na-letnie-putevki-
date : 2019-11-30 18:26:10
title : –í–æ–ª–∂–∞–Ω–µ –º–æ–≥—É—Ç –ø–æ–¥–∞—Ç—å –∑–∞—è–≤–ª–µ–Ω–∏–µ –Ω–∞ –ª–µ—Ç–Ω–∏–µ –ø—É—Ç–µ–≤–∫–∏ –¥–ª—è –¥–µ—Ç–µ–π
description : –° –ø–æ–Ω–µ–¥–µ–ª—å–Ω–∏–∫–∞ –∑–∞—è–≤–ª–µ–Ω–∏—è –Ω–∞—á–∏–Ω–∞—é—Ç –ø—Ä–∏–Ω–∏–º–∞—Ç—å –≤ –ú–§–¶
site : bloknot-volzhsky.ru

url : https://trikky.ru/test-na-znanie-russkogo-yazyka-423354.html
date : 2019-11-30 18:26:48
title : üíó–¢–µ—Å—Ç –Ω–∞ –∑–Ω–∞–Ω–∏–µ —Ä—É—Å—Å–∫–æ–≥–æ —è–∑—ã–∫–∞üíó
description : –¢–µ—Å—Ç —Å–æ —Å–ª–æ–∂–Ω—ã–º–∏ –∏ –ª–µ–≥–∫–∏–º–∏ –≤–æ–ø—Ä–æ—Å–∞–º–∏. –î–ª—è –∫–æ–≥–æ-—Ç–æ –±—É–¥–µ—Ç –ª–µ–≥–∫–æ –Ω–∞–±—Ä–∞—Ç—å –≤—Å–µ 100 –±–∞–ª–ª–æ–≤, –∞ –∫–æ–º—É-—Ç–æ –±—É–¥–µ—Ç –Ω–µ–º–Ω–æ–≥–æ —Ç—è–∂–µ–ª–æ. –í –ª—é–±–æ–º —Å–ª—É—á–∞–µ –ø–æ–ø—Ä–æ–±–æ–≤–∞—Ç—å —Å—Ç–æ–∏—Ç.1. –ß—Ç–æ –∏–∑—É—á–∞–µ—Ç —Ñ—Ä–∞–∑–µ–æ–ª–æ–≥–∏—è? —Å–ø–æ—Å–æ–±—ã –æ–±—Ä–∞–∑–æ–≤–∞–Ω–∏—è —Å–ª–æ–≤ —É—Å—Ç–æ–π—á–∏–≤—ã–µ —Å–æ—á–µ—Ç–∞–Ω–∏—è —Å–ª–æ–≤ —á–∞—Å—Ç–∏ —Ä–µ—á–∏2. –ù–∞ –º–µ—Å—Ç–µ 

In [4]:
fin = gzip.open('dataset_mai.jsonl.gz')
dataset = []
dataset_test = []
for line in itertools.islice(fin, 10000):
    data = json.loads(line.strip())
    dataset.append(Document(data))

In [5]:
morth_analyzer = pymorphy2.MorphAnalyzer()

In [6]:
def split_words_v3(a_text):
    cur_word = ''
    prev_is_alpha = False

    for letter in a_text:
        if  (letter.isalpha() and prev_is_alpha or 
            letter.isdigit() and not prev_is_alpha):
            cur_word += letter
        elif (letter.isalpha() and not prev_is_alpha or
             letter.isdigit() and prev_is_alpha):
            if cur_word: yield cur_word
            cur_word = letter
            prev_is_alpha = not prev_is_alpha
        else:
            if cur_word: yield cur_word
            cur_word = ''
            prev_is_alpha = False
    if cur_word: yield cur_word
         


In [7]:
def get_norm_word_v3(a_word):
    MORTH_CACHE = {}
    if a_word not in MORTH_CACHE: MORTH_CACHE[a_word] = morth_analyzer.parse(a_word)[0].normal_form
    return MORTH_CACHE[a_word]

In [8]:
def get_doc_words(a_doc, a_split=split_words_v3, a_norm_word=get_norm_word_v3):
    for word in itertools.chain(a_split(a_doc.title), a_split(a_doc.description)):
        yield a_norm_word(word)

In [9]:
word2id = {}
for item in dataset:
    item.words = list(get_doc_words(item))
    for word in item.words:
        if word not in word2id: word2id[word] = len(word2id)

In [10]:
def get_vec_doc(a_doc,a_common_dictionary):
    indexs_word = a_common_dictionary.doc2idx(a_doc.words)
    result_array = np.zeros(len(a_common_dictionary))
    for i in indexs_word:
        result_array[i] += 1
    return result_array

def get_vec_w2v(a_doc,a_w2v):
    res_vec = np.zeros(len(a_w2v['bmw']))
    for word in a_doc.words:
        if word in a_w2v:
            res_vec =res_vec+a_w2v[word].copy()
    return res_vec/len(a_doc.words)

def interprecent_count(l_doc,r_doc):
    count  = 0
    res = []
    for word  in l_doc.values():
        if word in r_doc.values():
            res.append(word)
            count+=1
    return count - abs(len(l_doc) - len(r_doc))

## –î–ó - —Ä–µ–∞–ª–∏–∑–æ–≤–∞—Ç—å –ø–æ–∏—Å–∫ –ø–æ—Ö–æ–∂–∏—Ö –¥–æ–∫—É–º–µ–Ω—Ç–æ–≤ –ø–æ —Ç–µ–∫—Å—Ç–æ–≤—ã–º –≤–µ–∫—Ç–æ—Ä–∞–º –∏ –ø–æ word2vec –≤–µ–∫—Ç–æ—Ä–∞–º


In [11]:
import operator
def get_word_match_most_similar_docs(a_doc, a_dataset, a_top_n=10):
    print('get word match most similar docs')
    a_doc_dict= Dictionary([a_doc.words])
    a_doc_index = a_dataset.index(a_doc)
    res = {}
    print(f'original: \n{a_doc}')
    for i, doc in enumerate(a_dataset):
        if i == a_doc_index:
            continue
        doc_dict = Dictionary([doc.words])
        res[i] = interprecent_count(a_doc_dict,doc_dict)    
    res =  sorted(res.items(), key=operator.itemgetter(1),reverse=True)
    print('result doc:')
    for index, c in res[:a_top_n]:
        print(dataset[index])
        
def get_tf_idf_most_similar_doc(a_doc, a_dataset, a_top_n=10):
    # –¥–ª—è –∫–∞–∂–¥–æ–≥–æ –¥–æ–∫—É–º–µ–Ω—Ç–∞ —Å—Ç—Ä–æ–∏—Ç—Å—è –≤–µ–∫—Ç–æ—Ä —Ä–∞–∑–º–µ—Ä–Ω–æ—Å—Ç–∏ —Å–ª–æ–≤–∞—Ä—è (–∞–Ω–∞–ª–æ–≥–∏—á–Ω–æ random forest) –∏ –¥–æ–±–∞–≤–ª—è–µ—Ç—Å—è idf
    # –¥–∞–ª–µ–µ cosine –º–µ–∂–¥—É –¥–æ–∫—É–º–µ–Ω—Ç–∞–º–∏
    print('get tf idf most similar doc')
    print(f'original: \n{a_doc}')
    common_dictionary = Dictionary(item.words for item in a_dataset)
    a_doc_index = a_dataset.index(a_doc)
    a_doc_vec = get_vec_doc(a_doc=a_doc,a_common_dictionary=common_dictionary)
    cosin_res = {}    
    for i, doc in enumerate(a_dataset):
        if i == a_doc_index:
            continue
        doc_vec_sim = get_vec_doc(doc,common_dictionary)
        cos_prom = cosine(a_doc_vec,doc_vec_sim)
        cosin_res[i] = cos_prom
    cosin_res = sorted(cosin_res.items(), key=operator.itemgetter(1))
    print('result:')
    for index, c in cosin_res[:a_top_n]:
        print(dataset[index])

        
def get_w2v_most_similar_doc(a_doc, a_dataset, a_top_n=10):
    # —Å—á–∏—Ç–∞–µ—Ç—Å—è —Å—Ä–µ–¥–Ω–∏–π –≤–µ–∫—Ç–æ—Ä –ø–æ –≤—Å–µ–º —Å–ª–æ–≤–∞–º (–º–æ–∂–Ω–æ –ø—Ä–∏ —É—Å—Ä–µ–¥–Ω–µ–Ω–∏–∏ —É—á–∏—Ç—ã–≤–∞—Ç—å idf)
    print('get w2v most similar doc')
    print(f'original: \n{a_doc}')
    w2v = word2vec.Word2Vec([item.words for item in a_dataset], workers=4)
    a_doc_vec = get_vec_w2v(a_doc,w2v)
    a_doc_index = a_dataset.index(a_doc)
    cosin_res = {}
    for i, doc in enumerate(a_dataset):
        if i == a_doc_index:
            continue 
        doc_vec_sim = get_vec_w2v(doc,w2v)
        cos_prom = cosine(a_doc_vec,doc_vec_sim)
        cosin_res[i] = cos_prom
    cosin_res = sorted(cosin_res.items(), key=operator.itemgetter(1))
    print('result:')
    for index, c in cosin_res[:a_top_n]:
        print(dataset[index])

## –ü—Ä–∏–º–µ—Ä —Ñ–æ—Ä–º–∞—Ç–∞ –≤—ã–¥–∞—á–∏

#### –¢–µ—Å—Ç–æ–≤—ã–µ –¥–∞–Ω–Ω—ã–µ

In [12]:
doc_id = 13 #dota2
doc_id = 1946 #–≥–æ—Ä–æ—Å–∫–æ–ø
doc_id = 3388 #—Ö–æ–∫–∫–µ–π
doc_id = 7601 #—Ç–µ–ª–µ—Ñ–æ–Ω—ã

In [13]:
doc_id = 13 
for similart_func in [get_word_match_most_similar_docs,  get_tf_idf_most_similar_doc, get_w2v_most_similar_doc]:
    similart_func(dataset[doc_id], dataset)

get word match most similar docs
original: 
url : https://cyber.sports.ru/dota2/1080755627.html
date : 2019-11-30 18:26:39
title : –†–µ–∑—É–ª—å—Ç–∞—Ç—ã Parimatch League Dota 2. Virtus.pro –ø–æ–±–µ–¥–∏–ª–∞
description : 30 –Ω–æ—è–±—Ä—è –∑–∞–≤–µ—Ä—à–∏–ª—Å—è —Ç—É—Ä–Ω–∏—Ä¬†Parimatch League. –í —Ñ–∏–Ω–∞–ª–µ Virtus.pro —Ä–∞–∑–≥—Ä–æ–º–∏–ª–∞¬†HellRaisers¬†—Å–æ —Å—á–µ—Ç–æ–º 3:0 –∏ –∑–∞—Ä–∞–±–æ—Ç–∞–ª–∞ 40 —Ç—ã—Å—è—á –¥–æ–ª–ª–∞—Ä–æ–≤. –õ–∞–Ω-—Ñ–∏–Ω–∞–ª Parimatch League –ø—Ä–æ—à–µ–ª —Å 28 –ø–æ 30 –Ω–æ—è–±—Ä—è –≤ –ú–æ—Å–∫–≤–µ. 4 –∫–æ–º–∞–Ω–¥—ã —Ä–∞–∑—ã–≥—Ä–∞–ª–∏ 70 —Ç—ã—Å—è—á –¥–æ–ª–ª–∞—Ä–æ–≤. –†–µ–∑—É–ª—å—Ç–∞—Ç—ã –∫–æ–º–∞–Ω–¥ 1. Virtus.pro2.
site : cyber.sports.ru

result doc:
url : https://cyber.sports.ru/dota2/1080757051.html
date : 2019-11-30 18:37:09
title : –ü—Ä–∏–∑–æ–≤–æ–π —Ñ–æ–Ω–¥ Parimatch League Dota 2
description : 30 –Ω–æ—è–±—Ä—è –≤ –ú–æ—Å–∫–≤–µ –∑–∞–≤–µ—Ä—à–∏–ª—Å—è —Ç—É—Ä–Ω–∏—Ä¬†Parimatch League. –ü—Ä–∏–∑–æ–≤–æ–π —Ñ–æ–Ω–¥ —Ç—É—Ä–Ω–∏—Ä–∞ ‚Äì 70 —Ç—ã—Å—è—á –¥–æ–ª–ª–∞—Ä–æ–≤. –í —Ñ–∏–Ω–∞–ª

  if __name__ == '__main__':
  # This is added back by InteractiveShellApp.init_path()
  if sys.path[0] == '':
  dist = 1.0 - uv / np.sqrt(uu * vv)


result:
url : https://cyber.sports.ru/dota2/1080757051.html
date : 2019-11-30 18:37:09
title : –ü—Ä–∏–∑–æ–≤–æ–π —Ñ–æ–Ω–¥ Parimatch League Dota 2
description : 30 –Ω–æ—è–±—Ä—è –≤ –ú–æ—Å–∫–≤–µ –∑–∞–≤–µ—Ä—à–∏–ª—Å—è —Ç—É—Ä–Ω–∏—Ä¬†Parimatch League. –ü—Ä–∏–∑–æ–≤–æ–π —Ñ–æ–Ω–¥ —Ç—É—Ä–Ω–∏—Ä–∞ ‚Äì 70 —Ç—ã—Å—è—á –¥–æ–ª–ª–∞—Ä–æ–≤. –í —Ñ–∏–Ω–∞–ª–µ¬†Virtus.pro¬†—Ä–∞–∑–≥—Ä–æ–º–∏–ª–∞¬†HellRaisers¬†—Å–æ —Å—á–µ—Ç–æ–º 3:0 –∏ –∑–∞—Ä–∞–±–æ—Ç–∞–ª–∞ 40 —Ç—ã—Å—è—á –¥–æ–ª–ª–∞—Ä–æ–≤. –†–∞—Å–ø—Ä–µ–¥–µ–ª–µ–Ω–∏–µ –ø—Ä–∏–∑–æ–≤—ã—Ö 1. Virtus.pro¬†‚Äì $40 0002. HellRaisers ‚Äì $20 0003.
site : cyber.sports.ru

url : https://cyber.sports.ru/dota2/1080752587.html
date : 2019-11-30 16:57:20
title : Virtus.pro –≤–µ–¥–µ—Ç —Å–æ —Å—á–µ—Ç–æ–º 2:0 –ø–æ –∫–∞—Ä—Ç–∞–º¬†–≤ —Ñ–∏–Ω–∞–ª–µ –ª–∏–≥–∏ Parimatch
description : Virtus.pro —Ä–∞–∑–≥—Ä–æ–º–∏–ª–∞ HellRaisers –Ω–∞ –≤—Ç–æ—Ä–æ–π –∫–∞—Ä—Ç–µ –≥—Ä–∞–Ω–¥-—Ñ–∏–Ω–∞–ª–∞ Parimatch League. –ú–∞—Ç—á –∑–∞–∫–æ–Ω—á–∏–ª—Å—è —Å–æ —Å—á–µ—Ç–æ–º 29:23. Virtus.pro –≤–µ–¥–µ—Ç —Å–æ —Å—á–µ—Ç–æ–º 2-0. 

In [14]:
doc_id = 1946 
for similart_func in [get_word_match_most_similar_docs,  get_tf_idf_most_similar_doc, get_w2v_most_similar_doc]:
    similart_func(dataset[doc_id], dataset)

get word match most similar docs
original: 
url : https://www.obozrevatel.com/astro/news/goroskop-na-1-dekabrya-chto-zhdet-lvov-rakov-dev-i-drugie-znaki-zodiaka.htm
date : 2019-11-30 17:07:11
title : –ì–æ—Ä–æ—Å–∫–æ–ø –Ω–∞ 1 –¥–µ–∫–∞–±—Ä—è: —á—Ç–æ –∂–¥–µ—Ç –õ—å–≤–æ–≤, –†–∞–∫–æ–≤, –î–µ–≤ –∏ –¥—Ä—É–≥–∏–µ –∑–Ω–∞–∫–∏ –∑–æ–¥–∏–∞–∫–∞
description : –°–æ–≤–µ—Ç—ã –∞—Å—Ç—Ä–æ–ª–æ–≥–æ–≤ –Ω–∞ –≤–æ—Å–∫—Ä–µ—Å–µ–Ω—å–µ
site : obozrevatel.com

result doc:
url : https://prochepetsk.ru/news/19614
date : 2019-11-30 15:02:54
title : –†–æ–º–∞–Ω—Ç–∏—á–Ω—ã–π —É–∂–∏–Ω —É –†–∞–∫–æ–≤ –∏ —à–æ–ø–∏–Ω–≥ —É –í–æ–¥–æ–ª–µ–µ–≤: –≥–æ—Ä–æ—Å–∫–æ–ø –¥–ª—è –≤—Å–µ—Ö –∑–Ω–∞–∫–æ–≤ –∑–æ–¥–∏–∞–∫–∞
description : –ì–æ—Ä–æ—Å–∫–æ–ø –Ω–∞ 30 –Ω–æ—è–±—Ä—è –∏ 1 –¥–µ–∫–∞–±—Ä—è –¥–ª—è –≤—Å–µ—Ö –∑–Ω–∞–∫–æ–≤ –∑–æ–¥–∏–∞–∫–∞
site : prochepetsk.ru

url : https://www.gs.by/2019/11/30/goroskop-na-2020-god-beloj-krysy-ovnam-i-rakam-nuzhno-byt-nacheku/
date : 2019-11-30 16:41:48
title : –ì–æ—Ä–æ—Å–∫–æ–ø –Ω–∞ 2020 –≥–æ–¥ –ë–µ–ª–æ–π –ö—Ä—ã—Å—ã: –û–≤–Ω–

  if __name__ == '__main__':
  # This is added back by InteractiveShellApp.init_path()
  if sys.path[0] == '':


result:
url : https://www.obozrevatel.com/show/plemennoj-zherebets-gena-viter-otkrovenno-udivil-poklonnits-na-krasnoj-dorozhke-premii-m1.htm
date : 2019-11-30 18:02:08
title : –ü–ª–µ–º–µ–Ω–Ω–æ–π –∂–µ—Ä–µ–±–µ—Ü: –ì–µ–Ω–∞ –í–∏—Ç–µ—Ä –æ—Ç–∫—Ä–æ–≤–µ–Ω–Ω–æ —É–¥–∏–≤–∏–ª –ø–æ–∫–ª–æ–Ω–Ω–∏—Ü –Ω–∞ –∫—Ä–∞—Å–Ω–æ–π –¥–æ—Ä–æ–∂–∫–µ –ø—Ä–µ–º–∏–∏ –ú1
description : Gena VITER —Ä–µ—à–∏–ª –ø—Ä–∏–≤–ª–µ—á—å –≤–Ω–∏–º–∞–Ω–∏–µ
site : obozrevatel.com

url : https://megaobzor.com/Xiaomi-Mi-Mix-3-5G-i-ego-funkcii.html
date : 2019-11-30 19:26:42
title : Xiaomi Mi Mix 3 5G –∏ –µ–≥–æ —Ñ—É–Ω–∫—Ü–∏–∏
description : –ö–æ–º–ø–∞–Ω–∏—è Xiaomi –≤—ã–ø—É—Å—Ç–∏–ª–∞ —Ñ–ª–∞–≥–º–∞–Ω—Å–∫—É—é –Ω–æ–≤–∏–Ω–∫—É –ø–æ–¥ –Ω–∞–∑–≤–∞–Ω–∏–µ–º Xiaomi Mi Mix 3 5G. –í —Ö–∞—Ä–∞–∫—Ç–µ—Ä–∏—Å—Ç–∏–∫–∏ —Å–º–∞—Ä—Ç—Ñ–æ–Ω–∞ –≤—Ö–æ–¥–∏—Ç –æ–ø–µ—Ä–∞—Ç–∏–≤–Ω–∞—è –ø–∞–º—è—Ç—å –Ω–∞ 6 –ì–±–∞–π—Ç, —Ñ–ª—ç—à –Ω–∞–∫–æ–ø–∏—Ç–µ–ª—å –Ω–∞ 64 –ì–±–∞–π—Ç. –¢–∞–∫–∂–µ –Ω–æ–≤–∏–Ω–∫–∞ –∏–º–µ–µ—Ç —ç–∫—Ä–∞–Ω —Å –¥–∏–∞–≥–æ–Ω–∞–ª—å—é 6,39 –¥—é–π–º–∞ –æ–±–ª–∞–¥–∞–µ—Ç —Ä–∞–∑—Ä–µ

In [15]:
doc_id = 3388 
for similart_func in [get_word_match_most_similar_docs,  get_tf_idf_most_similar_doc, get_w2v_most_similar_doc]:
    similart_func(dataset[doc_id], dataset)

get word match most similar docs
original: 
url : https://www.tv21.ru/news/2019/11/30/khokkeisty-murmana-proveli-pervyy-domashniy-match-na-stadione-stroitel
date : 2019-11-30 15:31:47
title : –•–æ–∫–∫–µ–∏—Å—Ç—ã "–ú—É—Ä–º–∞–Ω–∞" –ø—Ä–æ–≤–µ–ª–∏ –ø–µ—Ä–≤—ã–π –¥–æ–º–∞—à–Ω–∏–π –º–∞—Ç—á –Ω–∞ —Å—Ç–∞–¥–∏–æ–Ω–µ "–°—Ç—Ä–æ–∏—Ç–µ–ª—å"
description : –ò–≥—Ä–∞ —Å–æ—Å—Ç–æ—è–ª–∞—Å—å –≤ —Ä–∞–º–∫–∞—Ö –æ—Ç–∫—Ä—ã—Ç–∏—è –Ω–æ–≤–æ–≥–æ —Å–µ–∑–æ–Ω–∞ –°—É–ø–µ—Ä–ª–∏–≥–∏ –ø–æ —Ö–æ–∫–∫–µ—é —Å –º—è—á–æ–º.
site : tv21.ru

result doc:
url : https://severpost.ru/read/87561
date : 2019-11-30 17:32:40
title : –ü–µ—Ä–≤—ã–π –¥–æ–º–∞—à–Ω–∏–π –º–∞—Ç—á ¬´–ú—É—Ä–º–∞–Ω¬ª —Å—ã–≥—Ä–∞–ª –≤–Ω–∏—á—å—é
description : –í —Å–≤–æ–µ–º –ø–µ—Ä–≤–æ–º –¥–æ–º–∞—à–Ω–µ–º –º–∞—Ç—á–µ ¬´–ú—É—Ä–º–∞–Ω¬ª –Ω–µ —Å–º–æ–≥ –æ–¥–æ–ª–µ—Ç—å ¬´–í–æ–ª–≥—É¬ª. –ò–≥—Ä–∞ –ø—Ä–æ—Ö–æ–¥–∏–ª–∞ –Ω–∞ —Å—Ç–∞–¥–∏–æ–Ω–µ ¬´–°—Ç—Ä–æ–∏—Ç–µ–ª—å¬ª. –í –ø–µ—Ä–≤–æ–º —Ç–∞–π–º–µ –º—É—Ä–º–∞–Ω—á–∞–Ω–µ —É—Å—Ç—É–ø–∏–ª–∏ —Ö–æ–∫–∫–µ–∏—Å—Ç–∞–º –∏–∑...
site : severpost.ru

url : https://spo

  if __name__ == '__main__':
  # This is added back by InteractiveShellApp.init_path()
  if sys.path[0] == '':


result:
url : https://lenta.ru/news/2019/11/30/citymanchester/
date : 2019-11-30 17:37:16
title : –ì–æ–ª –º–æ—â–Ω—ã–º —É–¥–∞—Ä–æ–º –∏–∑-–∑–∞ —à—Ç—Ä–∞—Ñ–Ω–æ–π –ø–æ–º–æ–≥ ¬´–ú–∞–Ω—á–µ—Å—Ç–µ—Ä –°–∏—Ç–∏¬ª –¥–æ–±—ã—Ç—å –Ω–∏—á—å—é –≤ –∏–≥—Ä–µ –ê–ü–õ
description : ¬´–ú–∞–Ω—á–µ—Å—Ç–µ—Ä –°–∏—Ç–∏¬ª –Ω–∞ –≤—ã–µ–∑–¥–µ —Å—ã–≥—Ä–∞–ª –≤–Ω–∏—á—å—é —Å ¬´–ù—å—é–∫–∞—Å–ª–æ–º¬ª –≤ –º–∞—Ç—á–µ 14-–≥–æ —Ç—É—Ä–∞ –ê–Ω–≥–ª–∏–π—Å–∫–æ–π –ø—Ä–µ–º—å–µ—Ä-–ª–∏–≥–∏ (–ê–ü–õ). –í—Å—Ç—Ä–µ—á–∞ –ø—Ä–æ—à–ª–∞ –≤ —Å—É–±–±–æ—Ç—É, 30 –Ω–æ—è–±—Ä—è, –∏ –∑–∞–≤–µ—Ä—à–∏–ª–∞—Å—å —Å–æ —Å—á–µ—Ç–æ–º 2:2. –í —Å–æ—Å—Ç–∞–≤–µ ¬´–≥–æ—Ä–æ–∂–∞–Ω¬ª –≤ –ø–µ—Ä–≤–æ–º —Ç–∞–π–º–µ –æ—Ç–ª–∏—á–∏–ª—Å—è –†–∞—Ö–∏–º –°—Ç–µ—Ä–ª–∏–Ω–≥. –ï—â–µ –æ–¥–∏–Ω –º—è—á –º–æ—â–Ω—ã–º —É–¥–∞—Ä–æ–º –∏–∑-–∑–∞ –ø—Ä–µ–¥–µ–ª–æ–≤ —à—Ç—Ä–∞—Ñ–Ω–æ–π –∑–∞–±–∏–ª –ö–µ–≤–∏–Ω –¥–µ –ë—Ä–µ–π–Ω–µ.
site : lenta.ru

url : http://terrikon.com/posts/341367
date : 2019-11-30 19:07:42
title : 15-–π —Ç—É—Ä —á–µ–º–ø–∏–æ–Ω–∞—Ç–∞ –§—Ä–∞–Ω—Ü–∏–∏: —Å–º–æ—Ç—Ä–µ—Ç—å –æ–Ω–ª–∞–π–Ω-–≤–∏–¥–µ–æ—Ç—Ä–∞–Ω—Å–ª—è

In [16]:
doc_id = 7601
for similart_func in [get_word_match_most_similar_docs,  get_tf_idf_most_similar_doc, get_w2v_most_similar_doc]:
    similart_func(dataset[doc_id], dataset)

get word match most similar docs
original: 
url : https://megaobzor.com/Stala-izvestna-cena-smartfona-Redmi-K30.html
date : 2019-11-30 12:12:16
title : –°—Ç–∞–ª–∞ –∏–∑–≤–µ—Å—Ç–Ω–∞ —Ü–µ–Ω–∞ —Å–º–∞—Ä—Ç—Ñ–æ–Ω–∞ Redmi K30
description : –ê–≤—Ç–æ—Ä–∏—Ç–µ—Ç–Ω—ã–π –∏—Å–∫–∞—Ç–µ–ª—å —É—Ç–µ—á–µ–∫ –ú—É–∫—É–ª –®–∞—Ä–º–∞ –ø–æ–¥–µ–ª–∏–ª—Å—è –ø–æ–¥—Ä–æ–±–Ω–æ—Å—Ç—è–º–∏ –æ —Ü–µ–Ω–µ —Å–º–∞—Ä—Ç—Ñ–æ–Ω–∞ Redmi K30, –æ—Ñ–∏—Ü–∏–∞–ª—å–Ω—ã–π –∞–Ω–æ–Ω—Å –∫–æ—Ç–æ—Ä–æ–≥–æ —Å–æ—Å—Ç–æ–∏—Ç—Å—è —É–∂–µ 10 –¥–µ–∫–∞–±—Ä—è. –ï—Å–ª–∏ –≤–µ—Ä–∏—Ç—å –∏—Å—Ç–æ—á–Ω–∏–∫—É, –∞–ø–ø–∞—Ä–∞—Ç –æ–±–æ–π–¥–µ—Ç—Å—è –≤ 327 –¥–æ–ª–ª–∞—Ä–æ–≤, —á—Ç–æ –Ω–∞–º–Ω–æ–≥–æ –±–æ–ª—å—à–µ 285 –¥–æ–ª–ª–∞—Ä–æ–≤, –∫–æ—Ç–æ—Ä—ã–µ –µ–º—É –ø—Ä–∏–ø–∏—Å—ã–≤–∞–ª–∏ —Ä–∞–Ω–µ–µ. –ü–æ –ø—Ä–µ–¥–≤–∞—Ä–∏—Ç–µ–ª—å–Ω—ã–º –¥–∞–Ω–Ω—ã–º, Redmi K30 –ø–æ–ª—É—á–∏—Ç –∞–∫–∫—É–º—É–ª—è—Ç–æ—Ä —ë–º–∫–æ—Å—Ç—å—é 5000 –º–ê—á, –∫–≤–∞–¥—Ä–æ–∫–∞–º–µ—Ä—É —Å –≥–ª–∞–≤–Ω—ã–º –¥–∞—Ç—á–∏–∫–æ–º –∏–∑–æ–±—Ä–∞–∂–µ–Ω–∏—è —Ä–∞–∑—Ä–µ—à–µ–Ω–∏–µ–º 60 –ú–ø, –¥–∏—Å–ø–ª–µ–π —Å —á–∞—Å—Ç–æ—Ç–æ–π –æ–±–Ω–æ–≤–ª–µ

result:
url : https://megaobzor.com/Pervii-kachestvennii-render-smartfona-Redmi-K30.html
date : 2019-11-30 10:34:36
title : –ü–µ—Ä–≤—ã–π –∫–∞—á–µ—Å—Ç–≤–µ–Ω–Ω—ã–π —Ä–µ–Ω–¥–µ—Ä —Å–º–∞—Ä—Ç—Ñ–æ–Ω–∞ Redmi K30
description : –°–µ–≥–æ–¥–Ω—è –≤ —Å–µ—Ç—å –±—ã–ª–∞ –æ–ø—É–±–ª–∏–∫–æ–≤–∞–Ω–∞ –ø–µ—Ä–≤–∞—è –æ—Ñ–∏—Ü–∏–∞–ª—å–Ω–∞—è —Ñ–æ—Ç–æ–≥—Ä–∞—Ñ–∏—è —Å–º–∞—Ä—Ç—Ñ–æ–Ω–∞ Xiaomi Redmi K30, –∫–æ—Ç–æ—Ä—ã–π –∞–Ω–æ–Ω—Å–∏—Ä—É—é—Ç 10 –¥–µ–∫–∞–±—Ä—è. –ï—é –ø–æ–¥–µ–ª–∏–ª—Å—è –∞–≤—Ç–æ—Ä–∏—Ç–µ—Ç–Ω—ã–π –∏–Ω—Å–∞–π–¥–µ—Ä –ø–æ–¥ –Ω–∏–∫–æ–º Xiaomishka. –ö–∞–∫ –º—ã –º–æ–∂–µ–º –∑–∞–º–µ—Ç–∏—Ç—å, —Å–ª—É—Ö–∏ –ø—Ä–æ —ç–∫—Ä–∞–Ω —Å –æ–¥–∏–Ω–∞–∫–æ–≤—ã–º–∏ —Ä–∞–º–∫–∞–º–∏ –ø–æ –±–æ–∫–∞–º –∏ –Ω–∞–¥ –¥–∏—Å–ø–ª–µ–µ–º, –∑–∞–º–µ—Ç–Ω—ã–π –ø–æ–¥–±–æ—Ä–æ–¥–æ–∫ –∏ –≤—Ä–µ–∑–∞–Ω–Ω–æ–π –≤ –Ω–µ–≥–æ –¥–≤–æ–π–Ω–æ–π —Ñ—Ä–æ–Ω—Ç–∞–ª—å–Ω–æ–π –∫–∞–º–µ—Ä–æ–π –æ–ø—Ä–∞–≤–¥–∞–ª–∏—Å—å. –ü–æ –æ—Å—Ç–∞–ª—å–Ω—ã–º –¥–∞–Ω–Ω—ã–º —Å–º–∞—Ä—Ç—Ñ–æ–Ω –¥–æ–ª–∂–µ–Ω –æ—Å–Ω–∞—Å—Ç–∏—Ç—å—Å—è 6,66- –¥—é–π–º–æ–≤—ã–º LCD-–¥–∏—Å–ø–ª–µ–µ–º —Å —Ä–∞–∑—Ä–µ—à–µ–Ω–∏–µ–º Full HD+, —á

  if __name__ == '__main__':
  # This is added back by InteractiveShellApp.init_path()
  if sys.path[0] == '':


result:
url : https://megaobzor.com/Novaja-tehnologija-Honor-V30-Pro.html
date : 2019-11-30 18:26:15
title : –ù–æ–≤–∞—è —Ç–µ—Ö–Ω–æ–ª–æ–≥–∏—è Honor V30 Pro
description : –ö–æ–º–ø–∞–Ω–∏—è Honor –≤—ã–ø—É—Å—Ç–∏–ª–∞ –∏–Ω—Ç–µ—Ä–µ—Å–Ω—É—é –Ω–æ–≤–∏–Ω–∫—É —Å –æ—Ç–ª–∏—á–Ω–æ–π —Ñ—É–Ω–∫—Ü–∏–µ–π –ø–æ–¥ –Ω–∞–∑–≤–∞–Ω–∏–µ–º V30 Pro. –ê–ø–ø–∞—Ä–∞—Ç –≤–∫–ª—é—á–∞–µ—Ç –≤ —Å–µ–±—è –¥–∏—Å–ø–ª–µ–π –Ω–∞ 6,57 –¥—é–π–º–∞,—Ñ–ª—ç—à –Ω–∞–∫–æ–ø–∏—Ç–µ–ª—å –Ω–∞ 256 –ì–±–∞–π—Ç.–¢–∞–∫–∂–µ –∏–º–µ–µ—Ç—Å—è –æ–±—ä–µ–º –æ–ø–µ—Ä–∞—Ç–∏–≤–Ω–æ–π –ø–∞–º—è—Ç–∏ 8 –ì–±–∞–π—Ç. –°–µ—Ä–¥—Ü–µ–º —Å–º–∞—Ä—Ç—Ñ–æ–Ω–∞ —è–≤–ª—è–µ—Ç—Å—è —Ñ–∏—Ä–º–µ–Ω–Ω—ã–π –ø—Ä–æ—Ü–µ—Å—Å–æ—Ä Kirin 990. –û —Å—Ç–æ–∏–º–æ—Å—Ç–∏ –Ω–æ–≤–æ–≥–æ –≥–∞–¥–∂–µ—Ç–∞ –ø–æ–∫–∞ –Ω–µ–∏–∑–≤–µ—Å—Ç–Ω–æ.
site : megaobzor.com

url : https://rozetked.me/news/8719-marketingovye-izobrazheniya-redmi-k30-s-dvoynoy-selfi-kameroy
date : 2019-11-30 17:03:37
title : –ú–∞—Ä–∫–µ—Ç–∏–Ω–≥–æ–≤—ã–µ –∏–∑–æ–±—Ä–∞–∂–µ–Ω–∏—è Redmi K30 —Å –¥–≤–æ–π–Ω–æ–π —Å–µ–ª—Ñ–∏-–∫–∞–º–µ—Ä–æ–π
description : –í —Å–æ—Ü–