In [1]:
from bs4 import BeautifulSoup
import re
import codecs
import numpy as np
import pandas as pd
from tqdm import tqdm_notebook
import pymorphy2 
from pymystem3 import Mystem

In [2]:
from nltk.corpus import stopwords

# Russian stop-word list from the NLTK corpus; the title-cleaning cells
# below drop every lemma that appears in it.
russian_stopwords = stopwords.words("russian")

### mystem

In [7]:
# Settings for the Mystem run.
# path        -- directory prefix of the '<doc_id>.dat' input files
# total_sites -- highest document id; ids run from 1 to total_sites inclusive
# output_txt  -- tab-separated file that receives one cleaned title per document
path = '../content/'
total_sites = 28026
output_txt = 'title_output_mystem.txt'

In [8]:
# Extract the <title> of every document, keep only Cyrillic words, lemmatize
# them with Mystem and write one "doc_id<TAB>title" line per document.
m = Mystem()

# Membership tests against a set are O(1); the module-level list is untouched.
stop_words = set(russian_stopwords)

with tqdm_notebook(total=total_sites) as pbar:
    # Explicit UTF-8 and '\n' line endings: the titles are Cyrillic, so the
    # platform default encoding / newline translation must not be relied upon.
    with open(output_txt, 'w', encoding='utf-8', newline='\n') as out_f:
        out_f.write('{}\t{}\n'.format('doc_id', 'title'))
        for doc_id in range(1, total_sites + 1):
            # BeautifulSoup consumes the whole file in its constructor, so the
            # handle can be released before the text processing starts.
            with codecs.open(path + str(doc_id) + '.dat', 'r', 'utf-8') as in_f:
                soup = BeautifulSoup(in_f, 'lxml')
            title_all = soup.find_all('title')

            # str() of the result set still contains the markup itself; the
            # Latin tag names are discarded by the Cyrillic-only filter below.
            result = [i for i in re.split(r'\W+', str(title_all).lower()) if i]
            result = [re.sub(r'[^ёЁА-я]', '', i) for i in result]
            # Tokens that held no Cyrillic letters are now empty; drop them so
            # they do not turn into runs of spaces in the lemmatizer input.
            result = [i for i in result if i]

            # Mystem also returns the separators between words as tokens;
            # stripping them keeps whitespace from passing the length filter.
            # The lemmatizer is skipped entirely when nothing is left.
            lemmas = m.lemmatize(' '.join(result)) if result else []
            result = [u.strip() for u in lemmas]
            result = [u for u in result if len(u) > 2 and u not in stop_words]
            result = re.sub(r'\s+', ' ', ' '.join(result))

            out_f.write('{}\t{}\n'.format(doc_id, result))
            pbar.update()

HBox(children=(IntProgress(value=0, max=28026), HTML(value='')))




In [9]:
# Read the tab-separated title file back into a DataFrame and preview
# the first rows.
df = pd.read_csv(
    output_txt,
    sep='\t',
    encoding='utf-8',
    lineterminator='\n',
)
df.head()

Unnamed: 0,doc_id,title
0,1,аншин центр репродукция генетика фертимед москва
1,2,перевод киви кошелек
2,3,проект патруль время реабилитация духовный сущ...
3,4,блог клуб преподавание начальный класс портал ...
4,5,быстро понижать холестерин высокий холестерин ...


### pymorphy2

In [10]:
# Settings for the pymorphy2 run.
# path        -- directory prefix of the '<doc_id>.dat' input files
# total_sites -- highest document id; ids run from 1 to total_sites inclusive
# output_txt  -- tab-separated file that receives one cleaned title per document
path = '../content/'
total_sites = 28026
output_txt = 'title_output_pymorphy2.txt'

In [11]:
def pos(word, morth=pymorphy2.MorphAnalyzer()):
    """Return a likely part of speech for *word*.

    The tag is taken from the first parse the analyzer returns.

    Parameters
    ----------
    word : str
        Word to analyze.
    morth : pymorphy2.MorphAnalyzer
        Analyzer to use. The default instance is created once, when the
        function is defined, and shared by every call that omits it.

    Returns
    -------
    The ``tag.POS`` attribute of the first parse of *word*.
    """
    first_parse = morth.parse(word)[0]
    return first_parse.tag.POS


# Part-of-speech tags treated as function words; words carrying one of
# these tags are dropped from the titles.
functors_pos = {'INTJ', 'PRCL', 'CONJ', 'PREP'}

In [14]:
# Extract the <title> of every document, keep only Cyrillic words, reduce them
# to their pymorphy2 normal form (function words and stop words removed) and
# write one "doc_id<TAB>title" line per document.

# NOTE(review): never populated in this cell -- confirm whether it is still needed.
doc_to_title = {}

# Build the analyzer once: creating a MorphAnalyzer for every document repeats
# the same costly setup tens of thousands of times without changing any result.
morth = pymorphy2.MorphAnalyzer()

# Membership tests against a set are O(1); the module-level list is untouched.
stop_words = set(russian_stopwords)

with tqdm_notebook(total=total_sites) as pbar:
    # Explicit UTF-8 and '\n' line endings: the titles are Cyrillic, so the
    # platform default encoding / newline translation must not be relied upon.
    with open(output_txt, 'w', encoding='utf-8', newline='\n') as out_f:
        out_f.write('{}\t{}\n'.format('doc_id', 'title'))
        for doc_id in range(1, total_sites + 1):
            # Same input files as the Mystem cell, which reads them as UTF-8.
            with open(path + str(doc_id) + ".dat", 'r', encoding='utf-8') as in_f:
                soup = BeautifulSoup(in_f, 'html.parser')

            # str() of the result set still contains the markup itself; the
            # Latin tag names are discarded by the Cyrillic-only filter below.
            title_all = str(soup.find_all('title')).lower()
            result = [i for i in re.split(r'\W+', title_all) if i]
            result = [re.sub(r'[^ёЁА-я]', '', i) for i in result]

            # Parse every non-empty word exactly once and reuse that parse for
            # both the part-of-speech check and the normal form.
            parses = [morth.parse(word)[0] for word in result if word]
            result = [p.normal_form for p in parses
                      if p.tag.POS not in functors_pos]
            result = [word for word in result
                      if len(word) > 2 and word not in stop_words]
            result = re.sub(r'\s+', ' ', ' '.join(result))

            out_f.write('{}\t{}\n'.format(doc_id, result))
            pbar.update()

HBox(children=(IntProgress(value=0, max=28026), HTML(value='')))


