In [1]:
import pickle
import os 
from tqdm import tqdm_notebook as tqdm
import json
import time

In [2]:
import lxml.etree as ET
from smart_open import smart_open
import msgpack


class GeneratorTextsXML:
    def __init__(self, dir_path):
        self._dir_path = dir_path
    
    def __iter__(self):
        for fname in os.listdir(self._dir_path):
            print(fname)
            file_path = os.path.join(self._dir_path, fname)
            if not os.path.isfile(file_path):
                continue
            with open(file_path, 'rb') as f:
                for event, elem in ET.iterparse(f, html=True, tag='doc', encoding='utf8'):
                    if elem.text is not None:
                        yield elem.text
                        

class GeneratorTextsJSON:
    def __init__(self, dir_path):
        self._dir_path = dir_path
    
    def __iter__(self):
        for fname in os.listdir(self._dir_path):
            print(fname)
            file_path = os.path.join(self._dir_path, fname)
            if not os.path.isfile(file_path):
                continue
                
            with smart_open(file_path, 'r', encoding='utf8') as f:
                for line in f:
                    yield json.loads(line)['text']
                    
                    
def parse_wiki(gen, output_path, chunk_size):
    chunk_iter = 0
    while True:
        try:
            chunk = []
            for _ in range(chunk_size):
                chunk.append(next(gen_iter))
        except StopIteration as e:
            break
        finally:
            chunk_iter += 1
            print('#Iter: ', chunk_iter)

            res = ppl(chunk)

            file_path = os.path.join(output_path, str(chunk_iter) + '.pckl')
            with open(file_path, 'wb') as f:
                f.write(msgpack.packb([e['lemma'] for e in res], use_bin_type=True))
                
                
                
def test_extracted_files(data_path, start, plus):
    for num, e in enumerate(GeneratorTexts(data_path)):
        print(num)
        if num >= start and num < start + plus + 1:
            print('============')
            print(e)

        if num == start + 1:
            break

In [3]:
from isanlp.processor_sentence_splitter import ProcessorSentenceSplitter
from isanlp.ru.processor_tokenizer_ru import ProcessorTokenizerRu
from isanlp.ru.processor_mystem import ProcessorMystem
from isanlp import PipelineCommon
from isanlp.wrapper_multi_process_document import WrapperMultiProcessDocument


ppl_single_creator = lambda : PipelineCommon([
    (ProcessorTokenizerRu(), ['text'], {0 : 'tokens'}),
    (ProcessorSentenceSplitter(), ['tokens'], {0 : 'sentences'}),
    (ProcessorMystem(), ['tokens', 'sentences'], {'lemma' : 'lemma'})
])

ppl = WrapperMultiProcessDocument([ppl_single_creator() for _ in range(10)], progress_bar=None)

In [None]:
data_path = '/notebook/projects/compounds/workdir/extracted/AA/'
gen = GeneratorTexts(data_path)
gen_iter = iter(gen)

output_path = '/notebook/projects/compounds/workdir/parse/'
chunk_size = 2000

In [4]:
data_path = '/notebook/projects/compounds/workdir/wiki_json/AA'
gen = GeneratorTextsJSON(data_path)
gen_iter = iter(gen)

output_path = '/notebook/projects/compounds/workdir/parse5/'
chunk_size = 20000
parse_wiki(gen, output_path, chunk_size)

wiki_02
#Iter:  1
#Iter:  2
#Iter:  3
#Iter:  4
#Iter:  5
#Iter:  6
#Iter:  7
#Iter:  8
#Iter:  9
#Iter:  10
#Iter:  11
#Iter:  12
#Iter:  13
#Iter:  14
#Iter:  15
#Iter:  16
wiki_03
#Iter:  17
#Iter:  18
#Iter:  19
#Iter:  20
#Iter:  21
#Iter:  22
#Iter:  23
#Iter:  24
#Iter:  25
#Iter:  26
#Iter:  27
#Iter:  28
#Iter:  29
#Iter:  30
#Iter:  31
#Iter:  32
#Iter:  33
#Iter:  34
wiki_04
#Iter:  35
#Iter:  36
#Iter:  37
#Iter:  38
#Iter:  39
#Iter:  40
#Iter:  41
#Iter:  42
#Iter:  43
#Iter:  44
#Iter:  45
#Iter:  46
#Iter:  47
#Iter:  48
#Iter:  49
#Iter:  50
wiki_05
#Iter:  51
#Iter:  52
#Iter:  53
#Iter:  54
#Iter:  55
#Iter:  56
wiki_01
#Iter:  57
#Iter:  58
#Iter:  59
#Iter:  60
#Iter:  61
#Iter:  62
#Iter:  63
#Iter:  64
#Iter:  65
#Iter:  66
#Iter:  67
#Iter:  68
#Iter:  69
wiki_00
#Iter:  70
#Iter:  71
#Iter:  72
#Iter:  73
#Iter:  74
#Iter:  75
#Iter:  76
#Iter:  77
#Iter:  78
