In [1]:
import json_lines
import gzip
import json
import traceback
import sys
import multiprocessing

from soft404.utils import cleaned_selector, selector_to_text, get_text_blocks


def convert_item(item):
    """Reduce a crawled page record to a text-only record.

    The raw markup is removed from ``item`` (``item.pop('html')``) and parsed
    with ``cleaned_selector``. The result is a new dict carrying the original
    ``url`` and ``status`` plus the extracted ``text`` and ``title``; when the
    ``/html/body`` XPath matches, ``blocks`` holds the output of
    ``get_text_blocks`` for the first matched node.

    Returns ``None`` instead of raising if any step fails with an
    ``Exception``, so one malformed record does not abort a bulk run.
    """
    try:
        selector = cleaned_selector(item.pop('html'))
        url = item['url']
        text = selector_to_text(selector)
        title_parts = selector.xpath('/html/head/title//text()').extract()
        converted = {
            'url': url,
            'text': text,
            'title': ' '.join(title_parts),
            'status': item['status'],
        }
        body_nodes = selector.xpath('/html/body')
        if body_nodes:
            converted['blocks'] = get_text_blocks(body_nodes[0].root)
        return converted
    except Exception:
        # Deliberate best-effort: signal the failure to the caller with None.
        return None

        
# Convert every crawled page in worker processes and stream the results into
# a gzipped JSON-lines file (one converted item per line). Results arrive in
# arbitrary order because imap_unordered is used.
# NOTE(review): broken=True presumably makes json_lines tolerate a truncated
# or corrupted archive - confirm against the json_lines documentation.
with gzip.open('../text_items.jl.gz', 'wt') as outf, \
        json_lines.open('../pages.jl.gz', broken=True) as f:
    n_errors = 0
    with multiprocessing.Pool() as pool:
        converted_items = pool.imap_unordered(convert_item, f, chunksize=1000)
        for text_item in converted_items:
            if text_item is not None:
                outf.write(json.dumps(text_item) + '\n')
            else:
                # convert_item returns None when a record could not be converted.
                n_errors += 1

print('Number of erorrs: {}'.format(n_errors))



Number of erorrs: 0


In [51]:
# Micro-benchmark: compare the cost of decoding one converted item from JSON,
# from base64-wrapped pickle, and from raw pickle.
#
# `item` is not defined in this cell; it must already exist in the kernel
# (left behind by a cell that iterates over the crawl file).
# Gotcha: convert_item() pops 'html' from `item`, so running this cell a
# second time on the same `item` makes convert_item() return None.
text_item = convert_item(item)
in_text = json.dumps(text_item)
import pickle, base64
# protocol=-1 selects the highest pickle protocol available to this interpreter.
in_pkl = pickle.dumps(text_item, protocol=-1)
in_pkl_b64 = base64.b64encode(pickle.dumps(text_item, protocol=-1))

# Number of decode repetitions per timed statement.
N = 1000
print('json')
%time for _ in range(N): json.loads(in_text)
print('\nb64 + pickle')
%time for _ in range(N): pickle.loads(base64.b64decode(in_pkl_b64))
print('\npickle')
%time for _ in range(N): pickle.loads(in_pkl)

json
CPU times: user 8 ms, sys: 0 ns, total: 8 ms
Wall time: 7.05 ms

b64 + pickle
CPU times: user 4 ms, sys: 0 ns, total: 4 ms
Wall time: 3.21 ms

pickle
CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 1.23 ms


In [72]:
import struct
def write_pickle_stream(item, outf):
    """Append one length-prefixed pickled record to a binary stream.

    The record layout is a native-format ``struct`` ``'i'`` integer holding
    the payload length, followed by the ``pickle.dumps`` payload itself.

    Args:
        item: Any picklable object to serialize.
        outf: Writable binary file-like object.

    Note:
        The first parameter used to be misspelled ``itme`` while the body
        pickled the *global* name ``item``; the function therefore ignored
        its argument and only worked when the caller's loop variable
        happened to be a global called ``item``.
    """
    data = pickle.dumps(item)
    # Native-size/native-endian int header; the reader must use the same format.
    outf.write(struct.pack('i', len(data)))
    outf.write(data)

def pickle_stream_reader(outf):
    """Yield the objects stored in a length-prefixed pickle stream.

    Each record is a 4-byte ``struct`` ``'i'`` length header followed by that
    many bytes of pickle data. Iteration stops when reading a header returns
    nothing (end of file).

    Args:
        outf: Readable binary file-like object (despite the name, this is
            the *input* stream).

    Yields:
        The unpickled objects, in file order.

    Warning:
        ``pickle.loads`` can execute arbitrary code; only read files from a
        trusted source.
    """
    header = outf.read(4)
    while header:
        (payload_size,) = struct.unpack('i', header)
        yield pickle.loads(outf.read(payload_size))
        header = outf.read(4)

In [63]:
# Re-encode the converted items as a length-prefixed pickle stream so the
# two on-disk formats can be compared for size and read speed.
with json_lines.open('../text_items.jl.gz') as f, \
        open('../text_items.pkls', 'wb') as outf:
    for item in f:
        write_pickle_stream(item, outf)
        
! ls -lh ../text_items.*

-rw-r--r-- 1 kostia kostia 108M сен 12 15:19 ../text_items.jl.gz
-rw-rw-r-- 1 kostia kostia 499M сен 12 15:39 ../text_items.pkls


In [64]:
%%time
# Baseline: time a full pass over the gzipped JSON-lines file, counting the
# records it yields.
with json_lines.open('../text_items.jl.gz') as f:
    print(sum(1 for _ in f))

26461
CPU times: user 11.9 s, sys: 100 ms, total: 12 s
Wall time: 12 s


In [73]:
%%time
# Same full-pass count as the JSON-lines timing, but reading the
# length-prefixed pickle stream through pickle_stream_reader.
with open('../text_items.pkls', 'rb') as f:
    print(sum(1 for _ in pickle_stream_reader(f)))

26461
CPU times: user 3.38 s, sys: 124 ms, total: 3.5 s
Wall time: 3.5 s


In [6]:
import json_lines

# Grab one sample record from the raw crawl for interactive exploration.
# The loop body does nothing except stop early; its purpose is to leave the
# globals `i` and `item` bound for use by other cells.
with json_lines.open('../pages.jl.gz', broken=True) as f:
    for i, item in enumerate(f):
        # Breaks at i == 101, so `item` ends up as the record at index 101
        # (or the last record if the file is shorter).
        if i > 100:
            break

In [21]:
# `parsel` is used below but was not imported by any visible cell, so this
# cell failed with NameError on a fresh kernel.
import parsel

from soft404.utils import _cleaned_html_tree

# Build the cleaned tree for the sample record and wrap it in a Selector so
# it can be queried interactively.
# `item` must already exist in the kernel and still contain its 'html' key
# (convert_item() removes that key from the item it is given).
tree = _cleaned_html_tree(item['html'])
sel = parsel.Selector(root=tree, type='html')

In [31]:
# this is not quite what I want - I want to avoid breaking text on inline nodes
# Count the text chunks from itertext() that are non-empty once stripped.
sum(1 for chunk in tree.itertext() if chunk.strip())

212

In [37]:
from soft404.utils import get_text_blocks

# Extract text blocks from the cleaned tree of the sample record.
# Judging by the recorded output, each entry is a (tag, text) pair - confirm
# against soft404.utils.get_text_blocks.
text_blocks = get_text_blocks(tree)
# Display the block count followed by the full list of blocks.
len(text_blocks), text_blocks

(154,
 [('', 'VK on the App Store'),
  ('li', 'Open Menu Close Menu'),
  ('li', 'Apple'),
  ('li', 'Shopping Bag'),
  ('li', 'Apple'),
  ('li', 'Mac'),
  ('li', 'iPad'),
  ('li', 'iPhone'),
  ('li', 'Watch'),
  ('li', 'TV'),
  ('li', 'Music'),
  ('li', 'Support'),
  ('li', 'Search apple.com'),
  ('li', 'Shopping Bag'),
  ('li', 'Overview'),
  ('li', 'Music'),
  ('li', 'Video'),
  ('li', 'Charts'),
  ('div',
   "Opening the iTunes Store. If iTunes doesn't open, click the iTunes application icon in your Dock or on your Windows desktop. Progress Indicator"),
  ('div',
   "Opening the iBooks Store. If iBooks doesn't open, click the iBooks app in your Dock. Progress Indicator"),
  ('div', 'iTunes'),
  ('h2',
   "iTunes is the world's easiest way to organize and add to your digital media collection."),
  ('p',
   'We are unable to find iTunes on your computer. To download the free app VK by VKontakte, get iTunes now.'),
  ('p',
   'Already have iTunes? Click I Have iTunes to open it now. iTu