### Dependencies

In [None]:
import xml.sax

class WikiXmlHandler(xml.sax.handler.ContentHandler):
    """SAX content handler that captures the title, id and text of a wiki page.

    Captured values are stored in ``self._values`` keyed by tag name
    (``'title'``, ``'id'``, ``'text'``). Only the first ``<id>`` element seen
    is recorded; later ``<id>`` elements are ignored. ``<title>`` and
    ``<text>`` are overwritten each time they occur.
    """

    # Tags whose character data is collected.
    _CAPTURED_TAGS = ('title', 'text', 'id')

    def __init__(self):
        xml.sax.handler.ContentHandler.__init__(self)
        self._buffer = None        # chunks of the element being captured
        self._values = {}          # tag name -> captured string
        self._current_tag = None   # tag being captured, None when idle

    def characters(self, content):
        """Collect character data while inside a captured element."""
        if self._current_tag is not None:
            self._buffer.append(content)

    def startElement(self, name, attrs):
        """Start capturing when a tag of interest opens."""
        if name not in self._CAPTURED_TAGS:
            return
        # Keep only the first <id>; subsequent ones must not replace it.
        if name == 'id' and 'id' in self._values:
            return
        self._current_tag = name
        self._buffer = []

    def endElement(self, name):
        """Store the captured value when the element being captured closes."""
        if name == self._current_tag:
            # The parser may deliver one text node in several chunks (e.g.
            # split at entities or newlines), so the chunks are concatenated
            # without a separator to reproduce the original text exactly.
            self._values[name] = ''.join(self._buffer)
            # Stop capturing so data outside the element is ignored.
            self._current_tag = None
            self._buffer = None

### Engine

In [None]:
import bz2
import mwparserfromhell as mph
import re

class WikiEngine:
    """Random-access reader for a bz2-compressed wiki XML dump with an index.

    path_xml is the path to the XML bz2 file.
    path_idx is the path to the index bz2 file, whose lines have the form
    ``offset:page_id:title``. ``offset`` is used as a byte position in the
    compressed XML file at which a bz2 stream starts.
    """

    # Number of compressed bytes read per step while decompressing a stream.
    _READ_CHUNK = 64 * 1024

    def __init__(self, path_xml, path_idx):
        self._idx = None           # title -> (offset, page_id), loaded lazily
        self._block_sizes = None   # cached result of get_block_sizes()
        self._links = None         # cached links of the current page
        self.page = None           # parsed wikicode of the last loaded page
        self.path_xml = path_xml
        self.path_idx = path_idx
        self.handler = WikiXmlHandler()
        self.parser = xml.sax.make_parser()
        self.parser.setContentHandler(self.handler)

    def get_idx(self):
        """Return the index as ``{title: (offset, page_id)}``, loading it once.

        Returns None when no index path was given.
        """
        if self._idx is None and self.path_idx:
            print('Loading index...')
            index = {}
            with bz2.BZ2File(self.path_idx, 'rb') as file:
                for line in file:
                    line = line.strip()
                    if not line:
                        continue
                    # Titles may contain ':', so split at most twice.
                    offset, page_id, name = line.split(b':', 2)
                    index[name.decode('utf-8')] = (int(offset), int(page_id))
            # Assign only after a complete read so a failed load is not cached.
            self._idx = index
        return self._idx
    idx = property(get_idx)

    def get_block_sizes(self):
        """Return the non-zero differences between consecutive index offsets.

        The index is loaded on demand. Returns None when no index is
        available.
        """
        if self._block_sizes is None:
            index = self.idx
            if index is None:
                return None
            print('Calculating block sizes...')
            offsets = [entry[0] for entry in index.values()]
            # Entries sharing an offset yield a zero difference and are
            # dropped.
            self._block_sizes = [
                nxt - cur for cur, nxt in zip(offsets, offsets[1:])
                if nxt != cur
            ]
        return self._block_sizes
    block_sizes = property(get_block_sizes)

    def get_links(self):
        """Return the wikilink titles of the current page, or None if no page
        has been loaded."""
        if self._links is None and self.page:
            self._links = [x.title for x in self.page.filter_wikilinks()]
        return self._links
    links = property(get_links)

    def load_page(self, page_name):
        """Load and parse the page titled ``page_name``.

        Returns the parsed page (also stored in ``self.page``), or None when
        the title is not in the index. Raises NameError when the page cannot
        be found in the dump.
        """
        index = self.idx
        if not index or page_name not in index:
            return None
        page_offset, page_id = index[page_name]
        print('Searching for page "{}" with id {}...'.format(page_name, page_id))
        raw = WikiEngine.search_dump(self.path_xml, page_id, page_offset)
        page_xml = WikiEngine.strip_manual_ref(raw.decode('utf-8'))
        print('Loaded.')
        print('Parsing XML...')
        # Use a fresh handler for every page so values captured from a
        # previously loaded page cannot leak into this one.
        self.handler = WikiXmlHandler()
        self.parser.setContentHandler(self.handler)
        try:
            self.parser.feed(page_xml)
            self.parser.close()
        finally:
            # Leave the parser ready for the next document even on error.
            self.parser.reset()
        print('Parsing wiki...')
        self.page = mph.parse(self.handler._values['text'])
        print('Parsed.')
        self._links = None
        return self.page

    @staticmethod
    def fetch_dump(path, offset, block_size):
        """Read ``block_size`` raw bytes at ``offset`` and bz2-decompress them."""
        with open(path, 'rb') as file:
            file.seek(offset)
            data = file.read(block_size)
            return bz2.decompress(data)

    @staticmethod
    def search_dump(path, page_id, offset, block_size=None):
        """Return the XML (bytes) of the page with id ``page_id``.

        The bz2 stream starting at byte ``offset`` of the compressed file
        ``path`` is decompressed and scanned line by line. ``block_size``,
        when given, limits how many compressed bytes are read. The returned
        bytes span from the line containing ``<page>`` through the line
        containing ``</page>``.

        Raises NameError when the page is not found in that stream.
        """
        id_line = b'<id>' + str(page_id).encode('utf-8') + b'</id>'
        stream = WikiEngine._read_stream(path, offset, block_size)
        page_lines = None    # lines of the page being scanned; None outside
        id_checked = False   # whether the page's first <id> has been seen
        matched = False
        for line in stream.splitlines(True):
            if b'<page>' in line:
                page_lines = []
                id_checked = False
                matched = False
            if page_lines is None:
                continue
            page_lines.append(line)
            # Compare only the first <id> of each page, so that a later id
            # inside the same page (e.g. in <revision>) is never mistaken
            # for the page id.
            if not id_checked and b'<id>' in line:
                id_checked = True
                matched = id_line in line
            if b'</page>' in line:
                if matched:
                    return b''.join(page_lines)
                page_lines = None
        raise NameError('Page not found.')

    @staticmethod
    def _read_stream(path, offset, block_size=None):
        """Decompress the single bz2 stream that starts at byte ``offset``.

        Reading stops at the end of the stream, at end of file, or after
        ``block_size`` compressed bytes when that limit is given.
        """
        decompressor = bz2.BZ2Decompressor()
        remaining = block_size
        parts = []
        with open(path, 'rb') as file:
            file.seek(offset)
            while not decompressor.eof:
                size = WikiEngine._READ_CHUNK
                if remaining is not None:
                    size = min(size, remaining)
                data = file.read(size) if size > 0 else b''
                if not data:
                    break
                if remaining is not None:
                    remaining -= len(data)
                parts.append(decompressor.decompress(data))
        return b''.join(parts)

    @staticmethod
    def strip_manual_ref(text):
        """Remove XML-escaped ``<ref ...>``, ``</ref>`` and ``<ref ... />`` tags."""
        return re.sub(r'&lt;/*ref.*?(/&gt;|&gt;)', '', text)

    @staticmethod
    def filter_top_section(text):
        """Split ``text`` at the first ``==...==`` match.

        Returns ``(before, rest)``; ``rest`` is empty when there is no match.
        """
        head = re.search(r'==.*?==', text)
        idx = head.span(0)[0] if head else len(text)
        return (text[:idx], text[idx:])

### Test

In [None]:
# Directory and file names of the XML dump partition and its index.
base_path = '/Users/harangju/Developer/data/wiki/partition/'
xml_name = 'enwiki-20190720-pages-articles-multistream1.xml-p10p30302.bz2'
index_name = 'enwiki-20190720-pages-articles-multistream-index1.txt-p10p30302.bz2'
# base_path already ends with a separator, so plain concatenation is enough.
xml_path, index_path = (base_path + name for name in (xml_name, index_name))
# Engine used by the cells below.
wiki = WikiEngine(xml_path, index_path)

In [None]:
# Evaluate the block_sizes property of the engine created above.
wiki.block_sizes

In [None]:
# Load one page by title, then report how many wikilinks it contains.
wiki.load_page('AccessibleComputing')
# Other titles to try:
# wiki.load_page('Artificial languages')
# wiki.load_page('Abstract (law)')
# wiki.load_page('Anxiety')
# wiki.load_page('Foreign relations of Azerbaijan')
# wiki.load_page('Alfonso Cuarón')
# wiki.load_page('ADHD')
print('Number of links: {}'.format(len(wiki.links)))

In [None]:
# Scratch check of slicing: everything except the last element.
x = list(range(1, 5))
x[:-1]

In [None]:
# Read 1000 raw bytes starting at byte 617 of the compressed XML file and
# bz2-decompress them, then show the first 100 decompressed bytes.
# NOTE(review): presumably 617 is the start of a bz2 stream and 1000 its
# compressed length; bz2.decompress needs complete stream data - confirm
# against the index.
with open(xml_path, 'rb') as file:
    file.seek(617)
    data = file.read(1000)
text = bz2.decompress(data)
text[:100]