### WikiEngine

In [1]:
import bz2
import os
import xml.etree.ElementTree as ET
import mwparserfromhell as mph
import re

class WikiDump():
    """Random-access reader for a bz2-compressed wiki XML dump plus its index.

    The index file (``path_idx``) is a bz2 text file whose lines have the form
    ``offset:page_id:title``. ``offset`` is the byte position in ``path_xml``
    of the compressed block that holds the page. A page is loaded by
    decompressing only that block and picking the ``<page>`` whose ``<id>``
    matches.
    """

    def __init__(self, path_xml, path_idx):
        """Remember the two file paths; nothing is read until first use.

        Args:
            path_xml: Path to the bz2-compressed XML dump.
            path_idx: Path to the bz2-compressed index file.
        """
        # title -> (offset, page_id, block_size); filled lazily by get_idx().
        self._idx = {}
        # Cached wikilink titles of the current page; reset by set_page().
        self._links = []
        # Parsed wikicode of the most recently loaded page, or None.
        self._page = None
        self.path_xml = path_xml
        self.path_idx = path_idx

    def get_idx(self):
        """Return the title index, loading it from ``self.path_idx`` on first use.

        Returns:
            dict mapping page title to ``(offset, page_id, block_size)``, where
            ``block_size`` is the distance from ``offset`` to the next larger
            offset in the index (or to the end of the dump file for the last
            block).
        """
        if self._idx:
            return self._idx
        print('WikiDump: Loading index...')
        # Read from the path given to the constructor, not from a notebook
        # global, so every instance uses its own index file.
        with bz2.BZ2File(self.path_idx, 'rb') as file:
            lines = file.readlines()
        # Walk the index backwards so the end of each block (the next distinct
        # offset) is already known when an entry is reached.
        block_end = os.path.getsize(self.path_xml)
        offset_prev = block_end
        for line in reversed(lines):
            line = line.strip()
            if not line:
                # Tolerate blank lines instead of failing the 3-way unpack.
                continue
            # maxsplit=2 keeps any ':' that is part of the title itself.
            offset, pid, name = line.split(b':', 2)
            offset, pid, name = int(offset), int(pid), name.decode('utf8')
            if offset < offset_prev:
                block_end = offset_prev
            self._idx[name] = (offset, pid, block_end - offset)
            offset_prev = offset
        print('WikiDump: Loaded.')
        return self._idx
    idx = property(get_idx)

    def get_links(self):
        """Return the wikilink titles of the current page.

        The list is computed once per page and cached until ``page`` is
        reassigned.

        Returns:
            list of the ``.title`` of every wikilink on the page; an empty
            list when no page is loaded (so slicing and iterating are safe).
        """
        if not self._links and self.page:
            self._links = [x.title for x in self.page.filter_wikilinks()]
        return self._links
    links = property(get_links)

    def get_page(self):
        """Return the currently loaded page (parsed wikicode) or None."""
        return self._page

    def set_page(self, page):
        """Set the current page and drop the link cache of the previous one."""
        self._page = page
        self._links = []
    page = property(get_page, set_page)

    def load_page(self, page_name, filter_top=False):
        """Load and parse the page titled ``page_name``.

        Args:
            page_name: Exact title as it appears in the index.
            filter_top: If true, keep only the text before the first
                ``==...==`` heading.

        Returns:
            The parsed page (also stored in ``self.page``), or None when the
            title is not in the index.
        """
        entry = self.idx.get(page_name)
        if entry is None:
            self.page = None
            return None
        offset, pid, block_size = entry
        xml = WikiDump.fetch_block(self.path_xml, offset, block_size)
        # A block is a bare sequence of <page> elements, so wrap it in a
        # synthetic root to make it a well-formed document.
        # NOTE(review): the last block runs to end-of-file and may carry
        # trailing non-page data -- confirm it parses.
        root = ET.fromstring(b'<root>' + xml + b'</root>')
        # search_id() yields None for a missing id or an empty <text/>; treat
        # that as empty wikitext so filter_top_section() does not get None.
        text = WikiDump.search_id(root, pid) or ''
        if filter_top:
            text = WikiDump.filter_top_section(text)
        self.page = mph.parse(text)
        return self.page

    @staticmethod
    def fetch_block(path, offset, block_size):
        """Read ``block_size`` bytes at ``offset`` from ``path`` and bz2-decompress them."""
        with open(path, 'rb') as file:
            file.seek(offset)
            return bz2.decompress(file.read(block_size))

    @staticmethod
    def search_id(root, pid):
        """Return the revision text of the ``<page>`` whose ``<id>`` equals ``pid``.

        Returns None when no page matches (or when the matching ``<text>``
        element is empty).
        """
        for page in root.iter('page'):
            if pid == int(page.find('id').text):
                return page.find('revision').find('text').text
        return None

    @staticmethod
    def filter_top_section(text):
        """Return the part of ``text`` before the first ``==...==`` match.

        If there is no such match the whole text is returned.
        """
        head = re.search(r'==.*?==', text)
        return text[:head.start()] if head else text

In [2]:
# Location of the local Wikipedia dump and the names of its two files.
path_base = '/Users/harangju/Developer/data/wiki/'
name_index = 'enwiki-20190801-pages-articles-multistream-index.txt.bz2'
name_xml = 'enwiki-20190801-pages-articles-multistream.xml.bz2'

# path_base already ends with a separator, so join() is plain concatenation.
path_index = os.path.join(path_base, name_index)
path_xml = os.path.join(path_base, name_xml)

# Constructing the reader only stores the paths.
dump = WikiDump(path_xml, path_index)

### Exploring the wiki dump

In [3]:
# Load a page by exact title (the first access to the index loads it),
# then display the first five wikilink titles found on that page.
dump.load_page('Portal:Physics/Topics')
dump.links[:5]

WikiDump: Loading index...
WikiDump: Loaded.


['Classical physics', 'mechanics', 'optics', 'electricity', 'magnetism']

In [4]:
# filter_top=True keeps only the text before the first '==...==' heading,
# so the links shown here come from the top of the article.
dump.load_page('Matter', filter_top=True)
dump.links[5:10]

['plasma (physics)', 'quartz', 'solid', 'water', 'liquid']

### Wikipedia hypernet traversal

In [16]:
import networkx as nx

class WikiCrawler():
    """Breadth-first traversal of wikilinks, recording them as graph edges."""

    @staticmethod
    def _normalize_title(link):
        """Turn a wikilink target into a page title usable as an index key.

        Drops any ``#section`` suffix and upper-cases only the first
        character. ``str.capitalize()`` is deliberately not used: it also
        lower-cases the rest of the string, which corrupts multi-word titles.
        """
        target = str(link).split('#')[0]
        return target[:1].upper() + target[1:]

    @staticmethod
    def bfs(graph, dump, queue, depth_goal=1):
        """Crawl outward from the titles in ``queue`` for ``depth_goal`` levels.

        Args:
            graph: Object with an ``edges`` container and ``add_edge(u, v)``;
                edges are added as ``(linked_title, linking_page)``.
            dump: Object with ``load_page(title, filter_top=True)`` and a
                ``links`` attribute for the page just loaded.
            queue: List of seed titles. It is consumed and extended in place.
            depth_goal: Number of levels to process before stopping.

        Returns:
            None. ``graph`` is modified in place.
        """
        depth = 0
        # Number of queued titles still belonging to the current level.
        depth_num_items = len(queue)
        depth_inc_pending = False
        # Titles that have ever been queued, so no page is loaded twice.
        seen = set(queue)
        while queue:
            page = queue.pop(0)
            depth_num_items -= 1
            if depth_num_items == 0:
                # Last title of this level: the size of the next level is
                # only known after this page's links have been queued.
                depth += 1
                print('Depth: ' + str(depth))
                depth_inc_pending = True
            if dump.load_page(page, filter_top=True):
                for link in dump.links:
                    link = WikiCrawler._normalize_title(link)
                    if not link:
                        # Section-only links ('#Foo') have no page title.
                        continue
                    # NOTE(review): the membership test uses (page, link) but
                    # the inserted edge is (link, page); kept as in the
                    # original -- confirm this asymmetry is intended.
                    if (page, link) not in graph.edges:
                        graph.add_edge(link, page)
                        if link not in seen:
                            seen.add(link)
                            queue.append(link)
            if depth_inc_pending:
                depth_num_items = len(queue)
                depth_inc_pending = False
            if depth == depth_goal:
                break

In [None]:
# Seed the crawl with a single article; bfs() consumes `queue` and adds
# edges to `graph` in place.
queue = ['Matter']
graph = nx.DiGraph()
WikiCrawler.bfs(graph=graph, dump=dump, queue=queue, depth_goal=4)

In [14]:
from webweb import Web

# Materialise the crawled edges as a plain list and open the visualisation.
edge_list = list(graph.edges)
Web(edge_list).show()