In [1]:
import numpy as np
import pandas as pd
import bz2
import xml.sax
import mwparserfromhell
from time import time

In [2]:
wiki_dump = 'enwiki-20190101-pages-articles-multistream.xml.bz2'
# index = 'enwiki-20190101-pages-articles-multistream-index.txt.bz2'

In [3]:
class WikiXMLHandler(xml.sax.handler.ContentHandler):
    """Content handler for Wiki XML data using SAX"""
    def __init__(self):
        xml.sax.handler.ContentHandler.__init__(self)
        self._buffer = None
        self._values = {}
        self._current_tag = None
        self._pages = []
        self._books = []
        
        
    def characters(self, content):
        """Characters between opening and closing tags"""
        if self._current_tag:
            self._buffer.append(content)
            
            
    def startElement(self, name, attrs):
        """Opening tag of element"""
        if name in ('title', 'text'):
            self._current_tag = name
            self._buffer = []
            
        
    def endElement(self, name):
        """Closing tag of element"""
        if name == self._current_tag:
            self._values[name] = ' '.join(self._buffer)
        if name == 'page':
            self._pages.append((self._values['title'], self._values['text']))
            self._article_count = len(self._pages)
            # Send the page to the process article function
            book = process_article(**self._values,
                                   template = 'Infobox book')
            # If article is a book append to list of books
            if book:
                self._books.append(book)

In [6]:
# Object for handling xml
handler = WikiXMLHandler()

# Parsing object
parser = xml.sax.make_parser()
parser.setContentHandler(handler)

# Iteratively process file
for line in bz2.BZ2File(wiki_dump, 'r'):
    parser.feed(line)
    
    # Stop when 3 articles have been found
    if len(handler._pages) > 2:
        break

In [7]:
handler._pages[0]

('AccessibleComputing',
 '#REDIRECT [[Computer accessibility]] \n \n {{R from move}} \n {{R from CamelCase}} \n {{R unprintworthy}}')

In [8]:
# Create the wiki article
wiki = mwparserfromhell.parse(handler._pages[0][1])

# Find the wikilinks
wikilinks = [x.title for x in wiki.filter_wikilinks()]
wikilinks[:5]

['Computer accessibility']

In [9]:
wiki.strip_code().strip()

'REDIRECT Computer accessibility'

In [10]:
wiki.filter_templates('Infobox book')

['{{R from move}}', '{{R from CamelCase}}', '{{R unprintworthy}}']

In [11]:
def process_article(title, text, template='Infobox book'):
    """Process a wikipedia article looking for template"""
    # Create a parsing object
    wikicode = mwparserfromhell.parse(text)
    # Search through templates for the template
    matches = wikicode.filter_templates(matches=template)
    if len(matches) >= 1:
        # Extract information from infobox
        properties = {param.name.strip_code().strip(): param.value.strip_code().strip()
                      for param in matches[0].params
                      if param.value.strip_code().strip()}
        # Extract internal wikilinks
        wikilinks = [x.title.strip_code().strip() for x in wikicode.filter_wikilinks()]
        # Extract external links
        exlinks = [x.url.strip_code().strip() for x in wikicode.filter_external_links()]
        return (title, properties, wikilinks, exlinks)

In [13]:
# Object for handling xml
handler = WikiXMLHandler()
# Parsing object
parser = xml.sax.make_parser()
parser.setContentHandler(handler)
start = time()
# Parse the entire file
i = 0
for line in bz2.BZ2File(wiki_dump):
    try:
        parser.feed(line)
    except StopIteration:
        break
    i += 1
    if i > 1e+5: break
end = time()
books = handler._books

print(f'\nSearched through {handler._article_count} articles')
print(f'\nFound {len(books)} books in {round(end-start)} seconds')


Searched through 409 articles

Found 4 books in 8 seconds


In [14]:
books[3]

('A Clockwork Orange (novel)',
 {'name': 'A Clockwork Orange',
  'image': 'Clockwork orange.jpg',
  'caption': 'Dust jacket from the first edition',
  'cover_artist': 'Barry Trengrove < ref > Urgent Copy exhibition: A Clockwork Orange (1962) | International Anthony Burgess Foundation Retrieved 2015-11-26. < /ref >',
  'author': 'Anthony Burgess',
  'country': 'United Kingdom',
  'language': 'English < br > Nadsat',
  'genre': 'Science fiction, Dystopian fiction, Satire, Black Comedy',
  'published': '1962 (William Heinemann, UK)',
  'media_type': 'Print (hardback  &  paperback)  &  audio book (cassette, CD)',
  'pages': '192 pages (hardback edition)  &   < br/ >  176 pages (paperback edition)',
  'isbn': '0-434-09800-0',
  'oclc': '4205836'},
 ['Anthony Burgess',
  'Nadsat',
  'Science fiction',
  'Dystopian fiction',
  'Satire',
  'Black Comedy',
  'Heinemann (book publisher)',
  'hardback',
  'paperback',
  'Compact audio cassette',
  'Compact Disc',
  'Dystopian fiction',
  'Satire'