In [2]:
import ast
import bz2
import json
import os
import xml.sax
from itertools import chain
from multiprocessing import Pool
from multiprocessing.dummy import Pool as Threadpool
from time import time

import mwparserfromhell
import numpy as np
import pandas as pd

# Content handler for the XML parser

In [3]:
class WikiXMLHandler(xml.sax.handler.ContentHandler):
    """SAX content handler that collects cleaned pages from a Wiki XML dump.

    Character data is captured only for the ``id``, ``title`` and ``text``
    elements. When a ``page`` element closes and its text is not a redirect,
    a tuple ``(id, title, text, wikilinks, extlinks)`` is appended to
    ``self._pages`` and ``self._page_count`` is refreshed.
    """

    def __init__(self):
        xml.sax.handler.ContentHandler.__init__(self)
        self._buffer = None        # chunks of character data for the tracked tag
        self._values = {}          # fields captured for the page being read
        self._current_tag = None   # most recently opened tracked tag
        self._previous_tag = None  # tracked tag opened before the current one
        self._pages = []           # finished (id, title, text, wikilinks, extlinks) tuples
        # Defined up front so callers can read it even when no page was found.
        self._page_count = 0

    def characters(self, content):
        """Collect character data once a tracked tag has been opened."""
        if self._current_tag:
            self._buffer.append(content)

    def startElement(self, name, attrs):
        """Start a fresh buffer when a tracked element opens."""
        if name in ('id', 'title', 'text'):
            self._previous_tag = self._current_tag
            self._current_tag = name
            self._buffer = []

    def endElement(self, name):
        """Store the buffered value of a tracked element; finalise on ``page``."""
        if name == self._current_tag:
            if name == 'text':
                self._process_page()
            elif name == 'id' and self._previous_tag == 'id':
                # An id that directly follows another tracked id is ignored, so
                # the first id seen after the title is the one that is kept.
                # NOTE(review): presumably these are revision/contributor ids.
                pass
            else:
                # The parser may deliver one text node in several chunks
                # (e.g. around entities), so the chunks are concatenated
                # without a separator to rebuild the original string.
                self._values[name] = ''.join(self._buffer)
        if name == 'page':
            if not self._redirect():
                self._pages.append((self._values['id'],
                                    self._values['title'],
                                    self._values['text'],
                                    self._values['wikilinks'],
                                    self._values['extlinks']))
                self._page_count = len(self._pages)

    def _redirect(self):
        """Return True when the first token of the stripped text is 'REDIRECT'."""
        wiki = mwparserfromhell.parse(self._values['text'])
        text = wiki.strip_code().split()
        if len(text) == 0:
            return False
        return text[0] == 'REDIRECT'

    def _process_page(self):
        """Strip wiki markup from the buffered text and extract its links.

        Fills ``self._values`` with ``text``, ``wikilinks`` and ``extlinks``.
        """
        content = mwparserfromhell.parse(self._buffer)
        content = content.strip_code().strip()
        # The stripped string is parsed a second time and stripped again below.
        # NOTE(review): links are extracted from this already-stripped text,
        # not from the raw markup -- confirm that this is intended.
        content = mwparserfromhell.parse(content)
        self._values['text'] = content.strip_code().strip()
        self._values['wikilinks'] = [x.title.strip_code() for x in content.filter_wikilinks()]
        self._values['extlinks'] = [x.url.strip_code().strip() for x in content.filter_external_links()]

# Preprocessing

In [2]:
data_folder = 'C:/data/'
# Keep only the dump partition files: their names contain 'xml-p'.
# The membership test is required -- a bare string literal is always truthy
# and would let every file in the folder through.
# Sorting makes the partition order independent of the directory listing order.
partitions = sorted(
    os.path.join(data_folder, file)
    for file in os.listdir(data_folder)
    if 'xml-p' in file
)
len(partitions), partitions[-1]

(56,
 'C:/data/enwiki-20190220-pages-articles-multistream9.xml-p1791081p2336422.bz2')

In [9]:
def preprocess_pages(data_path, save=True):
    """Find and clean all pages in one compressed Wikipedia XML file.

    Parameters
    ----------
    data_path : str
        Path to the bz2-compressed XML file.
    save : bool, default True
        Placeholder: saving is not implemented yet, so the flag currently
        has no effect.

    Returns
    -------
    list
        The handler's page list, one ``(id, title, text, wikilinks, extlinks)``
        tuple per non-redirect page.
    """
    start = time()
    # Object for handling xml
    handler = WikiXMLHandler()

    # Parsing object
    parser = xml.sax.make_parser()
    parser.setContentHandler(handler)

    # Feed the file to the parser line by line; the context manager makes
    # sure the compressed file is closed even if parsing fails.
    with bz2.BZ2File(data_path, 'r') as dump:
        for line in dump:
            try:
                parser.feed(line)
            except StopIteration:
                break

    if save:
        # TODO: save all pages to a file based on the data path name
        pass

    end = time()
    # Count from the list itself so the report works when no page was found.
    page_count = len(handler._pages)
    print(f'\n{data_path} preprocessed in {round(end-start)} seconds')
    print(f'{page_count} pages found in {data_path}')
    return handler._pages

In [None]:
# The guard keeps worker processes from re-running this cell's code when the
# module is re-imported by a child process (spawn start method, e.g. Windows).
if __name__ == '__main__':
    start = time()
    # Create a pool of workers; the context manager releases the workers
    # even if one of the tasks raises.
    with Pool(processes=4) as pool:
        # Apply the preprocessing function to each partition
        results = pool.map(preprocess_pages, partitions)
        pool.close()
        pool.join()
    end = time()
    print(f'\nWhole dump preprocessed in {round(end-start)} seconds')

# Testing

In [4]:
# Dump partition used for the quick tests below.
# Relative alternative: 'data/enwiki-20190220-pages-articles-multistream1.xml-p10p30302.bz2'
wiki_dump = 'C:/data/enwiki-20190220-pages-articles-multistream1.xml-p10p30302.bz2'

In [6]:
# Object for handling xml
handler = WikiXMLHandler()
# Parsing object
parser = xml.sax.make_parser()
parser.setContentHandler(handler)
start = time()
# Parse only the beginning of the file (stops once i exceeds 1e+4 lines)
i = 0
with bz2.BZ2File(wiki_dump) as dump:
    for line in dump:
        try:
            parser.feed(line)
        except StopIteration:
            break
        i += 1
        if i > 1e+4: break
end = time()

# Count from the page list so the report works even if no page was completed.
print(f'\nSearched through {len(handler._pages)} pages')
print(f'\nIn {round(end-start)} seconds')


Searched through 10 pages

In 2 seconds


In [7]:
# Look at one parsed page record: its container type, its length and the
# type of each of its five fields.
a = handler._pages[8]
print(type(a), len(a))
tuple(type(a[k]) for k in range(5))

<class 'tuple'> 5


(str, str, str, list, list)

In [69]:
# Round-trip one page through a tab-separated file to check that the
# list-valued columns survive.
b = {
    'id': a[0],
    'title': a[1],
    'text': a[2],
    'wikilinks': [a[3]],
    'extlinks': [a[4]]
}
c = pd.DataFrame(b)
c.to_csv('test.csv', sep='\t', index=False)
# The lists are written as their Python repr, so literal_eval restores them
# exactly; stripping brackets and splitting on ', ' leaves stray quotes on
# the items and breaks items that contain ', ' themselves.
e = pd.read_csv('test.csv', delimiter='\t',
                converters={'wikilinks': ast.literal_eval,
                            'extlinks': ast.literal_eval})
e

Unnamed: 0,id,title,text,wikilinks,extlinks
0,309,An American in Paris,Themes from An American in Paris\r\nAn America...,"[University of Michigan School of Music, Theat...",[http://www.kennedy-center.org/calendar/?fusea...


In [32]:
# Store the five page fields as a single list element of a Series and check
# that the fourth field keeps its type.
c = pd.Series([[a[k] for k in range(5)]])
type(c[0][3])

list

In [15]:
# Print the first lines of the raw dump (stops once i exceeds 2e+2);
# the context manager closes the compressed file afterwards.
i = 0
with bz2.BZ2File(wiki_dump) as dump:
    for line in dump:
        print(line)
        i += 1
        if i > 2e+2: break

b'<mediawiki xmlns="http://www.mediawiki.org/xml/export-0.10/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.mediawiki.org/xml/export-0.10/ http://www.mediawiki.org/xml/export-0.10.xsd" version="0.10" xml:lang="en">\n'
b'  <siteinfo>\n'
b'    <sitename>Wikipedia</sitename>\n'
b'    <dbname>enwiki</dbname>\n'
b'    <base>https://en.wikipedia.org/wiki/Main_Page</base>\n'
b'    <generator>MediaWiki 1.33.0-wmf.17</generator>\n'
b'    <case>first-letter</case>\n'
b'    <namespaces>\n'
b'      <namespace key="-2" case="first-letter">Media</namespace>\n'
b'      <namespace key="-1" case="first-letter">Special</namespace>\n'
b'      <namespace key="0" case="first-letter" />\n'
b'      <namespace key="1" case="first-letter">Talk</namespace>\n'
b'      <namespace key="2" case="first-letter">User</namespace>\n'
b'      <namespace key="3" case="first-letter">User talk</namespace>\n'
b'      <namespace key="4" case="first-letter">Wikipedia</namespace>\n'
b'   

In [51]:
# Run the preprocessing on the test partition without the save step
b = preprocess_pages(wiki_dump, save=False)

258
661
1010
1444
1759
2104
2414
2721
3126
3491
3874
4189
4519
4868
5248
5553
5861
6186
6572
6842
7199
7495
7808
8038
8375
8807
9118
9422
9696
9981
10326
10682
10982
11321
11661
12004
12442
12849
13114
13502
13789
14130
14477
14862
15178

data/enwiki-20190220-pages-articles-multistream1.xml-p10p30302.bz2 preprocessed in 867 seconds

15395 pages found in data/enwiki-20190220-pages-articles-multistream1.xml-p10p30302.bz2


In [26]:
# preprocess_pages returns the list of page tuples itself, so index it
# directly: first page, first field (the id).
b[0][0]

'12'

In [2]:
# Modulo with a float divisor yields a float
130_000 % 1e4

0.0

In [None]:
def do_something(i):
    """Print the argument, then return it multiplied by ten."""
    print(i)
    scaled = i * 10
    return scaled

a = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]

if __name__ == '__main__':
    # The context manager releases the worker processes even if a task raises.
    with Pool(processes=4) as pool:
        results = pool.map(do_something, a)

        pool.close()
        pool.join()