In [1]:
import numpy as np
import pandas as pd
import bz2
import xml.sax
import mwparserfromhell
import os
import json
import nltk
import csv
from time import time
from itertools import chain
from multiprocessing import Pool
from multiprocessing.dummy import Pool as Threadpool
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

In [2]:
# Make sure the NLTK stop-word corpus read by stopwords.words('english') in the
# following cells is available locally; the call returns True on success.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Briggstone\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
# Tokens that carry no content: punctuation marks / quote variants ...
punctuations = {
    '.', ',', ';', ':', '?', '!', '#', '\\', '/', '"',
    '\'', '\'\'', '´´', '´', '``', '`', '(', ')',
}
# ... and the English stop words shipped with NLTK.
stop_words = set(stopwords.words('english'))
# Everything that should be dropped from tokenized text.
filters = punctuations | stop_words

# Content handler for the XML parser

In [89]:
class WikiXMLHandler(xml.sax.handler.ContentHandler):
    """SAX content handler that collects cleaned pages from a Wikipedia XML dump.

    For every ``<page>`` element whose text is not a redirect, the tuple
    ``(id, title, tokens, wikilinks)`` is appended to ``self._pages``:

    * ``id`` / ``title`` -- character data of the ``<id>`` / ``<title>`` elements.
      Only an ``<id>`` that does not directly follow another ``<id>`` (among the
      tracked tags) is stored, so ids nested further down the page do not
      overwrite the first one.
    * ``tokens`` -- the page text with wiki markup stripped, tokenized with
      ``word_tokenize`` and with punctuation and English stop words removed.
    * ``wikilinks`` -- markup-stripped titles of all wikilinks found in the text.

    ``self._page_count`` always equals ``len(self._pages)``.
    """

    def __init__(self):
        xml.sax.handler.ContentHandler.__init__(self)
        self._buffer = None          # list of character chunks of the tracked element
        self._values = {}            # fields of the page currently being read
        self._current_tag = None     # most recently opened tracked tag
        self._previous_tag = None    # tracked tag opened before the current one
        self._pages = []             # collected (id, title, tokens, wikilinks) tuples
        self._page_count = 0         # defined up front so it is readable even when no page was stored
        self._skip_page = False      # True while the current page is a redirect
        self._punctuations = {'.', ',', ';', ':', '?', '!', '#', '\\', '/', '"', '\'', '\'\'', '´´', '´', '``', '`', '(', ')'}
        self._stop_words = set(stopwords.words('english'))
        self._filter = self._punctuations.union(self._stop_words)

    def characters(self, content):
        """Buffer character data once a tracked element has been opened.

        The parser may deliver the text of one element in several chunks, so
        the chunks are only collected here and concatenated when needed.
        """
        if self._current_tag:
            self._buffer.append(content)

    def startElement(self, name, attrs):
        """Opening tag of element: start a fresh buffer for tracked tags."""
        if name in ('id', 'title', 'text'):
            self._previous_tag = self._current_tag
            self._current_tag = name
            self._buffer = []

    def endElement(self, name):
        """Closing tag of element: store the field, or the whole page on ``</page>``."""
        if name == self._current_tag:
            if name == 'text':
                self._skip_page = self._redirect()
                # Redirects are discarded anyway, so do not spend time
                # parsing and tokenizing them.
                if not self._skip_page:
                    self._process_page()
            elif name == 'id' and self._previous_tag == 'id':
                # An <id> directly following another <id> is not the page id.
                pass
            else:
                # Chunks are consecutive pieces of the same text: join them
                # without a separator so no spurious spaces are introduced.
                self._values[name] = self._buffered_text()
        if name == 'page':
            if not self._skip_page:
                self._pages.append((self._values['id'],
                                    self._values['title'],
                                    self._values['text'],
                                    self._values['wikilinks']))
                self._page_count = len(self._pages)

    def _buffered_text(self):
        """Return the buffered character chunks as one contiguous string."""
        return ''.join(self._buffer or [])

    def _redirect(self):
        """Return True if the buffered page text starts with a redirect directive.

        The check is done on the raw text and ignores case, so ``#REDIRECT``,
        ``#redirect`` and ``#Redirect`` are all recognised.
        """
        return self._buffered_text().lstrip().upper().startswith('#REDIRECT')

    def _process_page(self):
        """Extract wikilinks and filtered tokens from the buffered page text.

        Results are written to ``self._values['wikilinks']`` and
        ``self._values['text']``.
        """
        # Parse the complete text at once; parsing chunk by chunk would miss
        # links and templates that span a chunk boundary.
        content = mwparserfromhell.parse(self._buffered_text())
        self._values['wikilinks'] = [x.title.strip_code() for x in content.filter_wikilinks()]
        # Second pass over the already stripped text, as in the original
        # implementation, to remove markup that survived the first strip.
        content = mwparserfromhell.parse(content.strip_code().strip())
        self._values['text'] = [
            word
            for word in word_tokenize(content.strip_code().strip())
            if word not in self._filter
        ]

# Preprocessing

In [90]:
# Folder holding the compressed dump partitions.
# NOTE(review): absolute, machine-specific path -- consider making it configurable.
data_folder = 'E:/wikidata/Sample_zip/'
# Keep only dump partitions: their file names contain 'xml-p'.  (The previous
# condition `if 'xml-p'` was a constant truthy string and filtered nothing.)
partitions = [data_folder + name for name in os.listdir(data_folder) if 'xml-p' in name]
len(partitions), partitions[-1]

(1,
 'E:/wikidata/Sample_zip/enwiki-20190220-pages-articles-multistream26.xml-p42567204p42663461.bz2')

In [98]:
def preprocess_pages(data_path, save=True, max_lines=10_000, output_path='test.json'):
    """Finds and cleans all pages from a compressed wikipedia XML file.

    Parameters
    ----------
    data_path : str
        Path of the bz2-compressed XML dump partition.
    save : bool, default True
        When True, write the collected pages to ``output_path`` as JSON lines.
        Each record holds the page id and a list ``[title, tokens, wikilinks]``.
    max_lines : int or None, default 10_000
        Stop feeding the parser once more than this many lines have been read
        (same cut-off as the previously hard-coded ``1e+4``).  ``None`` reads
        the whole file.
    output_path : str, default 'test.json'
        Destination of the JSON-lines file written when ``save`` is True.

    Returns
    -------
    None
        Timing and the number of pages found are printed.
    """
    start = time()
    # Object for handling xml
    handler = WikiXMLHandler()

    # Parsing object
    parser = xml.sax.make_parser()
    parser.setContentHandler(handler)

    # Iteratively process file; the context manager closes it even if the
    # parser raises on malformed input.
    with bz2.BZ2File(data_path, 'r') as file:
        for line_number, line in enumerate(file, start=1):
            try:
                parser.feed(line)
            except StopIteration:
                break
            if max_lines is not None and line_number > max_lines:
                break

    if save:
        # One record per page: [id, [title, tokens, wikilinks]]
        records = [[page[0], [page[1], page[2], page[3]]] for page in handler._pages]
        # Named `pages_frame` rather than `json` so the json module is not shadowed.
        pages_frame = pd.DataFrame(records)
        pages_frame.to_json(output_path, orient="records", lines=True)

    end = time()
    print(f'\n{data_path} preprocessed in {round(end-start)} seconds')
    # len() of the page list is always defined, even when no page was found.
    print(f'{len(handler._pages)} pages found in {data_path}')

# Testing

In [99]:
# Smoke test: run the preprocessing on a single dump partition.
# NOTE(review): "E:wikidata/..." has no slash after the drive letter; on Windows this
# is presumably resolved relative to the current directory of drive E: -- confirm
# whether "E:/wikidata/..." (the form used for data_folder) was intended.
test_file = "E:wikidata/enwiki-20190220-pages-articles-multistream14.xml-p7697599p7744799.bz2"
preprocess_pages(test_file)


E:wikidata/enwiki-20190220-pages-articles-multistream14.xml-p7697599p7744799.bz2 preprocessed in 2 seconds
86 pages found in E:wikidata/enwiki-20190220-pages-articles-multistream14.xml-p7697599p7744799.bz2
