In [1]:
import numpy as np
import pandas as pd
import bz2
import xml.sax
import mwparserfromhell
import os
import json
import nltk
from time import time
from itertools import chain
from multiprocessing import Pool
from multiprocessing.dummy import Pool as Threadpool
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

In [4]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\marku\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [2]:
punctuations = set(['.', ',', ';', ':', '?', '!', '#', '\\', '/', '"', '\'', '\'\'', '´´', '´', '``', '`', '(', ')'])
stop_words = set(stopwords.words('english'))
filters = punctuations.union(stop_words)

# Content handler for the XML parser

In [3]:
class WikiXMLHandler(xml.sax.handler.ContentHandler):
    """Content handler for Wiki XML data using SAX"""
    def __init__(self):
        xml.sax.handler.ContentHandler.__init__(self)
        self._buffer = None
        self._values = {}
        self._current_tag = None
        self._previous_tag = None
        self._pages = []
        self._skip_page = False
        
        
    def characters(self, content):
        """Characters between opening and closing tags"""
        if self._current_tag:
            self._buffer.append(content)
            
            
    def startElement(self, name, attrs):
        """Opening tag of element"""
        if name in ('id', 'title', 'text'):
            self._previous_tag = self._current_tag
            self._current_tag = name
            self._buffer = []
            
        
    def endElement(self, name):
        """Closing tag of element"""
        if name == self._current_tag:
            if name == 'text':
                if self._redirect():
                    self._skip_page = True
                    pass
                else:
                    self._skip_page = False
                self._process_page()
            elif name == 'id' and self._previous_tag == 'id':
                pass
            else:
                self._values[name] = ' '.join(self._buffer)
        if name == 'page':
            if not self._skip_page:
                self._pages.append((self._values['id'],
                                    self._values['title'],
                                    self._values['text'],
                                    self._values['wikilinks']))
                self._page_count = len(self._pages)
    
    
    def _redirect(self):
        wiki = mwparserfromhell.parse(self._buffer)
        text = wiki.strip_code().split()
        if len(text) == 0:
            return False
        return text[0] == 'REDIRECT'
    
    
    def _process_page(self):
        content = mwparserfromhell.parse(self._buffer)
        content = content.strip_code().strip()
        content = mwparserfromhell.parse(content)
        text = content.strip_code().strip()
        words = word_tokenize(text)
        filtered_words = filter(lambda word: word not in filters, words)
        text = [word for word in filtered_words]
        self._values['text'] = text
        self._values['wikilinks'] = [x.title.strip_code() for x in content.filter_wikilinks()]

# Preprocessing

In [4]:
data_folder = 'D:/DAT500-Project-Wiki/data/'
partitions = [data_folder + file for file in os.listdir(data_folder) if 'xml-p']
len(partitions), partitions[-1]

(56,
 'D:/DAT500-Project-Wiki/data/enwiki-20190220-pages-articles-multistream9.xml-p1791081p2336422.bz2')

In [5]:
def preprocess_pages(data_path, save=True):
    """Finds and cleans all pages from a compressed wikipedia XML file"""
    start = time()
    # Object for handling xml
    handler = WikiXMLHandler()

    # Parsing object
    parser = xml.sax.make_parser()
    parser.setContentHandler(handler)

    # Iteratively process file
    i = 0
    for line in bz2.BZ2File(data_path, 'r'):
        try:
            parser.feed(line)
        except StopIteration:
            break
        i += 1
        if i > 1e+4: break

    if save:
        temp = []
        for i, page in enumerate(handler._pages):
            bemp.append([])
            for j, item in enumerate(page):
                if j == 2:
                    b[i].append(j+1+len(item))
                    b[i].extend(item)
                elif j == 3:
                    b[i].extend(item)
                else:
                    b[i].append(item)

        csv = pd.DataFrame(temp)
        csv.to_csv('test.csv', index=False, header=False)
    
    end = time()
    print(f'\n{data_path} preprocessed in {round(end-start)} seconds')
    print(f'{handler._page_count} pages found in {data_path}')

In [None]:
start = time()
# Create a pool of workers to execute processes
pool = Pool(processes = 4)

# Map (service, task), applies function to each partition 
results = pool.map(preprocess_pages, partitions)

pool.close()
pool.join()
end = time()
print(f'\nWhole dump preprocessed in {round(end-start)} seconds')

# Testing

In [13]:
wiki_dump = 'data/enwiki-20190220-pages-articles-multistream1.xml-p10p30302.bz2'
# wiki_dump = 'C:/data/enwiki-20190220-pages-articles-multistream1.xml-p10p30302.bz2'

In [14]:
# Object for handling xml
handler = WikiXMLHandler()
# Parsing object
parser = xml.sax.make_parser()
parser.setContentHandler(handler)
start = time()
# Parse the entire file
i = 0
for line in bz2.BZ2File(wiki_dump):
    try:
        parser.feed(line)
    except StopIteration:
        break
    i += 1
    if i > 1e+4: break
end = time()

print(f'\nSearched through {handler._page_count} pages')
print(f'\nIn {round(end-start)} seconds')


Searched through 10 pages

In 4 seconds


In [21]:
a = handler._pages[8:10]
# print(type(a), len(a))
# type(a[0]), type(a[1]), type(a[2]), type(a[3])

In [34]:
import csv

In [41]:
len(a[1][2])

b = []
for i, p in enumerate(a):
    b.append([])
    for j, s in enumerate(p):
        if j == 2:
            b[i].append(j+1+len(s))
            b[i].extend(s)
        elif j == 3:
            b[i].extend(s)
        else:
            b[i].append(s)

csvData = [['Person', 'Age', 'test'], ['Peter', '22', 'fdsafa', 'sdad'], ['Jasmine', '21'], ['Sam', '24']]
            
with open('test.csv', 'w') as csvFile:
    writer = csv.writer(csvFile)
    writer.writerows(b)

csvFile.close()


# c = pd.DataFrame(b)
# c.to_csv('test.csv', index=False, header=False)
# d = pd.read_csv('test.csv', header=None)
# d

UnicodeEncodeError: 'charmap' codec can't encode character '\u0144' in position 8776: character maps to <undefined>

In [32]:
# b[0]

In [None]:
b[0][3:b[2]]
b[0][b[2]:]

In [52]:
temp = []
for page in handler._pages:
    temp.append({
        'id': page[0],
        'title': page[1],
        'text': [page[2]],
        'wikilinks': [page[3]],
        'extlinks': [page[4]]
    })
csv = pd.DataFrame(temp,
                   columns=['id', 'title', 'text', 'wikilinks', 'extlinks'])

In [125]:
c.to_csv('test.csv', sep='\t', index=False)
d = pd.read_csv('test.csv', delimiter='\t', 
                converters={'text': lambda x: x.strip('[]').split(', '),
                            'wikilinks': lambda x: x.strip('[]').split(', '),
                            'extlinks': lambda x: x.strip('[]').split(', ')})
d

Unnamed: 0,id,title,text,wikilinks,extlinks
0,12,Anarchism,"['Anarchism', 'anti-authoritarian', 'political...","['File:WilliamGodwin.jpg', 'File:Bakunin.png',...",['http://www.britannica.com/eb/article-9117285...
1,25,Autism,"['Autism', 'developmental', 'disorder', 'chara...","['John Wiley & Sons', 'File:Single Chromosome ...",['https://www.nimh.nih.gov/health/topics/autis...
2,39,Albedo,"['thumb|upright=1.3|The', 'percentage', 'diffu...",['File:water reflectivity.jpg'],['http://web.cse.ohio-state.edu/~parent.1/clas...
3,290,A,"['A', 'named', 'plural', 'As', 'A', ""'s"", ""'s""...","['File:Cretan-1.jpg', 'Aleph', 'File:Cretan-1....",['https://books.google.com/books?id=n2QWAAAAYA...
4,303,Alabama,"['Alabama', 'state', 'southeastern', 'region',...","['Natural Bridge, Alabama', 'AT&T Inc.', 'AT&T...",['http://www.oed.com/view/Entry/248152?redirec...
5,305,Achilles,"['thumb|300px|Ancient', 'Greek', 'polychromati...",['File:The Education of Achilles 1862 Delacroi...,['http://epigraphy.packhum.org/inscriptions/se...
6,307,Abraham Lincoln,"['Abraham', 'Lincoln', 'February', '12', '1809...","['Sarah Bush Lincoln', 'William Wallace Lincol...",['https://quod.lib.umich.edu/j/jala/2629860.00...
7,308,Aristotle,"['Aristotle', 'Aristotélēs', '384–322', 'BC', ...","['State of matter', 'File:Scyliorhinus retifer...",['https://books.google.com/?id=ZB-rVxPvtPEC&pg...
8,309,An American in Paris,"['Themes', 'An', 'American', 'Paris', 'An', 'A...","['University of Michigan School of Music, Thea...",['http://www.kennedy-center.org/calendar/?fuse...
9,316,Academy Award for Best Production Design,"['The', 'Academy', 'Award', 'Best', 'Productio...",['Pride and Prejudice (2005 film)'],"['http://awardsdatabase.oscars.org/', 'https:/..."


In [127]:
q = d.loc[0, 'text'][0]
# c.loc[0, 'text'][0]

In [128]:
q.strip('\'')

'Anarchism'

In [144]:
# d.loc[0, 'extlinks'] = [word.strip('\'') for word in d.loc[0, 'extlinks']]
# d.loc[0, 'extlinks']

type([word.strip('\'') for word in d.loc[0, 'extlinks']])
type(d.loc[0, 'extlinks'])

list

In [None]:
i = 0
for line in bz2.BZ2File(wiki_dump):
    print(line)
    i += 1
    if i > 2e+2: break

In [160]:
np.random.choice([1, 2, 3])

3

In [161]:
np.random.choice([1, 2, 3])

2

In [None]:
test01 = open('clean-data/test01.csv', 'r')
i = 0
for line in test01:
    print(line)
    if i > 10: break 
    else: i+=1

In [27]:
test02 = open('test.csv', 'r')
i = 0
for line in test02:
    print(line)
    if i > 10: break 
    else: i+=1

309,An American in Paris,1278,Themes,An,American,Paris,An,American,Paris,jazz-influenced,orchestral,piece,American,composer,George,Gershwin,written,1928,It,inspired,time,Gershwin,spent,Paris,evokes,sights,energy,French,capital,1920s,Gershwin,composed,An,American,Paris,commission,conductor,Walter,Damrosch,He,scored,piece,standard,instruments,symphony,orchestra,plus,celesta,saxophones,automobile,horns,He,brought,back,Parisian,taxi,horns,New,York,premiere,composition,took,place,December,13,1928,Carnegie,Hall,Damrosch,conducting,New,York,Philharmonic.ALAN,GILBERT,AND,THE,NEW,YORK,PHILHARMONIC,MAKOTO,OZONE,To,Perform,GERSHWIN,â€™,S,RHAPSODY,IN,BLUE,One-Night-Only,Concert,All-American,Program,Also,To,Include,BERNSTEIN,â€™,Candide,Overture,Symphonic,Dances,West,Side,Story,GERSHWIN,'s,An,American,Paris,April,22,2014,nyphil.org,Accessed,June,20,2017,He,completed,orchestration,November,18,less,four,weeks,work,'s,premiere,He,collaborated,original,program,notes,critic,composer,Deems,Taylor,Backgro

UnicodeDecodeError: 'charmap' codec can't decode byte 0x8d in position 2355: character maps to <undefined>