In [1]:
import numpy as np
import pandas as pd
import bz2
import xml.sax
import mwparserfromhell
import os
import json
import nltk
import csv
from time import time
from itertools import chain
from multiprocessing import Pool
from multiprocessing.dummy import Pool as Threadpool
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

In [2]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\marku\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
punctuations = set(['.', ',', ';', ':', '?', '!', '#', '\\', '/', '"', '\'', '\'\'', '´´', '´', '``', '`', '(', ')'])
stop_words = set(stopwords.words('english'))
filters = punctuations.union(stop_words)

# Content handler for the XML parser

In [4]:
class WikiXMLHandler(xml.sax.handler.ContentHandler):
    """Content handler for Wiki XML data using SAX"""
    def __init__(self):
        xml.sax.handler.ContentHandler.__init__(self)
        self._buffer = None
        self._values = {}
        self._current_tag = None
        self._previous_tag = None
        self._pages = []
        self._skip_page = False
        self._punctuations = set(['.', ',', ';', ':', '?', '!', '#', '\\', '/', '"', '\'', '\'\'', '´´', '´', '``', '`', '(', ')'])
        self._stop_words = set(stopwords.words('english'))
        self._filter = self._punctuations.union(self._stop_words)
        
        
    def characters(self, content):
        """Characters between opening and closing tags"""
        if self._current_tag:
            self._buffer.append(content)
            
            
    def startElement(self, name, attrs):
        """Opening tag of element"""
        if name in ('id', 'title', 'text'):
            self._previous_tag = self._current_tag
            self._current_tag = name
            self._buffer = []
            
        
    def endElement(self, name):
        """Closing tag of element"""
        if name == self._current_tag:
            if name == 'text':
                if self._redirect():
                    self._skip_page = True
                    pass
                else:
                    self._skip_page = False
                self._process_page()
            elif name == 'id' and self._previous_tag == 'id':
                pass
            else:
                self._values[name] = ' '.join(self._buffer)
        if name == 'page':
            if not self._skip_page:
                self._pages.append((self._values['id'],
                                    self._values['title'],
                                    self._values['text'],
                                    self._values['wikilinks']))
                self._page_count = len(self._pages)
    
    
    def _redirect(self):
        wiki = mwparserfromhell.parse(self._buffer)
        text = wiki.strip_code().split()
        if len(text) == 0:
            return False
        return text[0] == 'REDIRECT' or text[0] == 'OMDIRIGERING'
    
    
    def _process_page(self):
        content = mwparserfromhell.parse(self._buffer)
        self._values['wikilinks'] = [x.title.strip_code() for x in content.filter_wikilinks()]
        content = mwparserfromhell.parse(content.strip_code().strip())
        self._values['text'] = list(filter(
            lambda word: word not in self._filter,
            word_tokenize(content.strip_code().strip())))

# Preprocessing

In [5]:
data_folder = 'D:/GitHub/DAT500-Project-Wiki/data_no/'
partitions = [data_folder + file for file in os.listdir(data_folder) if 'xml-p']
len(partitions), partitions[-1]

(1,
 'D:/GitHub/DAT500-Project-Wiki/data_no/nowiki-20190320-pages-articles-multistream.xml.bz2')

In [4]:
def preprocess_pages(data_path, save=True):
    """Finds and cleans all pages from a compressed wikipedia XML file"""
    start = time()
    # Object for handling xml
    handler = WikiXMLHandler()

    # Parsing object
    parser = xml.sax.make_parser()
    parser.setContentHandler(handler)

    # Iteratively process file
    i = 0
    file = bz2.BZ2File(data_path, 'r')
    for line in file:
        try:
            parser.feed(line)
        except StopIteration:
            break
        i += 1
        if i > 1e+4: break
    file.close()
    if save:
        temp = []
        for i, page in enumerate(handler._pages):
            bemp.append([])
            for j, item in enumerate(page):
                if j == 2:
                    b[i].append(j+1+len(item))
                    b[i].extend(item)
                elif j == 3:
                    b[i].insert(3, len(item))
                    b[i].extend(item)
                else:
                    b[i].append(item)

        csv = pd.DataFrame(temp)
        csv.to_csv('test.csv', index=False, header=False)
    
    end = time()
    print(f'\n{data_path} preprocessed in {round(end-start)} seconds')
    print(f'{handler._page_count} pages found in {data_path}')

In [None]:
start = time()
# Create a pool of workers to execute processes
pool = Pool(processes = 4)

# Map (service, task), applies function to each partition 
results = pool.map(preprocess_pages, partitions)

pool.close()
pool.join()
end = time()
print(f'\nWhole dump preprocessed in {round(end-start)} seconds')

# Testing

In [6]:
wiki_dump = 'data_no/nowiki-20190320-pages-articles-multistream.xml.bz2'
# wiki_dump = 'C:/data/enwiki-20190220-pages-articles-multistream1.xml-p10p30302.bz2'

In [7]:
# Object for handling xml
handler = WikiXMLHandler()
# Parsing object
parser = xml.sax.make_parser()
parser.setContentHandler(handler)
start = time()
# Parse the entire file
i = 0
file = bz2.BZ2File(wiki_dump)
for line in file:
    try:
        parser.feed(line)
        print(line)
    except StopIteration:
        break
    i += 1
    if i > 1e+4: break
file.close()
end = time()

print(f'\nSearched through {handler._page_count} pages')
print(f'\nIn {round(end-start)} seconds')

b'<mediawiki xmlns="http://www.mediawiki.org/xml/export-0.10/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.mediawiki.org/xml/export-0.10/ http://www.mediawiki.org/xml/export-0.10.xsd" version="0.10" xml:lang="nb">\n'
b'  <siteinfo>\n'
b'    <sitename>Wikipedia</sitename>\n'
b'    <dbname>nowiki</dbname>\n'
b'    <base>https://no.wikipedia.org/wiki/Portal:Forside</base>\n'
b'    <generator>MediaWiki 1.33.0-wmf.21</generator>\n'
b'    <case>first-letter</case>\n'
b'    <namespaces>\n'
b'      <namespace key="-2" case="first-letter">Medium</namespace>\n'
b'      <namespace key="-1" case="first-letter">Spesial</namespace>\n'
b'      <namespace key="0" case="first-letter" />\n'
b'      <namespace key="1" case="first-letter">Diskusjon</namespace>\n'
b'      <namespace key="2" case="first-letter">Bruker</namespace>\n'
b'      <namespace key="3" case="first-letter">Brukerdiskusjon</namespace>\n'
b'      <namespace key="4" case="first-letter">Wikipedia</

In [10]:
text = "The '''1974 European Judo Championships''' were the 23rd edition of the [[European Judo Championships]], and were held in [[London]], [[Great Britain]] on 5 May 1974.\n"

In [11]:
content = mwparserfromhell.parse(
    mwparserfromhell
    .parse(text)
    .strip_code()
    .strip())
te = list(filter(
    lambda word: word not in filters,
    word_tokenize(content.strip_code().strip())))
wiki = [x.title.strip_code() for x in content.filter_wikilinks()]

In [14]:
content = mwparserfromhell.parse(text)
[x.title.strip_code() for x in content.filter_wikilinks()]

['European Judo Championships', 'London', 'Great Britain']

In [19]:
a = handler._pages
# print(type(a), len(a))
# type(a[0]), type(a[1]), type(a[2]), type(a[3])

In [20]:
a[5]

('33952842',
 '1974 European Judo Championships',
 ['The',
  '1974',
  'European',
  'Judo',
  'Championships',
  '23rd',
  'edition',
  'European',
  'Judo',
  'Championships',
  'held',
  'London',
  'Great',
  'Britain',
  '5',
  'May',
  '1974',
  'Medal',
  'overview',
  '63',
  'kgSergey',
  'MelnichenkoDanny',
  'Da',
  'CostaMichel',
  'AlgisiShengeli',
  'Pitskhelauri70',
  'kgGünther',
  'KruegerValery',
  'DvoinikovEngelbert',
  'DörbandtGérard',
  'Gautier80',
  'kgJean-Paul',
  'CocheAntoni',
  'ReiterAdam',
  'AdamczykBob',
  'Debelius93',
  'kgGoran',
  'ZuvelaGünter',
  'NeureutherDavid',
  'StarbrookDietmar',
  'Lorenz93+',
  'kgGivi',
  'OnashviliChris',
  'DolmanRémi',
  'BerthetWolfgang',
  'ZueckschwerdtOpen',
  'classSergey',
  'NovikovShota',
  'ChochishviliImre',
  'VargaWolfgang',
  'Zueckschwerdt',
  'Medal',
  'table',
  'References',
  'Results',
  '1974',
  'European',
  'Judo',
  'Championships',
  'JudoInside.com',
  'E',
  'Category',
  'European',
  'Ju

In [157]:
len(a[1][2])

with open('test.csv', 'w', encoding='utf-8') as csvFile:
    writer = csv.writer(csvFile)
    for i, p in enumerate(a):
        b = []
        for j, s in enumerate(p):
            if j == 2:
                b.append(4+len(s))
                b.extend(s)
            elif j == 3:
                b.insert(3, len(s))
                b.extend(s)
            else:
                b.append(s)
        writer.writerow(b)
    csvFile.close()

d = []
f = open("test.csv", "r", encoding='utf-8')
for line in f:
    if len(line.strip()) > 0:
        d.append(line.strip().split(','))
len(d)

10

In [None]:
b[0][3:b[2]]
b[0][b[2]:]

In [62]:
# a

In [125]:
c.to_csv('test.csv', sep='\t', index=False)
d = pd.read_csv('test.csv', delimiter='\t', 
                converters={'text': lambda x: x.strip('[]').split(', '),
                            'wikilinks': lambda x: x.strip('[]').split(', '),
                            'extlinks': lambda x: x.strip('[]').split(', ')})
d

Unnamed: 0,id,title,text,wikilinks,extlinks
0,12,Anarchism,"['Anarchism', 'anti-authoritarian', 'political...","['File:WilliamGodwin.jpg', 'File:Bakunin.png',...",['http://www.britannica.com/eb/article-9117285...
1,25,Autism,"['Autism', 'developmental', 'disorder', 'chara...","['John Wiley & Sons', 'File:Single Chromosome ...",['https://www.nimh.nih.gov/health/topics/autis...
2,39,Albedo,"['thumb|upright=1.3|The', 'percentage', 'diffu...",['File:water reflectivity.jpg'],['http://web.cse.ohio-state.edu/~parent.1/clas...
3,290,A,"['A', 'named', 'plural', 'As', 'A', ""'s"", ""'s""...","['File:Cretan-1.jpg', 'Aleph', 'File:Cretan-1....",['https://books.google.com/books?id=n2QWAAAAYA...
4,303,Alabama,"['Alabama', 'state', 'southeastern', 'region',...","['Natural Bridge, Alabama', 'AT&T Inc.', 'AT&T...",['http://www.oed.com/view/Entry/248152?redirec...
5,305,Achilles,"['thumb|300px|Ancient', 'Greek', 'polychromati...",['File:The Education of Achilles 1862 Delacroi...,['http://epigraphy.packhum.org/inscriptions/se...
6,307,Abraham Lincoln,"['Abraham', 'Lincoln', 'February', '12', '1809...","['Sarah Bush Lincoln', 'William Wallace Lincol...",['https://quod.lib.umich.edu/j/jala/2629860.00...
7,308,Aristotle,"['Aristotle', 'Aristotélēs', '384–322', 'BC', ...","['State of matter', 'File:Scyliorhinus retifer...",['https://books.google.com/?id=ZB-rVxPvtPEC&pg...
8,309,An American in Paris,"['Themes', 'An', 'American', 'Paris', 'An', 'A...","['University of Michigan School of Music, Thea...",['http://www.kennedy-center.org/calendar/?fuse...
9,316,Academy Award for Best Production Design,"['The', 'Academy', 'Award', 'Best', 'Productio...",['Pride and Prejudice (2005 film)'],"['http://awardsdatabase.oscars.org/', 'https:/..."


In [127]:
q = d.loc[0, 'text'][0]
# c.loc[0, 'text'][0]

In [128]:
q.strip('\'')

'Anarchism'

In [144]:
# d.loc[0, 'extlinks'] = [word.strip('\'') for word in d.loc[0, 'extlinks']]
# d.loc[0, 'extlinks']

type([word.strip('\'') for word in d.loc[0, 'extlinks']])
type(d.loc[0, 'extlinks'])

list

In [None]:
i = 0
for line in bz2.BZ2File(wiki_dump):
    print(line)
    i += 1
    if i > 2e+2: break

In [160]:
np.random.choice([1, 2, 3])

3

In [161]:
np.random.choice([1, 2, 3])

2

In [138]:
d = []
f = open("clean-data/enwiki-20190220-pages-articles-multistream1.xml-p10p30302.bz2.csv", "r", encoding='utf-8')
i = 0
for line in f:
    if len(line.strip()) > 0:
        d.append(line.strip().split(','))
        if i > 100: break
        else: i += 1
len(d)

102

In [158]:
t = d[5]
t[int(t[2])-5:int(t[2])], t[int(t[2]):]

(['Category', 'LGBT', 'themes', 'Greek', 'mythology'],
 ['File:The Education of Achilles 1862 Delacroix.jpg',
  'wikt:Ἀχιλλεύς',
  'File:Peter Paul Rubens 181.jpg',
  'File:Tumulus of Achilles and sacrifice of Polyxena.jpg'])

In [151]:
m = [1,1,1,1]
m.insert(1, 2)
m

[1, 2, 1, 1, 1]

In [170]:
d[0][3], len(d[0][int(d[0][2]):]), len(d[0][:-int(d[0][3])])

('6', 10, 10456)

In [129]:
z = a[0]
x = []
x.append(z[0])
x.append(z[1])
x.append(3+len(z[2]))
x.extend(z[2])
x.extend(z[3])
len(x[x[2]:]), z[3]

(6,
 ['File:WilliamGodwin.jpg',
  'File:Bakunin.png',
  'Cantonal Revolution',
  'File:Lugi Gallean2.jpg',
  'File:ParcGuellOkupas.jpg',
  'Popular Indigenous Council of Oaxaca "Ricardo Flores Magón"'])

In [136]:
a[0][3], a[1][3], a[2][3], a[3][3], a[4][3], a[5][3], a[6][3], a[7][3], a[8][3], a[9][3], 

(['File:WilliamGodwin.jpg',
  'File:Bakunin.png',
  'Cantonal Revolution',
  'File:Lugi Gallean2.jpg',
  'File:ParcGuellOkupas.jpg',
  'Popular Indigenous Council of Oaxaca "Ricardo Flores Magón"'],
 ['John Wiley & Sons',
  'File:Single Chromosome Mutations.svg',
  'Vaccine controversy#Vaccine overload',
  'File:Autismbrain.jpg',
  'File:Powell2004Fig1A.jpeg',
  'File:Opening a window to the autistic brain.jpg',
  'File:Victor of Aveyron, 1800.jpg'],
 ['File:water reflectivity.jpg'],
 ['File:Cretan-1.jpg',
  'Aleph',
  'File:Cretan-1.jpg',
  'File:Semitic-A2.jpg',
  'File:Cretan-2.jpg',
  'File:Phoenician1a.jpg',
  'File:Semitic-2.jpg'],
 ['Natural Bridge, Alabama',
  'AT&T Inc.',
  'AT&T Inc.',
  'Brasfield & Gorrie',
  'BE&K',
  'U.S. News & World Report',
  'Interstate 65 in Alabama',
  'U.S. Route 11 in Alabama',
  'R.E."Bob" Woodruff Lake'],
 ['File:The Education of Achilles 1862 Delacroix.jpg',
  'wikt:Ἀχιλλεύς',
  'File:Peter Paul Rubens 181.jpg',
  'File:Tumulus of Achilles and

In [133]:
t[3:]

['Anarchism',
 'anti-authoritarian',
 'political',
 'philosophy',
 'advocates',
 'self-governed',
 'societies',
 'based',
 'voluntary',
 'cooperative',
 'institutions',
 'rejection',
 'hierarchies',
 'societies',
 'view',
 'unjust',
 'These',
 'institutions',
 'often',
 'described',
 'stateless',
 'societies',
 'In',
 'society',
 'developed',
 'lines',
 'voluntary',
 'associations',
 'already',
 'begin',
 'cover',
 'fields',
 'human',
 'activity',
 'would',
 'take',
 'still',
 'greater',
 'extension',
 'substitute',
 'state',
 'functions',
 'although',
 'several',
 'authors',
 'defined',
 'specifically',
 'distinct',
 'institutions',
 'based',
 'non-hierarchical',
 'free',
 'associations',
 'Anarchism',
 'holds',
 'capitalism',
 'state',
 'representative',
 'democracy',
 'undesirable',
 'unnecessary',
 'harmful',
 'While',
 'opposition',
 'state',
 'central',
 'Anarchists',
 'reject',
 'state',
 'see',
 'But',
 'claim',
 'central',
 'aspect',
 'anarchism',
 'definitive',
 'sell',
 'ana

In [132]:
z[2]

['Anarchism',
 'anti-authoritarian',
 'political',
 'philosophy',
 'advocates',
 'self-governed',
 'societies',
 'based',
 'voluntary',
 'cooperative',
 'institutions',
 'rejection',
 'hierarchies',
 'societies',
 'view',
 'unjust',
 'These',
 'institutions',
 'often',
 'described',
 'stateless',
 'societies',
 'In',
 'society',
 'developed',
 'lines',
 'voluntary',
 'associations',
 'already',
 'begin',
 'cover',
 'fields',
 'human',
 'activity',
 'would',
 'take',
 'still',
 'greater',
 'extension',
 'substitute',
 'state',
 'functions',
 'although',
 'several',
 'authors',
 'defined',
 'specifically',
 'distinct',
 'institutions',
 'based',
 'non-hierarchical',
 'free',
 'associations',
 'Anarchism',
 'holds',
 'capitalism',
 'state',
 'representative',
 'democracy',
 'undesirable',
 'unnecessary',
 'harmful',
 'While',
 'opposition',
 'state',
 'central',
 'Anarchists',
 'reject',
 'state',
 'see',
 'But',
 'claim',
 'central',
 'aspect',
 'anarchism',
 'definitive',
 'sell',
 'ana