In [1]:
import xml.etree.ElementTree as etree
import codecs
import csv
import time
import os
import re
import gzip
import bz2

In [2]:
# Directory that is scanned for the bz2-compressed Wikipedia XML dump files.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
PATH_WIKI_XML = '/media/andrii/earth/Katia/CS_MasterThesis/data/enwiki/en_xml_dump/'
# Directory that receives the parsed article / redirect CSV output.
PATH_WIKI_OUT = '/media/andrii/earth/Katia/CS_MasterThesis/data/enwiki/en_parsed/'

In [3]:
# Collect the name of every bz2-compressed dump archive in the input directory.
WIKI_FILENAMES = [
    entry for entry in os.listdir(PATH_WIKI_XML) if entry.endswith(".bz2")
]

In [4]:
# Display the archive names that were found in PATH_WIKI_XML.
WIKI_FILENAMES

['enwiki-20180920-pages-meta-current4.xml-p200511p352689.bz2',
 'enwiki-20180920-pages-meta-current25.xml-p35452816p36952816.bz2',
 'enwiki-20180920-pages-meta-current17.xml-p11539268p13039268.bz2',
 'enwiki-20180920-pages-meta-current6.xml-p565314p892912.bz2',
 'enwiki-20180920-pages-meta-current16.xml-p11018050p11539266.bz2',
 'enwiki-20180920-pages-meta-current27.xml-p57663462p58534506.bz2',
 'enwiki-20180920-pages-meta-current25.xml-p36952816p38067202.bz2',
 'enwiki-20180920-pages-meta-current25.xml-p33952816p35452816.bz2',
 'enwiki-20180920-pages-meta-current22.xml-p25427984p26823660.bz2',
 'enwiki-20180920-pages-meta-current11.xml-p3046514p3926861.bz2',
 'enwiki-20180920-pages-meta-current13.xml-p5040438p6197594.bz2',
 'enwiki-20180920-pages-meta-current26.xml-p39567203p41067203.bz2',
 'enwiki-20180920-pages-meta-current17.xml-p13039268p13693071.bz2',
 'enwiki-20180920-pages-meta-current20.xml-p20254736p21222156.bz2',
 'enwiki-20180920-pages-meta-current5.xml-p352690p565313.bz2',

In [5]:
# Number of dump archives that were found.
len(WIKI_FILENAMES)

55

In [6]:
def unpack(file_name):
    """Decompress a ``.bz2`` archive next to the original and return the new path.

    The output path is ``file_name`` with its trailing ``.bz2`` removed. Only
    the suffix is stripped; a ``.bz2`` occurring elsewhere in the path (for
    example in a directory name) is left alone. The archive itself is kept.

    Args:
        file_name: path of the bz2-compressed file; must end in ``.bz2``.

    Returns:
        Path of the decompressed file.

    Raises:
        ValueError: if ``file_name`` does not end in ``.bz2``. Without the
            suffix the output path would equal the input path, and opening it
            for writing would truncate the archive.
    """
    import shutil  # local import keeps this cell self-contained

    suffix = ".bz2"
    if not file_name.endswith(suffix):
        raise ValueError(
            "expected a path ending in '.bz2', got {!r}".format(file_name))
    file_name_new = file_name[:-len(suffix)]
    with bz2.open(file_name, 'rb') as f_in, open(file_name_new, 'wb') as f_out:
        # Chunked copy: memory use stays bounded even when the payload
        # contains very long lines.
        shutil.copyfileobj(f_in, f_out)
    return file_name_new

In [7]:
def pack_and_remove(file_name):
    """Compress ``file_name`` with bz2 and delete the uncompressed original.

    The archive is written to ``file_name + '.bz2'``; that path is returned.
    """
    packed_path = file_name + '.bz2'
    with open(file_name, 'rb') as raw:
        with bz2.open(packed_path, 'wb') as packed:
            for line in raw:
                packed.write(line)
    # The source is removed only after both files have been closed.
    os.remove(file_name)
    return packed_path

In [8]:
# Text encoding used when the CSV output files are opened for writing.
ENCODING = "utf-8"

# Nicely formatted time string
def hms_string(sec_elapsed):
    """Format a duration given in seconds as ``H:MM:SS.ss``.

    The value is rounded to whole hundredths of a second *before* it is split
    into fields, so a fraction that rounds up carries into the minutes and
    hours (59.999 -> ``0:01:00.00`` rather than ``0:00:60.00``).

    Args:
        sec_elapsed: duration in seconds (int or float); expected to be
            non-negative.

    Returns:
        The formatted string; hours are not zero-padded, minutes are padded
        to two digits and seconds to ``SS.ss``.
    """
    total_centis = int(round(sec_elapsed * 100))
    h, rem = divmod(total_centis, 60 * 60 * 100)
    m, centis = divmod(rem, 60 * 100)
    s = centis / 100.0
    return "{}:{:>02}:{:>05.2f}".format(h, m, s)


def strip_tag_name(t):
    """Return the tag name ``t`` without a leading ``{namespace}`` qualifier.

    Everything up to and including the last ``}`` is dropped; a tag that
    contains no ``}`` is returned unchanged.

    Args:
        t: tag string, e.g. ``"{http://example.org/ns}page"``.

    Returns:
        The local tag name, e.g. ``"page"``.
    """
    # Operate on the argument itself; the function must not depend on a
    # global ``elem`` being defined by the caller's loop.
    idx = t.rfind("}")
    if idx != -1:
        t = t[idx + 1:]
    return t


# Running totals updated by the parsing loop; all start at zero.
totalCount = articleCount = redirectCount = total_article_text_len = 0

In [9]:
# Parse every dump archive into two gzip-compressed CSV files:
#   <archive>_art.csv.gz  - one row per outgoing link of each article
#                           (a single row with empty link fields if none)
#   <archive>_red.csv.gz  - one row per redirect page
# Only pages in namespace 0 are written.

# Reset the running totals so that re-executing this cell starts from zero
# instead of adding to the counts left behind by a previous run.
totalCount = 0
articleCount = 0
redirectCount = 0
total_article_text_len = 0


def _gzip_and_remove(path):
    """Gzip ``path`` to ``path + '.gz'`` and delete the uncompressed original."""
    with open(path, 'rb') as f_in, gzip.open(path + '.gz', 'wb') as f_out:
        f_out.writelines(f_in)
    os.remove(path)


def _print_summary(elapsed):
    """Print the cumulative page counters and the given elapsed time."""
    print("Total pages: {:,}".format(totalCount))
    print("Article pages: {:,}".format(articleCount))
    print("Redirect pages: {:,}".format(redirectCount))
    print("Total article lenght: {:,}".format(total_article_text_len))
    print("Elapsed time: {}".format(hms_string(elapsed)))


start_time = time.time()
# NOTE(review): `.` does not match newlines, so each `(?!.*?\:)` lookahead
# rejects a match whenever a colon appears anywhere later on the same line,
# not only inside the link target/label. Confirm this is intended; the
# pattern is left unchanged so the output stays comparable with earlier runs.
regex_links = re.compile(r"\[\[(?P<article>(?!.*?\:).*?)(?:\{\{.*\}\})?(?:\|(?P<text>(?!.*?\:).*?))?\]\]")
for WikiXML in WIKI_FILENAMES:
    pathWikiBz2 = os.path.join(PATH_WIKI_XML, WikiXML)
    pathWikiXML = unpack(pathWikiBz2)
    pathArticles = os.path.join(PATH_WIKI_OUT, WikiXML+"_art.csv")
    pathArticlesRedirect = os.path.join(PATH_WIKI_OUT, WikiXML+"_red.csv")

    # Per-page state. Initialised for every file so that an 'end' event seen
    # before the first <page> start can never touch an undefined name.
    title = ''
    page_id = -1
    redirect = ''
    inrevision = False
    ns = 0
    article_text_len = 0
    links = []

    try:
        # newline="" is what the csv module expects of files it writes to.
        with open(pathArticles, "w", encoding=ENCODING, newline="") as articlesFH, \
                open(pathArticlesRedirect, "w", encoding=ENCODING, newline="") as redirectFH:
            articlesWriter = csv.writer(articlesFH, quoting=csv.QUOTE_MINIMAL)
            articlesWriter.writerow(['id', 'title', 'text_len', 'link_pos', 'link_val', 'link_txt'])
            redirectWriter = csv.writer(redirectFH, quoting=csv.QUOTE_MINIMAL)
            redirectWriter.writerow(['id', 'title', 'redirect'])
            for event, elem in etree.iterparse(pathWikiXML, events=('start', 'end')):
                tname = strip_tag_name(elem.tag)

                if event == 'start':
                    if tname == 'page':
                        # A new page begins: forget everything about the last one.
                        title = ''
                        page_id = -1
                        redirect = ''
                        inrevision = False
                        ns = 0
                        article_text_len = 0
                        links = []
                    elif tname == 'revision':
                        # Do not pick up on revision id's
                        inrevision = True
                    continue

                # event == 'end': the element's text and attributes are complete.
                if tname == 'title':
                    title = elem.text
                elif tname == 'id' and not inrevision:
                    page_id = int(elem.text)
                elif tname == 'redirect':
                    redirect = elem.attrib['title']
                elif tname == 'ns':
                    ns = int(elem.text)
                elif tname == 'page' and ns == 0:
                    totalCount += 1
                    total_article_text_len += article_text_len
                    if len(redirect) == 0:
                        articleCount += 1
                        if len(links) == 0:
                            # Articles without links still get one row.
                            articlesWriter.writerow([page_id, title, article_text_len, 0, "", ""])
                        for link_pos, link_title, link_text in links:
                            articlesWriter.writerow(
                                [page_id, title, article_text_len, link_pos, link_title, link_text])
                    else:
                        redirectCount += 1
                        redirectWriter.writerow([page_id, title, redirect])

                    if totalCount > 1 and (totalCount % 100000) == 0:
                        print("{:,}".format(totalCount))
                elif tname == 'text' and elem.text is not None:
                    article_text_len = len(elem.text)
                    for match in regex_links.finditer(elem.text):
                        link_title = (
                            match.group("article")
                            .replace("&nbsp;", " ")
                            .replace("&ndash;", "-")
                            .replace("&mdash;", "—")
                            .replace("%20", " ")
                        )
                        links.append((match.start(), link_title, match.group("text")))

                # Release the element's content once its end tag was handled.
                elem.clear()
    finally:
        # The .bz2 archive is still on disk, so the unpacked copy is only a
        # temporary working file; remove it even when parsing fails.
        if pathWikiXML != pathWikiBz2 and os.path.exists(pathWikiXML):
            os.remove(pathWikiXML)

    _gzip_and_remove(pathArticles)
    _gzip_and_remove(pathArticlesRedirect)

    elapsed_time = time.time() - start_time
    print("File processed: {}".format(WikiXML))
    _print_summary(elapsed_time)

elapsed_time = time.time() - start_time

_print_summary(elapsed_time)


File processed: enwiki-20180920-pages-meta-current4.xml-p200511p352689.bz2
Total pages: 83,431
Article pages: 49,105
Redirect pages: 34,326
Total article lenght: 901,000,754
Elapsed time: 0:02:36.78
100,000
200,000
300,000
400,000
File processed: enwiki-20180920-pages-meta-current25.xml-p35452816p36952816.bz2
Total pages: 447,965
Article pages: 200,231
Redirect pages: 247,734
Total article lenght: 1,753,971,854
Elapsed time: 0:09:21.83
500,000
600,000
700,000
800,000
File processed: enwiki-20180920-pages-meta-current17.xml-p11539268p13039268.bz2
Total pages: 834,646
Article pages: 372,737
Redirect pages: 461,909
Total article lenght: 2,696,607,618
Elapsed time: 0:15:39.40
900,000
1,000,000
File processed: enwiki-20180920-pages-meta-current6.xml-p565314p892912.bz2
Total pages: 1,011,646
Article pages: 450,906
Redirect pages: 560,740
Total article lenght: 3,732,392,156
Elapsed time: 0:18:59.53
1,100,000
File processed: enwiki-20180920-pages-meta-current16.xml-p11018050p11539266.bz2
Total

8,600,000
8,700,000
8,800,000
8,900,000
File processed: enwiki-20180920-pages-meta-current27.xml-p53163462p54663462.bz2
Total pages: 8,917,101
Article pages: 3,754,546
Redirect pages: 5,162,555
Total article lenght: 27,304,130,278
Elapsed time: 2:57:17.77
9,000,000
9,100,000
9,200,000
File processed: enwiki-20180920-pages-meta-current21.xml-p22722158p23927983.bz2
Total pages: 9,240,452
Article pages: 3,873,207
Redirect pages: 5,367,245
Total article lenght: 27,998,694,785
Elapsed time: 3:02:52.56
9,300,000
9,400,000
9,500,000
9,600,000
File processed: enwiki-20180920-pages-meta-current26.xml-p38067203p39567203.bz2
Total pages: 9,603,508
Article pages: 4,004,693
Redirect pages: 5,598,815
Total article lenght: 28,822,019,304
Elapsed time: 3:09:31.25
9,700,000
9,800,000
9,900,000
File processed: enwiki-20180920-pages-meta-current27.xml-p54663462p56163462.bz2
Total pages: 9,956,231
Article pages: 4,107,571
Redirect pages: 5,848,660
Total article lenght: 29,479,786,702
Elapsed time: 3:15:39