## 1. Parse all links from Wikipedia xml dump

In [1]:
import xml.etree.ElementTree as etree
import codecs
import csv
import time
import os
import re
import gzip
import pandas as pd
import numpy as np

In [2]:
PATH_WIKI = '/media/andrii/earth/Katia/CS_MasterThesis/data/ukwiki/'

In [3]:
WIKI_FILENAMES = []
for file in os.listdir(PATH_WIKI):
    if file.endswith("pages-meta-current.xml"):
        WIKI_FILENAMES.append(file)
print(WIKI_FILENAMES)

['ukwiki-20180920-pages-meta-current.xml']


In [4]:
ENCODING = "utf-8"


# Nicely formatted time string
def hms_string(sec_elapsed):
    h = int(sec_elapsed / (60 * 60))
    m = int((sec_elapsed % (60 * 60)) / 60)
    s = sec_elapsed % 60
    return "{}:{:>02}:{:>05.2f}".format(h, m, s)


def strip_tag_name(t):
    t = elem.tag
    idx = k = t.rfind("}")
    if idx != -1:
        t = t[idx + 1:]
    return t


totalCount = 0
articleCount = 0
redirectCount = 0
total_article_text_len = 0

In [7]:
### results in two csv.gz files - with articles and redirections

start_time = time.time()
regex_links = re.compile(r"\[\[(?P<article>(?!.*?\:).*?)(?:\{\{.*\}\})?(?:\|(?P<text>(?!.*?\:).*?))?\]\]")
for WikiXML in WIKI_FILENAMES:
    pathWikiXML = os.path.join(PATH_WIKI, WikiXML)
    pathArticles = os.path.join(PATH_WIKI, WikiXML+"_art.csv")
    pathArticlesRedirect = os.path.join(PATH_WIKI, WikiXML+"_red.csv")
    with codecs.open(pathArticles, "w", ENCODING) as articlesFH, \
    codecs.open(pathArticlesRedirect, "w", ENCODING) as redirectFH:
        articlesWriter = csv.writer(articlesFH, quoting=csv.QUOTE_MINIMAL)
        articlesWriter.writerow(['id', 'title', 'text_len', 'link_pos', 'link_val', 'link_txt']) 
        redirectWriter = csv.writer(redirectFH, quoting=csv.QUOTE_MINIMAL)
        redirectWriter.writerow(['id', 'title', 'redirect'])
        for event, elem in etree.iterparse(pathWikiXML, events=('start', 'end')):
            tname = strip_tag_name(elem.tag)

            if event == 'start':
                if tname == 'page':
                    title = ''
                    id = -1
                    redirect = ''
                    inrevision = False
                    ns = 0
                    article_text_len = 0
                    links = []
                elif tname == 'revision':
                    # Do not pick up on revision id's
                    inrevision = True
            else:
                if tname == 'title':
                    title = elem.text
                elif tname == 'id' and not inrevision:
                    id = int(elem.text)
                elif tname == 'redirect':
                    redirect = elem.attrib['title']
                elif tname == 'ns':
                    ns = int(elem.text)
                elif tname == 'page' and ns == 0:
                    totalCount += 1
                        
                    if len(redirect) == 0:
                        articleCount += 1
                        total_article_text_len += article_text_len
                        if len(links) == 0:
                            articlesWriter.writerow([id, title, article_text_len, 0, "", ""])
                        for link in links:
                            articlesWriter.writerow([id, title, article_text_len, link[0], link[1], link[2]])
                    else:
                        redirectCount += 1
                        redirectWriter.writerow([id, title, redirect])

                    if totalCount > 1 and (totalCount % 100000) == 0:
                        print("{:,}".format(totalCount))
                elif tname == 'text' and elem.text != None:
                    article_text_len = len(elem.text)
                    for match in regex_links.finditer(elem.text):    
                        link_pos = match.start()
                        link_title = match.group("article")
                        link_title = link_title.replace("&nbsp;", " ")
                        link_title = link_title.replace("&ndash;", "-")
                        link_title = link_title.replace("&mdash;", "â€”")
                        link_title = link_title.replace("%20", " ")
                        
                        link_text = match.group("text")
                        links.append((link_pos,link_title,link_text))


                elem.clear()
    
    
    elapsed_time = time.time() - start_time
    print("File processed: {}".format(WikiXML))
    print("Total pages: {:,}".format(totalCount))
    print("Article pages: {:,}".format(articleCount))
    print("Redirect pages: {:,}".format(redirectCount))
    print("Total article lenght: {:,}".format(total_article_text_len))
    print("Elapsed time: {}".format(hms_string(elapsed_time)))

elapsed_time = time.time() - start_time

print("Total pages: {:,}".format(totalCount))
print("Article pages: {:,}".format(articleCount))
print("Redirect pages: {:,}".format(redirectCount))
print("Total article lenght: {:,}".format(total_article_text_len))
print("Elapsed time: {}".format(hms_string(elapsed_time)))

1,300,000
1,400,000
1,500,000
1,600,000
1,700,000
1,800,000
1,900,000
2,000,000
2,100,000
2,200,000
2,300,000
2,400,000
2,500,000
File processed: ukwiki-20180920-pages-meta-current.xml
Total pages: 2,545,520
Article pages: 1,638,424
Redirect pages: 907,096
Total article lenght: 7,759,082,170
Elapsed time: 0:08:34.82
Total pages: 2,545,520
Article pages: 1,638,424
Redirect pages: 907,096
Total article lenght: 7,759,082,170
Elapsed time: 0:08:34.82


In [6]:
# results in two files:
# 1. "ukwiki-20180920-pages-meta-current.xml_red.csv"
# 2. "ukwiki-20180920-pages-meta-current.xml_art.csv"