In [1]:
# Imports
import bz2
from collections import defaultdict
import json
import os
import re
import time
import yaml

import mwparserfromhell
import mwxml

ModuleNotFoundError: No module named 'mwparserfromhell'

In [None]:
# Helpers that build the dataset labels by mapping each WikiProject name to its topic labels.
# See https://www.mediawiki.org/wiki/ORES#Topic_interest_mapping for more explanation.

def generate_wp_to_labels(wp_taxonomy):
    """Map every WikiProject name in the taxonomy to the set of its labels.

    Args:
        wp_taxonomy: nested dict whose keys are label components and whose
            values are either a list of WikiProject names or another dict of
            the same shape.

    Returns:
        A ``defaultdict(set)`` keyed by WikiProject name; each value holds the
        dot-joined label paths under which that name was listed.
    """
    wp_to_labels = defaultdict(set)
    for wikiproject_name, label in _invert_wp_taxonomy(wp_taxonomy):
        wp_to_labels[wikiproject_name].add(label)
    return wp_to_labels


def _invert_wp_taxonomy(wp_taxonomy, path=None):
    """Walk the taxonomy and yield ``(wikiproject_name, label)`` pairs.

    A label is the dot-joined chain of keys leading to the list that contains
    the WikiProject name. A key ending in ``*`` is a catch-all for its level:
    its label is given to its own WikiProjects and to every WikiProject listed
    directly in a sibling list at the same level (names nested inside sibling
    dicts are not included). If one level has several ``*`` keys, the last one
    seen provides the catch-all label.

    Args:
        wp_taxonomy: dict for the current level of the taxonomy.
        path: list of keys leading to this level, or None at the root.

    Yields:
        Tuples of (WikiProject name, dot-joined label). Catch-all pairs for a
        level are yielded after all other pairs of that level.
    """
    catch_all = None
    catch_all_wikiprojects = []
    for key, value in wp_taxonomy.items():
        path_keys = (path or []) + [key]
        if value is None:
            # A key with no children contributes no WikiProjects; without this
            # the extend()/items() calls below would raise on None.
            value = []
        # endswith() instead of key[-1] so an empty key does not raise IndexError
        if key.endswith("*"):
            # this is a catch-all
            catch_all = path_keys
            catch_all_wikiprojects.extend(value)
        elif isinstance(value, list):
            # Remember these names so a catch-all at this level can claim them too
            catch_all_wikiprojects.extend(value)
            for wikiproject_name in value:
                yield wikiproject_name, ".".join(path_keys)
        else:
            yield from _invert_wp_taxonomy(value, path=path_keys)
    if catch_all is not None:
        for wikiproject_name in catch_all_wikiprojects:
            yield wikiproject_name, ".".join(catch_all)

In [None]:
def get_wikitext(dump_fn, titles=None, print_every=10000):
    """Loop through a bz2-compressed XML dump and yield article text.

    Only pages in namespace 0 that are not redirects are considered.

    Args:
        dump_fn: path to the bz2-compressed dump file.
        titles: optional container of page titles; when given, only pages whose
            title is ``in`` it are yielded. None yields every article.
        print_every: print a progress line each time this many articles have
            been processed. 0 or None disables progress lines.

    Yields:
        ``(page.title, rev.text)`` where ``rev`` is the first revision obtained
        from iterating the page. Pages that have no revision are skipped.
    """
    found = 0
    processed = 0
    start = time.time()
    # The context manager closes the dump when the generator finishes or is
    # closed early. Decode explicitly as UTF-8 rather than the locale default.
    with bz2.open(dump_fn, 'rt', encoding='utf-8') as dump_file:
        dump = mwxml.Dump.from_file(dump_file)
        for page in dump:
            if page.namespace == 0 and page.redirect is None:  # article namespace and not redirect
                processed += 1
                if titles is None or page.title in titles:  # optional title filter
                    # A bare next() on a page without revisions would leak
                    # StopIteration into this generator (RuntimeError, PEP 479).
                    rev = next(page, None)
                    if rev is not None:
                        found += 1
                        yield page.title, rev.text
                # Guard against print_every=0/None to avoid a modulo-by-zero
                if print_every and processed % print_every == 0:
                    print("{0} articles processed. {1} retained. {2:.1f} seconds passed.".format(processed, found, time.time() - start))

    print("Complete: {0} articles processed. {1} retained. {2:.1f} seconds passed.".format(processed, found, time.time() - start))

In [None]:
# Note: this may need to be updated with language-specific patterns -- e.g., alternative spellings of Category
# Every pattern is a raw string so regex escapes such as \[ or \d reach the
# regex engine untouched and do not trigger Python's invalid-escape warnings.
common_forbidden_patterns = [
    r"<!--.*?-->",  # this would remove any commented-out text, which is rare but sometimes substantial
    r"\[\[category:.*?\]\]",  # remove categories
    r"{{.*?}}",  # remove template text
    r"&amp;",
    r"&lt;",
    r"&gt;",
    r"<ref[^<]*<\/ref>",  # remove references
    r"<[^>]*>",
    r"\|left",
    r"\|\d+px",
    r"(https|http)?:\/\/(\w|\.|\/|\?|\=|\&|\%)*\b",
    r"\|thumb",
    r"\|right",
    r"\[\[image:[^\[\]]*",
    r"\[\[category:([^|\]]*)[^]]*\]\]",
    r"\[\[[a-z\-]*:[^\]]*\]\]",
    r"\[",
    r"\]",
    r"\{[^\}]*\}",
    r"\n",
    r" +",
    r"[^a-zA-Z0-9 ]",
    r"\b[a-zA-Z]\b",
]

# Pre-compile once, in the same order as the list above.
compiled_patterns = [re.compile(pattern) for pattern in common_forbidden_patterns]