# Requirements

In [None]:
!pip install regex==2019.4.14
!pip install spacy==2.1.3
!pip install editdistance==0.5.3
!pip install requests==2.22.0
!pip install StringBuilder==1.0.0a4
!pip install clint==0.5.1

# Download the dumps

In [10]:
import os
import requests
from clint.textui import progress
def download_file(url, chunk_size=512):
    """
    Download ``url`` into the current working directory and return the local file name.

    The local name is the last path component of ``url``. Data is streamed in
    blocks of ``chunk_size`` bytes; a progress bar is shown when the server
    announces a ``Content-Length``.

    url: address of the file to fetch
    chunk_size: number of bytes read per block (default 512, as before)
    Raises ``requests.HTTPError`` when the server answers with an error status.
    """
    local_filename = url.split('/')[-1]
    # Download to a temporary name first, so an interrupted transfer never leaves
    # a truncated file that a caller would mistake for a finished download.
    partial_filename = local_filename + '.part'

    with requests.get(url, stream=True) as response:
        # Fail loudly on 4xx/5xx instead of saving the error page as the dump.
        response.raise_for_status()

        chunks = response.iter_content(chunk_size=chunk_size)
        content_length = response.headers.get('content-length')
        if content_length is not None:
            # The progress bar needs the expected number of chunks; without a
            # Content-Length header we simply download without a bar.
            chunks = progress.bar(chunks,
                                  expected_size=(int(content_length) // chunk_size) + 1)

        with open(partial_filename, 'wb') as f:
            for chunk in chunks:
                if chunk:
                    f.write(chunk)

    os.replace(partial_filename, local_filename)
    return local_filename

In [12]:
# Choose the Wikipedia dump to work on by (un)commenting the matching URL.
# for ZH:
#infile = 'https://dumps.wikimedia.org/zhwiki/20210601/zhwiki-20210601-pages-meta-current.xml.bz2'
# for BG:
infile = 'https://dumps.wikimedia.org/bgwiki/20210601/bgwiki-20210601-pages-meta-current.xml.bz2'

infile = infile.rstrip()
# Local file name = last path component of the URL.
fname = os.path.basename(infile)

# Only fetch the dump when it is not already present in the working directory.
if os.path.exists(fname):
    print("File ", fname, " already downloaded!", sep='')
else:
    print("Downloading file : ", fname, sep='')
    download_file(infile)
    print("Download finished!")
print("Filtering...")

Downloading file : bgwiki-20210601-pages-meta-current.xml.bz2
Download finished!
Filtering...


# Filtering

Define the functions for extraction

In [22]:
import logging
import io
import xml.etree.ElementTree as ET
import re
import os
import bz2
from StringBuilder import StringBuilder

class POVProcessor(object):
    """
    Extract the text of Wikipedia pages from database-dump XML.

    Each call to ``extract`` receives the XML of one ``<page>`` element and
    appends one line to the output file: the page id, a TAB, and the text of the
    page's first ``<revision>`` with every run of whitespace collapsed to a
    single space.

    The instance owns two open file handles (``output`` and ``output_tags``).
    Call ``close()`` when done, or use the object as a context manager, so that
    buffered lines are flushed to disk.
    """

    # Kept for backward compatibility; not used by the code in this class.
    default_tags = 'NPOV\nEditorial'

    def __init__(self, enc: str, output_file: str, logfile: str):
        """
        enc: encoding used for the output files
        output_file: path of the tab-separated output; a second file named
            ``output_file + '.alltags.txt'`` is created next to it
        logfile: path of a log file receiving one entry per page title;
            pass an empty value to disable logging
        """
        self.output = open(output_file, 'w', encoding=enc)
        try:
            self.output_tags = open(output_file + '.alltags.txt', 'w', encoding=enc)
        except Exception:
            # Do not leak the first handle when the second file cannot be opened.
            self.output.close()
            raise
        self.enc = enc
        self.logit = False
        if logfile:
            logging.basicConfig(level=logging.INFO, handlers=[logging.FileHandler(logfile, 'w+', 'utf-8')])
            self.logit = True

    def close(self):
        """Flush and close both output files (safe to call more than once)."""
        for handle in (self.output, self.output_tags):
            handle.close()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()
        return False

    def normalize_ws(self, text: str):
        """Collapse every run of whitespace into one space and trim both ends."""
        return re.sub(r'\s+', ' ', text).strip()

    def extract(self, page: str):
        """
        Write the id and text of one page to the output file.

        page: XML source of a single ``<page>`` element
        Pages without an id, without text, or with whitespace-only text are
        skipped, so every written line has a non-empty text column.
        """
        if self.logit:
            logging.info("Page : " + page[page.find("<title>") + 7:page.find("</title>")])

        # Based on http://effbot.org/elementtree/iterparse.htm
        context = iter(ET.iterparse(io.StringIO(page), events=("start", "end")))
        _, root = next(context)

        for event, elem in context:
            if event != 'end' or elem.tag != 'page':
                continue
            page_id = elem.findtext('id')
            if not page_id:
                continue
            # findtext() yields None for a missing element and '' for an empty one,
            # so a page without text no longer produces the literal string "None".
            article = self.normalize_ws(elem.findtext('revision/text') or '')
            if article:
                # normalize_ws() already turned tabs and newlines into spaces.
                self.output.write(page_id + '\t' + article + '\n')
            root.clear()

Set the parameters

In [25]:
# Run configuration and mutable state shared with the extraction loop below.
enc = "UTF-8"  # encoding used to decode dump lines and to write the output files
report_freq = 1000  # print a progress message every `report_freq` processed pages
# Language code read by the sentence splitter and the cleaning functions.
# NOTE(review): the dump selected above is bgwiki while lang is 'en', so the
# "bg" branch of clean_wiki is not taken — confirm this is intended.
lang = 'en'
outputfile = 'revisions1.txt'  # tab-separated "<page id>\t<text>" lines are written here
pov = POVProcessor(enc, outputfile, fname + ".log")  # opens the output files for writing
cptPage = 0  # number of pages handed to pov.extract() so far
zip = bz2.BZ2File(fname)  # compressed dump, read line by line (note: shadows the builtin zip)
store = False  # True while the lines of the current <page> are being buffered
fullpage = StringBuilder()  # buffer holding the XML of the page being read

Extract all current Wikipedia pages

In [26]:
# Stream the compressed dump line by line, buffer the lines of every
# main-namespace (<ns>0</ns>) page and hand each complete <page> element to
# pov.extract(). The handles are closed afterwards so that all extracted lines
# are flushed to disk before the next stage reads the output file.
try:
    for line in zip:
        line = line.decode(enc)

        if store:
            fullpage.append(line)
            if line == "  </page>\n":
                # End of the current page: process the buffered XML.
                pov.extract(fullpage.to_string())
                fullpage = StringBuilder()
                store = False
                cptPage += 1
                if cptPage % report_freq == 0:
                    print(str(cptPage) + " pages processed...")

            elif line.startswith("    <ns>") and not line == "    <ns>0</ns>\n":
                # Other types of pages: drop the buffer and wait for the next <page>.
                fullpage = StringBuilder()
                store = False

        elif line == "  <page>\n":
            # Start buffering a new page.
            store = True
            fullpage = StringBuilder()
            fullpage.append(line)
finally:
    # Closing is idempotent; it releases the dump and flushes the extracted text.
    zip.close()
    pov.output.close()
    pov.output_tags.close()

1000 pages processed...
2000 pages processed...
3000 pages processed...
4000 pages processed...
5000 pages processed...
6000 pages processed...
7000 pages processed...
8000 pages processed...
9000 pages processed...
10000 pages processed...
11000 pages processed...
12000 pages processed...
13000 pages processed...
14000 pages processed...
15000 pages processed...
16000 pages processed...
17000 pages processed...
18000 pages processed...
19000 pages processed...
20000 pages processed...
21000 pages processed...
22000 pages processed...
23000 pages processed...
24000 pages processed...
25000 pages processed...
26000 pages processed...
27000 pages processed...
28000 pages processed...
29000 pages processed...
30000 pages processed...
31000 pages processed...
32000 pages processed...
33000 pages processed...
34000 pages processed...
35000 pages processed...
36000 pages processed...
37000 pages processed...
38000 pages processed...
39000 pages processed...
40000 pages processed...
41000 pag

# Sentence segmentation and pre-processing of the articles

Define the sentence-segmentation functions

In [28]:
import spacy
import argparse

# Build a blank spaCy pipeline with a rule-based sentencizer for every language
# except Chinese, which sbd() splits with a regular expression instead.
# Strings must be compared with != : `is not` tests object identity and only
# works by accident of string interning.
if lang != 'zh':
    nlp = spacy.blank(lang)
    nlp.max_length = 2000000  # allow very long articles
    sentencizer = nlp.create_pipe("sentencizer")
    nlp.add_pipe(sentencizer)


def sbd(string):
    """
    Split a string into sentences and return them as a list of strings.

    For Chinese (``lang == 'zh'``) the text is split on sentence-final
    punctuation and only pieces longer than 10 and shorter than 300 characters
    are kept. For every other language the spaCy pipeline ``nlp`` is used and
    only sentences longer than 5 and shorter than 300 tokens are kept.

    string: the text to segment
    """
    if lang == 'zh':
        # Raw-string character class: same delimiters as before, without the
        # invalid "\!" escape. The delimiters themselves are not returned, which
        # changes nothing because one-character pieces never pass the filter.
        sentences = re.split(r'[。！!.？?]', string)
        return [sent for sent in sentences if 10 < len(sent) < 300]

    doc = nlp(string)
    # NOTE(review): flag kept from the original code, presumably so that
    # doc.sents can be iterated without a parser — confirm it is still needed.
    doc.is_parsed = True
    return [str(sent) for sent in doc.sents if 5 < len(sent) < 300]

Define the functions for pre-processing (data cleaning)

In [29]:
import re
import regex
import argparse
# from utils import check_lang
def clean_wiki(text):
    """
    Strip wiki markup, HTML and URLs from ``text`` and replace numbers by a token.

    The substitutions are applied strictly in the order listed below. When the
    global ``lang`` is "bg", numbers become a Cyrillic token and Latin letters
    are removed before the generic number replacement runs.

    text: raw wikitext of one article
    Returns the cleaned text.
    """
    def apply_rules(value, rules):
        # Run each (pattern, replacement) pair in sequence.
        for pattern, replacement in rules:
            value = re.sub(pattern, replacement, value)
        return value

    markup_rules = (
        (r"''+", ''),  # wiki bold and double quotes
        (r'==+.+?==+', ''),  # wiki titles
        (r'<ref[^<]*<\/ref>', ''),  # references <ref...> ... </ref>
        (r'<!--[\s\S\n]*?-->', ''),  # html comments
        (r'<([A-Z][A-Z0-9]*)\b[^>]*>(.*?)</\1>', ''),  # <tag>...</tag>
        (r'<br />', ' '),
        (r'<[^>|<]*/>', ''),  # self-closing html tags
        (r'\[http:[^] ]*', '['),  # normal url, visible text preserved
        (r'\[\[[^\|\]]*\|', '[['),  # wiki url, visible text preserved
        (r'\[\[[^\]]*:[^\]|\|]*\]\]', ''),  # links to other languages & categories
        (r'https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*)', ''),  # bare URLs
    )
    text = apply_rules(text, markup_rules)

    # Nested {{templates}} need a recursive pattern, hence the `regex` module.
    text = regex.sub(r'\{\{((?>[^{}]+|(?R))*)\}\}', '', text)

    leftover_rules = (
        (r'\{[^\}]*\}', ''),  # {tables}
        (r'\[[^\[]*\|[^\]]*', ''),  # [image|titles|& other tags]
        (r'\}|\{', ''),  # trailing {}
        (r'\]|\[', ''),  # trailing []
        (r'<[^>]*>', ''),  # trailing HTML tags
        (r'&.{4,6};', ''),  # HTML unicode characters
        (r'\s\*', '.'),  # end list items with . and drop bullet points
    )
    text = apply_rules(text, leftover_rules)

    if lang.lower() == "bg":
        bulgarian_rules = (
            (r'\d+((,|\.)*\d+)*', 'НУМТКН '),  # numbers -> token in Cyrillic
            (r'[a-zA-Z]', ''),  # remove latin script
        )
        text = apply_rules(text, bulgarian_rules)

    final_rules = (
        (r'\d+((,|\.)*\d+)*', 'NUMTKN '),  # numbers -> token
        (r'[. ]{3,}', '. '),  # clean multiple .
        (r'\s{2,}', ' '),  # clean multiple spaces
        (r'!{10,}', ''),  # remove excessive exclamation points
    )
    return apply_rules(text, final_rules)

def clean_punct(text):
    """
    Replace punctuation and symbols in ``text`` by spaces and tidy the whitespace.

    The set of removed characters depends on the global ``lang``:
    - "fr": apostrophes (U+0027, U+2018, U+2019) are kept, guillemets are removed;
    - "zh": full-width CJK punctuation is removed, followed by the generic set;
    - any other language: the generic set only.
    Double quotes are always removed, repeated whitespace is collapsed and the
    result is stripped.

    text: the sentence to clean
    """
    language = lang.lower()
    if language == "fr":
        # French keeps its apostrophes, so the generic pass (which strips them)
        # must not run afterwards.
        text = re.sub(r"[.,\/#!?†‡•$%‰\^&\*;:{}|=–\-_—‗‾⁄`~′″‴‵‶‷()‚‛“”„‟‹›№«»]", ' ', text) # keep U+0027, U+2018 and U+2019 (=apostrophies)
    else:
        if language == "zh":
            punc = "！？｡。＂＃＄％＆＇（）＊＋，－／：；＜＝＞＠［＼］＾＿｀｛｜｝～｟｠｢｣､、〃《》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏."
            text = re.sub(r"[%s]+" %punc, ' ', text)
        text = re.sub(r"[.,\/#!?†‡•$%‰\^&\*;:{}|=–\-_—‗‾⁄`~′″‴‵‶‷()‘’‚‛“”„‟‹›'№]", ' ', text)  # remove punctuation and symbols
    text = re.sub(r'"', ' ', text)
    text = re.sub(r'\s{2,}', ' ', text)  # clean multiple spaces
    return text.strip()

def clean_no_chinese(text):
    """
    Return ``text`` unchanged when it holds more than 10 characters from the
    U+4E00-U+9FFF range (counted over the whole string), otherwise return ''.
    """
    han_runs = re.findall(r'[\u4e00-\u9fff]+', text)
    han_total = sum(len(run) for run in han_runs)
    return text if han_total > 10 else ''

Define the pipeline for pre-processing

In [30]:
def preprocess(revision):
    """
    Parse one "<id>TAB<text>" line into ``{'id': ..., 'text': ...}``.

    revision: a line of the extraction output
    Returns None when the stripped line does not consist of exactly two
    tab-separated fields.
    """
    fields = revision.strip().split('\t')
    if len(fields) != 2:
        return None
    page_id, page_text = fields
    return {'id': page_id, 'text': page_text}

def clean_text(dico):
    """
    Clean the wikitext of one article and split it into sentences.

    dico: mapping with the keys 'id' and 'text' (raw article text)
    Returns a new dict with the same 'id' and 'text' replaced by the list of
    sentences produced by sbd(clean_wiki(text)).
    """
    page_id = dico['id']
    sentences = list(sbd(clean_wiki(dico['text'])))
    return {'id': page_id, 'text': sentences}

def punct_diff(dico):
    """
    Normalise the sentences of one article.

    dico: mapping with the keys 'id' and 'text' (list of sentences)
    Returns a new dict with the same 'id'; every sentence is stripped of
    punctuation and lower-cased, then sentences outside the accepted length
    range are dropped. For Chinese, sentences without enough Chinese
    characters are replaced by an empty string.
    """
    page_id = dico['id']
    # strip punctuation and lower-case each sentence
    normalised = [clean_punct(sent).lower() for sent in dico['text']]
    # remove too short or too long sentences
    kept = filter_length(normalised)
    # blank out the sentences that do not contain Chinese
    if lang.lower() == 'zh':
        kept = [clean_no_chinese(sent) for sent in kept]
    return {'id': page_id, 'text': kept}

def filter_length(sents):
    """Keep the sentences whose length is strictly between 10 and 300 characters."""
    kept = []
    for candidate in sents:
        if 10 < len(candidate) < 300:
            kept.append(candidate)
    return kept

def process(input_file, stats=None, encoding='utf-8'):
    """
    Lazily run the pre-processing pipeline over every line of ``input_file``.

    input_file: path of the tab-separated "<id>TAB<text>" file to read
    stats: optional dict-like object; the number of lines that could not be
        parsed is accumulated under the key 'ignored, short'
    encoding: encoding of ``input_file`` (the extraction step writes UTF-8)

    Yields one dict per valid line with the keys 'id' and 'text' (list of
    cleaned sentences).
    """
    # A private counter is used when the caller does not supply one, so a
    # malformed line can no longer fail with a NameError on an undefined `stats`.
    counters = stats if stats is not None else {}
    with open(input_file, "r", encoding=encoding) as file:
        for line in file:
            # Parse each line once and reuse the result.
            revs = preprocess(line)
            if revs is None:
                counters['ignored, short'] = counters.get('ignored, short', 0) + 1
                continue
            revs = clean_text(revs)
            revs = punct_diff(revs)
            yield revs

In [None]:
# Run the whole pre-processing pipeline over the extracted pages and keep the
# results (one dict per page) in memory.
inputfile = 'revisions1.txt'
data = list(process(inputfile))

# Write the data to a text file

In [None]:
def to_text_file(data, output_file_path):
    """
    Write the strings in ``data`` to ``output_file_path``, one per line.

    The file is encoded as UTF-8, overwritten if it exists, and gets no
    newline after the last entry.
    """
    with open(output_file_path, 'w', encoding='utf-8') as target:
        joined_rows = '\n'.join(data)
        target.write(joined_rows)

In [None]:
# Flatten the per-page results into "<sentence>\t<page id>" rows, skipping
# sentences shorter than 5 characters, and write them to "<LANG>.txt".
sents = [
    sent + '\t' + entry['id']
    for entry in data
    for sent in entry['text']
    if len(sent) >= 5
]

to_text_file(sents, lang.upper() + '.txt')