# Wiki Dump Parser: wdp.py

For the kWiki project, the already existing "wiki dump parser" was adapted: https://github.com/Grasia/wiki-scripts/tree/master/wiki_dump_parser

Packages used:
  
**xml.parsers.expat:** https://docs.python.org/3/library/pyexpat.html  
**sys:** https://docs.python.org/3/library/sys.html  

In [None]:
#!/usr/bin/python3
# -*- coding: utf-8 -*-
"""
  wiki_dump_parser.py
  Script to convert a xml mediawiki history dump to a csv file with readable useful data
for pandas processing.
  Copyright 2017-2019 Abel 'Akronix' Serrano Juste <akronix5@gmail.com>
"""

"""
   adapted for use in the "kWiki" project by 'Media Group4 / BDS 19, Sem5'
   Nicolas Koch, Moritz Schlögel, Moritz Wieser, Nina Pasku
"""

import xml.parsers.expat
import sys
import re

from pyrsistent import l

__version__ = '2.0.2'

# Set to True to echo page ids and every revision row to standard output.
Debug = False

# Field separator of the generated CSV file.
csv_separator = "#"

def xml_to_csv(filename):
  """Convert a MediaWiki XML history dump into a CSV file, one row per revision.

  The output file name is `filename` with its last three characters replaced
  by "csv" (e.g. "dump.xml" -> "dump.csv"). Fields are joined with
  `csv_separator`. Free-text fields (page title, comment, user name, IP and
  revision text) are wrapped in '|' as quote character, and every newline
  and '|' occurring inside them is removed.

  Args:
    filename: path of the decompressed XML dump to read.

  Returns:
    True once the whole dump has been parsed and the CSV has been written.
    Errors raised by open() or by the expat parser are not caught here.
  """

  ### BEGIN xml_to_csv var declarations ###
  # Shared state for the parser callbacks below.
  _parent = None      # innermost open 'page' / 'revision' / 'contributor' element
  _current_tag = ''   # element whose text is being read; '' between elements
  _chunks = []        # pieces of character data received for _current_tag
  page_id = page_title = revision_id = timestamp = comment = contributor_id = contributor_name = bytes_var = revtext = ''
  # bds: added comment and revtext

  # bds: regex-filter for in-text characters that would break the CSV:
  # newlines (row delimiter) and '|' (used as quote character).
  unsafe_chars = re.compile('[\n|]')

  def quote(text):
    """Strip newlines and '|' from `text` and wrap the result in '|'."""
    return '|' + unsafe_chars.sub('', text) + '|'

  def has_empty_field(fields):
    """Return True if at least one entry of `fields` is the empty string."""
    return any(field == '' for field in fields)

  def start_tag(tag, attrs):
    nonlocal _current_tag, _parent, bytes_var

    _current_tag = tag
    # Anything collected so far is text between child elements of the
    # enclosing element, which is never stored.
    _chunks.clear()

    if tag == 'text':
      # Without a 'bytes' attribute there's a 'deleted' flag or no info
      # about the size of the edit; fall back to '1'.
      bytes_var = attrs.get('bytes', '1')
    elif tag in ('page', 'revision', 'contributor'):
      _parent = tag

    if tag == 'upload':
      print("!! Warning: '<upload>' element not being handled", file=sys.stderr)

  def data_handler(data):
    # Don't process blank "orphan" data between tags!!
    if _current_tag == '':
      return
    # The parser may deliver the text of one element in several pieces (it
    # does so whenever the text exceeds parser.buffer_size). Collect them
    # and build the field once the element is closed; assigning the field
    # here would keep only the last piece.
    _chunks.append(data)

  def store_field(tag, data):
    """Store the complete text `data` of element `tag` in the matching field."""
    nonlocal page_id, page_title, revision_id, timestamp, comment
    nonlocal contributor_id, contributor_name, revtext

    if _parent == 'page':
      if tag == 'title':
        page_title = quote(data)
      elif tag == 'id':
        page_id = data
        if Debug:
          print("Parsing page " + page_id )
    elif _parent == 'revision':
      if tag == 'id':
        revision_id = data
      elif tag == 'timestamp':
        timestamp = data
      elif tag == 'comment':
        comment = quote(data)
      elif tag == 'text':
        revtext = quote(data)
    elif _parent == 'contributor':
      if tag == 'id':
        contributor_id = data
      elif tag == 'username':
        contributor_name = quote(data)
      elif tag == 'ip':
        # Anonymous edits carry an IP instead of a user id.
        contributor_id = quote(data)

  def end_tag(tag):
    nonlocal _current_tag, _parent
    nonlocal revision_id, timestamp, comment, contributor_id, contributor_name, bytes_var, revtext

    # The element is complete: store its text before _parent is updated.
    if _chunks:
      store_field(_current_tag, ''.join(_chunks))
      _chunks.clear()

    # moving up one level of parent if any of these tags close
    if tag == 'page':
      _parent = None
    elif tag == 'revision':
      _parent = 'page'
    elif tag == 'contributor':
      _parent = 'revision'

    # print revision to revision output csv
    if tag == 'revision':

      revision_row = [page_id, page_title, revision_id, timestamp, comment, contributor_id, contributor_name, bytes_var, revtext]
      rev_row = [comment, contributor_name, revtext]

      # A row is written when it is complete, or when one of the fields that
      # may legitimately be missing (comment, user name, text) is empty.
      # Any other incomplete row is skipped and reported.
      if not has_empty_field(revision_row) or has_empty_field(rev_row):
        output_csv.write(csv_separator.join(revision_row) + '\n')
      else:
        print("The following line has incomplete info and therefore it's been removed from the dataset:")
        print(revision_row)

      # Debug lines to standard output
      if Debug:
        print(csv_separator.join(revision_row))

      # Clearing data that has to be recalculated for every row:
      revision_id = timestamp = comment = contributor_id = contributor_name = bytes_var = revtext = ''

    # Very important!!! Otherwise blank "orphan" data between tags would
    # still be attributed to the element that has just been closed.
    _current_tag = ''


  ### BEGIN xml_to_csv body ###

  # Initializing xml parser
  parser = xml.parsers.expat.ParserCreate()
  parser.StartElementHandler = start_tag
  parser.EndElementHandler = end_tag
  parser.CharacterDataHandler = data_handler
  parser.buffer_text = True
  parser.buffer_size = 1024

  # Both files are closed even if parsing fails half-way through.
  with open(filename, 'rb') as input_file, \
       open(filename[0:-3]+"csv", 'w', newline = '\n', encoding='utf8') as output_csv:

    # writing header for output csv file
    output_csv.write(csv_separator.join(["page_id","page_title","revision_id","timestamp","comment","contributor_id","contributor_name","bytes","revtext"]))
    output_csv.write("\n")

    # Parsing xml and writing processed data to output csv
    print("Processing...")
    parser.ParseFile(input_file)
    print("Done processing")

  return True


if __name__ == "__main__":
  # Every command-line argument after the script name is one dump to convert.
  dump_files = sys.argv[1:]
  if not dump_files:
    print("Error: Invalid number of arguments. Please specify one or more .xml file to parse", file=sys.stderr)
  else:
    print ('Dump files to process: {}'.format(dump_files))
    for dump_file in dump_files:
      print("Starting to parse file " + dump_file)
      # xml_to_csv returns True once the dump has been fully converted.
      converted = xml_to_csv(dump_file)
      if converted:
        print("Data dump {} parsed succesfully".format(dump_file))

## Reading the File

For reading in with the parser, the files are required in decompressed XML format.   
Decompression is possible using various free tools, such as 7Zip.   
For illustration purposes and to avoid long processing times, only the decompressed file `enwiki-p1p857.xml` is used in the following.

The biggest challenge in adapting the parser was that it was not created for processing text content. It was therefore necessary to find a way to bypass (replace) special characters in such a way that the content meaning of the revision texts did not suffer.   
We therefore decided to use the hash character (`#`) as the CSV separator and the pipe character (`|`) as the quote character. Line breaks and pipes occurring inside a quoted field are removed so that they cannot break the row or the quoting. 

To read the parser's output back in, small changes also had to be made:    
For example, the CSV could not be read correctly if the parameter `engine='python'` was not passed.   
Furthermore, due to the addition of the comments and the revision texts, the parameter `on_bad_lines='warn'` had to be used instead of the default `'error'`, so that a malformed line only produces a warning and is skipped instead of aborting the whole import. 

The finished code to run the parser then reads: 

In [None]:
import wdp as parser
# xml_to_csv writes its output under the input name with the last three
# characters replaced by "csv", so this call produces 'enwiki-p1p857.csv',
# the file that is loaded with pandas afterwards.
parser.xml_to_csv('enwiki-p1p857.xml')

Finally, the CSV file generated from the XML dump is read in with the specified parameters and can then be processed further as a pandas DataFrame. 

In [5]:
import pandas as pd

# The parser writes '#'-separated fields and wraps free-text fields in '|'.
df = pd.read_csv(
    'enwiki-p1p857.csv',
    sep='#',
    quotechar='|',
    engine='python',
    on_bad_lines='warn',
)
# Timestamps are stored as text such as 2001-01-21T02:12:21Z.
df['timestamp'] = pd.to_datetime(df['timestamp'], format='%Y-%m-%dT%H:%M:%SZ')
df.head()

Unnamed: 0,page_id,page_title,revision_id,timestamp,comment,contributor_id,contributor_name,bytes,revtext
0,10,AccessibleComputing,233192,2001-01-21 02:12:21,*,99,RoseParks,124,This subject covers* AssistiveTechnology* Acce...
1,10,AccessibleComputing,862220,2002-02-25 15:43:11,Automated conversion,1226483,Conversion script,35,#REDIRECT [[Accessible Computing]]
2,10,AccessibleComputing,15898945,2003-04-25 22:18:38,Fixing redirect,7543,Ams80,34,#REDIRECT [[Accessible_computing]]
3,10,AccessibleComputing,56681914,2006-06-03 16:55:41,fix double redirect,516514,Nzd,36,#REDIRECT [[Computer accessibility]]
4,10,AccessibleComputing,74466685,2006-09-08 04:16:04,cat rd,750223,Rory096,57,#REDIRECT [[Computer accessibility]] {{R from ...
