In [None]:
# 1. STEP
# In this notebook, the DBLP data dump ('data/dblp-2019-04-01.xml') is parsed and cleaned.
# The resulting XML file is saved as 'data/clean_dblp.xml'.

In [None]:
# >>> Parse the DBLP XML File

# Tutorial on parsing an XML file: 
# "Processing XML in Python — ElementTree. A Beginner’s Guide." by Deepesh Nair, Sep 15, 2018,
# Published at https://towardsdatascience.com/processing-xml-in-python-elementtree-c8992941efd2 
# (Visited: 13.03.2019)

import datetime
import xml.etree.ElementTree as ET
# The parser is forced to treat the input as ASCII; every non-ASCII character
# in the dump is therefore expected to arrive as a named entity (e.g. &ouml;).
parser = ET.XMLParser(encoding='ASCII')

# Problem: The DBLP XML is in ASCII and uses named entities, so parsing it with
# ET without registering those entities on the parser raises "undefined entity"
# errors.
# Solution: Map the named entities for special characters and letters from
# ISO-8859-1 to their Unicode characters and register them on the parser.
# Hint to solution: https://stackoverflow.com/questions/22920295/parse-xhtml-document-with-undefined-entity
# e.g. parser.entity['ouml'] = 'ö'
# Further documentation used: https://docs.python.org/3.3/library/xml.html
special_chars = {
    # Diaeresis / umlaut
    'Auml': 'Ä',
    'Euml': 'Ë',
    'Iuml': 'Ï',
    'Ouml': 'Ö',
    'Uuml': 'Ü',

    'auml': 'ä',
    'euml': 'ë',
    'iuml': 'ï',  # must keep the diaeresis, like every other *uml entity
    'ouml': 'ö',
    'uuml': 'ü',
    'yuml': 'ÿ',

    # Acute accent
    'Aacute': 'Á',
    'Eacute': 'É',
    'Iacute': 'Í',
    'Oacute': 'Ó',
    'Uacute': 'Ú',

    'Yacute': 'Ý',
    'aacute': 'á',
    'eacute': 'é',
    'iacute': 'í',
    'oacute': 'ó',
    'uacute': 'ú',
    'yacute': 'ý',

    # Grave accent
    'Agrave': 'À',
    'Egrave': 'È',
    'Igrave': 'Ì',
    'Ograve': 'Ò',
    'Ugrave': 'Ù',

    'agrave': 'à',
    'egrave': 'è',
    'igrave': 'ì',
    'ograve': 'ò',
    'ugrave': 'ù',

    'szlig': 'ß',

    # Tilde
    'Atilde': 'Ã',
    'Ntilde': 'Ñ',
    'Otilde': 'Õ',

    'atilde': 'ã',
    'ntilde': 'ñ',
    'otilde': 'õ',

    # Cedilla
    'Ccedil': 'Ç',
    'ccedil': 'ç',

    # Circumflex
    'Acirc': 'Â',
    'Ecirc': 'Ê',
    'Icirc': 'Î',
    'Ocirc': 'Ô',
    'Ucirc': 'Û',

    'acirc': 'â',
    'ecirc': 'ê',
    'icirc': 'î',
    'ocirc': 'ô',
    'ucirc': 'û',

    # Ligatures, rings, strokes and other letters
    'AElig': 'Æ',
    'aelig': 'æ',

    'Aring': 'Å',
    'aring': 'å',

    'Oslash': 'Ø',
    'oslash': 'ø',

    'ETH': 'Ð',
    'eth': 'ð',

    'thorn': 'þ',
    'THORN': 'Þ',

    # Symbols
    'micro': 'µ',
    'times': '×',
    'reg': '®'
}

# Register every mapping on the parser in one step; parser.entity is the
# dictionary the parser consults when it meets a named entity.
parser.entity.update(special_chars)
    
# Path of the raw DBLP dump, relative to the notebook's working directory.
file = 'data/dblp-2019-04-01.xml'

# Timestamp printed before the parse so its duration can be read off the log.
time = datetime.datetime.now()
print("Starting to parse XML file at {} ...".format(time))

# Build the tree wrapper first, then load the file into it with the
# entity-aware parser; ElementTree.parse() hands back the root element.
tree = ET.ElementTree()
root = tree.parse(file, parser=parser)

time = datetime.datetime.now()
print("Finished parsing XML file at {} ! ".format(time))

# Each direct child of the root counts as one entry.
print("Found {} entries! ".format(len(root)))

In [None]:
# >>> Remove unneeded data
# https://dblp.org/faq/16154937.html

print("Starting to clean up data on publications...")

# NOTE: plain assignment only binds extra names to the SAME objects -- no copy
# is made, so any change to `root` is visible through `root_copy` as well.
tree_copy, root_copy = tree, root

# Collects the elements that the following cells mark for removal.
children_to_remove = list()

In [None]:
# Mark all <www> records for removal: we are not interested in them, since they
# mostly describe authors (and do not even cover all authors).
# https://dblp.org/faq/1474690.html

print("Searching for author entries to remove...")

# findall() with a plain tag name returns the matching direct children of the
# root in document order.
children_to_remove.extend(root.findall("www"))

In [None]:
# Remove all informal publications, surveys, data, software
# This is commented out for now, since it will first be included and then filtered later on
# NOTE(review): the disabled code below only lists "survey", "data" and
# "software" as publtype values -- informal publications are not in the list.
# NOTE: the code is disabled by wrapping it in a string literal; as the last
# expression of its cell, that string is echoed as the cell's output when run.
'''
print("Searching for publication entries to remove...")

categories_to_remove = ["survey", "data", "software"]

for child in root:
    pt = child.get("publtype")
    if(pt is not None and pt in categories_to_remove):
        children_to_remove.append(child)
'''

In [None]:
# Report how many elements were marked before the removal pass starts.
print(
    "Found {} elements to remove. Continuing to remove... "
    .format(len(children_to_remove))
)

In [None]:
import sys

# Drop every marked element from the root in a single pass.
#
# Calling root.remove(child) once per element searches and shifts the child
# list each time, which becomes quadratic when millions of entries are marked.
# Filtering the child list once is linear, cannot fail for elements that are
# already gone (so no exception needs to be swallowed), and makes the cell safe
# to re-run.
entries_before = len(root)

# Match by object identity: the marked elements are the very objects stored in
# the tree, and children_to_remove keeps them alive, so their ids stay unique.
ids_to_remove = {id(child) for child in children_to_remove}

# Slice assignment replaces the root's children in place, so `tree` and every
# other reference to `root` see the cleaned list.
root[:] = [child for child in root if id(child) not in ids_to_remove]

# `c` holds the number of entries actually removed by this run.
c = entries_before - len(root)
print("Removed {} entries in total.".format(c))

print("Removed all entries marked for removal. {} entries left.".format(len(root)))

In [None]:
# >>> Save cleaned XML

print("Preparing to save... ")

# Create (or truncate) the output file before the actual save; an unwritable
# path therefore raises here. The context manager closes the handle even if
# the write fails.
with open("data/clean_dblp.xml", "w") as text_file:
    text_file.write("")

print("Ready to save.")

In [None]:
# Save to 'clean_dblp.xml'
print("Saving... ")
# `xml_declaration` is a boolean flag, not an encoding name. The output
# encoding is controlled by `encoding`, whose default is US-ASCII (non-ASCII
# characters are written as numeric character references). Both are spelled
# out here so the call says what it does; the bytes written are the same as
# with the previous truthy `xml_declaration` value and default encoding.
# NOTE(review): if a real UTF-8 file is wanted, switch to encoding="UTF-8" --
# first confirm that the notebooks reading 'clean_dblp.xml' can handle it.
tree.write(
    "data/clean_dblp.xml",
    encoding="us-ascii",
    xml_declaration=True,
    method="xml",
)
print("Saved XML in file \"data/clean_dblp.xml\".")