In [1]:
# Tutorial on parsing an XML file: 
# "Processing XML in Python — ElementTree. A Beginner’s Guide." by Deepesh Nair, Sep 15, 2018,
# Published at https://towardsdatascience.com/processing-xml-in-python-elementtree-c8992941efd2 
# (Visited: 13.03.2019)

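# Problem: dblp.xml is ASCII-only and writes non-ASCII letters as named entities (e.g. &ouml;),
# so parsing fails with "undefined entity" errors unless those entities are registered on the parser.
# Hint to solution: https://stackoverflow.com/questions/22920295/parse-xhtml-document-with-undefined-entity
# e.g. parser.entity['ouml'] = 'ö'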
# Further documentation used: https://docs.python.org/3.3/library/xml.html

import datetime
import xml.etree.ElementTree as ET
# Parser used for the DBLP dump. Passing `encoding` overrides whatever encoding
# the XML declaration of the file states.
parser = ET.XMLParser(encoding='ASCII')

# Because dblp.xml is encoded in ASCII, non-ASCII letters and symbols appear as
# named entities. This table maps each entity name (without '&' and ';') to the
# character it stands for, so the parser can resolve them.
special_chars = {
    # Diaeresis / umlaut
    'Auml': 'Ä',
    'Euml': 'Ë',
    'Iuml': 'Ï',
    'Ouml': 'Ö',
    'Uuml': 'Ü',

    'auml': 'ä',
    'euml': 'ë',
    'iuml': 'ï',  # was mapped to plain 'i', which silently dropped the diaeresis
    'ouml': 'ö',
    'uuml': 'ü',
    'yuml': 'ÿ',

    # Acute accent
    'Aacute': 'Á',
    'Eacute': 'É',
    'Iacute': 'Í',
    'Oacute': 'Ó',
    'Uacute': 'Ú',
    'Yacute': 'Ý',

    'aacute': 'á',
    'eacute': 'é',
    'iacute': 'í',
    'oacute': 'ó',
    'uacute': 'ú',
    'yacute': 'ý',

    # Grave accent
    'Agrave': 'À',
    'Egrave': 'È',
    'Igrave': 'Ì',
    'Ograve': 'Ò',
    'Ugrave': 'Ù',

    'agrave': 'à',
    'egrave': 'è',
    'igrave': 'ì',
    'ograve': 'ò',
    'ugrave': 'ù',

    # Sharp s
    'szlig': 'ß',

    # Tilde
    'Atilde': 'Ã',
    'Ntilde': 'Ñ',
    'Otilde': 'Õ',

    'atilde': 'ã',
    'ntilde': 'ñ',
    'otilde': 'õ',

    # Cedilla
    'Ccedil': 'Ç',
    'ccedil': 'ç',

    # Circumflex
    'Acirc': 'Â',
    'Ecirc': 'Ê',
    'Icirc': 'Î',
    'Ocirc': 'Ô',
    'Ucirc': 'Û',

    'acirc': 'â',
    'ecirc': 'ê',
    'icirc': 'î',
    'ocirc': 'ô',
    'ucirc': 'û',

    # Ligatures, ring, stroke
    'AElig': 'Æ',
    'aelig': 'æ',

    'Aring': 'Å',
    'aring': 'å',

    'Oslash': 'Ø',
    'oslash': 'ø',

    # Eth and thorn
    'ETH': 'Ð',
    'eth': 'ð',

    'thorn': 'þ',
    'THORN': 'Þ',

    # Symbols
    'micro': 'µ',
    'times': '×',
    'reg': '®'
}

# Register every entity on the parser in one step; `parser.entity` is the
# mapping the parser consults for entity names it does not know.
parser.entity.update(special_chars)

# Parse the complete XML dump into memory and keep its root element.
# A timestamp is printed immediately before and after the parse so the cell
# output shows how long it took.
file = 'data/dblp.xml'
start_message = "Starting to parse XML file at {} ..."
done_message = "Finished parsing XML file at {} !"

time = datetime.datetime.now()
print(start_message.format(time))

tree = ET.parse(file, parser=parser)

time = datetime.datetime.now()
print(done_message.format(time))

root = tree.getroot()

Starting to parse XML file at 2019-03-13 16:11:42.056032 ...
Finished parsing XML file at 2019-03-13 16:12:53.932315 !


In [None]:
#root.tag

In [None]:
#root.attrib

In [None]:
#for child in root:
#    print(child.tag, child.attrib)

In [2]:
# Get all authors

import csv

print("Starting to extract author names...")

# Walk the direct children of the root (presumably one publication record
# each) and gather the text of every <author> child, duplicates included.
names_complete = [
    author.text
    for record in root
    for author in record.findall("author")
]

print("Finished extracting author names! Found {} entries.".format(len(names_complete)))

Starting to extract author names...
Finished extracting author names! Found 15308210 names.


In [3]:
# Collapse the full author list to one entry per distinct name.
print("Removing duplicates...")
names = set(names_complete)
# Message previously misspelled "duplicates" as "duplocates".
print("Removed duplicates. Found {} individual authors.".format(len(names)))

Removing duplicates...
Removed duplocates. Found 2268612 authors.


In [12]:
# Python Documentation on reading & Writing CSV https://docs.python.org/3.5/library/csv.html

print("Entering names in CSV... ")

# Write one row per distinct author name, with an empty gender and a certainty
# of 0.0.
# encoding='utf-8' is set explicitly: the names contain non-ASCII characters,
# and the platform default encoding may be unable to represent some of them.
with open('data/names.csv', 'w', newline='', encoding='utf-8') as csvfile:
    fieldnames = ['name', 'gender', 'certainty']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

    writer.writeheader()
    # Sort the names so the file is identical between runs (iteration order of
    # a set of strings is not stable across interpreter sessions). The key
    # treats a possible None entry (an <author> element without text) as ''
    # so it stays sortable and is still written, as before.
    writer.writerows(
        {
            'name': name,
            'gender': '',
            'certainty': '0.0'
        }
        for name in sorted(names, key=lambda entry: entry or '')
    )

print("Entered all names in a CSV!")

Entering names in CSV... 
Entered all names in a CSV!
