In [None]:
# Tutorial on parsing an XML file: 
# "Processing XML in Python — ElementTree. A Beginner’s Guide." by Deepesh Nair, Sep 15, 2018,
# Published at https://towardsdatascience.com/processing-xml-in-python-elementtree-c8992941efd2 
# (Visited: 13.03.2019)

# Problem: XML is in ascii, so when parsing, I get "undefined entity" errors.
# Hint to solution: https://stackoverflow.com/questions/22920295/parse-xhtml-document-with-undefined-entity
# parser.entity['ouml'] = 'Ö'
# Further documentation used: https://docs.python.org/3.3/library/xml.html

import datetime
import xml.etree.ElementTree as ET
parser = ET.XMLParser(encoding='ASCII')

# Because dblp.xml is encoded in ASCII, I decode some special characters and letters from ISO-8859-1
special_chars = {
    'Auml': 'Ä',
    'Euml': 'Ë',
    'Iuml': 'Ï',
    'Ouml': 'Ö',
    'Uuml': 'Ü',
    
    'auml': 'ä',
    'euml': 'ë',
    'iuml': 'i',
    'ouml': 'ö',
    'uuml': 'ü',
    'yuml': 'ÿ',
    
    'Aacute': 'Á',
    'Eacute': 'É',
    'Iacute': 'Í',
    'Oacute': 'Ó',
    'Uacute': 'Ú',
    
    'Yacute': 'Ý',
    'aacute': 'á',
    'eacute': 'é',
    'iacute': 'í',
    'oacute': 'ó',
    'uacute': 'ú',
    'yacute': 'ý',
    
    'Agrave': 'À',
    'Egrave': 'È',
    'Igrave': 'Ì',
    'Ograve': 'Ò',
    'Ugrave': 'Ù',
    
    'agrave': 'à',
    'egrave': 'è',
    'igrave': 'ì',
    'ograve': 'ò',
    'ugrave': 'ù',
    
    'szlig': 'ß',
    
    'Atilde': 'Ã',
    'Ntilde': 'Ñ',
    'Otilde': 'Õ',
    
    'atilde': 'ã',
    'ntilde': 'ñ',
    'otilde': 'õ',
    
    'Ccedil': 'Ç',
    'ccedil': 'ç',
    
    'Acirc': 'Â',
    'Ecirc': 'Ê',
    'Icirc': 'Î',
    'Ocirc': 'Ô',
    'Ucirc': 'Û',
    
    'acirc': 'â',
    'ecirc': 'ê',
    'icirc': 'î',
    'ocirc': 'ô',
    'ucirc': 'û',
    
    'AElig': 'Æ',
    'aelig': 'æ',
    
    'Aring': 'Å',
    'aring': 'å',
    
    'Oslash': 'Ø',
    'oslash': 'ø',
    
    'ETH': 'Ð',
    'eth': 'ð',
    
    'thorn': 'þ',
    'THORN': 'Þ',
    
    'micro': 'µ',
    'times': '×',
    'reg': '®'
}

for key, val in special_chars.items():
    parser.entity[key] = val
    
# Path of the DBLP dump that is parsed below.
file = 'data/dblp.xml'
# Timestamp taken immediately before parsing starts.
time = datetime.datetime.now()
print("Starting to parse XML file at {} ...".format(time))
# Parse with the parser configured above so the registered entity names are known to it.
tree = ET.parse(file, parser=parser)
# Timestamp taken once ET.parse has returned.
time = datetime.datetime.now()
print("Finished parsing XML file at {} ! ".format(time))
root = tree.getroot()
# len(root) is the number of direct children of the root element.
print("Found {} entries! ".format(len(root)))

In [None]:
#root.tag

In [None]:
#root.attrib

In [None]:
#for child in root:
#    print(child.tag, child.attrib)

In [None]:
# Remove unneeded data
# https://dblp.org/faq/16154937.html

print("Starting to clean up data on publications...")

# NOTE: plain assignment only binds additional names to the same objects; no copy
# is made, so every change to `root` is visible through `root_copy` as well.
tree_copy, root_copy = tree, root

# Collects the elements that the following cells mark for removal.
children_to_remove = list()

In [None]:
# Remove publications we are not interested in: "www" entries mostly describe authors, but do not cover all authors
# https://dblp.org/faq/1474690.html

print("Searching for author entries to remove...")

# findall with a bare tag name returns the direct children of root that carry
# that tag, in document order.
children_to_remove.extend(root.findall("www"))

In [None]:
# Remove surveys, data and software entries (identified by their "publtype" attribute)
# NOTE(review): informal publications are not removed here, because "informal" is
# not among the categories below; confirm whether that is intended.

print("Searching for publication entries to remove...")

categories_to_remove = ["survey", "data", "software"]

# Element.get returns None when the attribute is absent; None is never a member
# of the list, so entries without a "publtype" are skipped.
children_to_remove.extend(
    entry for entry in root if entry.get("publtype") in categories_to_remove
)

print("Found {} elements to remove. Continuing to remove... ".format(len(children_to_remove)))

In [None]:
import sys

print(len(children_to_remove))

# Elements are matched by identity. The same element may have been marked more
# than once (for example after re-running a marking cell), and a set lookup keeps
# this a single pass over the root instead of one linear root.remove() scan per
# marked element.
ids_to_remove = {id(child) for child in children_to_remove}

n_before = len(root)
# Slice assignment replaces the root's children with the kept ones in one step.
root[:] = [child for child in root if id(child) not in ids_to_remove]

# Number of entries actually removed; marked elements that are no longer children
# of root (already removed earlier) simply do not count, instead of raising.
c = n_before - len(root)
print("Removed {} entries.".format(c))

print("Removed all entries marked for removal. {} entries left.".format(len(root)))

In [None]:
# Create the output file, or empty it if it already exists. The context manager
# closes the handle even if the write raises.
with open("data/clean_dblp.xml", "w") as text_file:
    text_file.write("")

In [None]:
# Save the cleaned xml
# `encoding` selects the output encoding; `xml_declaration` is only a flag that
# controls whether the <?xml ...?> header is written. Passing "UTF-8" as the flag
# left the encoding at its us-ascii default.
tree.write("data/clean_dblp.xml", encoding="UTF-8", xml_declaration=True, method="xml")
print("Saved XML in file.")

In [None]:
# Get all authors

import csv

# Maps each author name to the number of entries that list this author.
names_complete = {}

print("Starting to extract author names...")

for publication in root:
    for author in publication.findall("author"):
        author_name = author.text
        # dict.get supplies 0 the first time a name is seen.
        names_complete[author_name] = names_complete.get(author_name, 0) + 1

print("Finished extracting author names! Found {} entries.".format(len(names_complete)))

In [None]:
import pandas as pd

# Dict to dataframe: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.from_dict.html
print("Converting dictionary of names to a dataframe...")
# orient='index' turns every dictionary key (author name) into a row label;
# the counts become the single "n_publs" column.
names = pd.DataFrame.from_dict(
    names_complete,
    orient='index',
    columns=["n_publs"],
)
print("Dataframe completed! Here are random entries:")

In [None]:
# Draw ten rows at random; no random_state is passed here.
sample = names.sample(10)

print(sample)

In [None]:
print("Sorting names...")
# Highest publication count first; sort_values returns a new frame, so `names`
# itself keeps its order.
srtd_n_publs = names.sort_values("n_publs", ascending=False)
print("Names sorted!")

In [None]:
print("The most publishing authors are ...")
# https://stackoverflow.com/questions/15006298/how-to-preview-a-part-of-a-large-pandas-dataframe-in-ipython-notebook
# First ten rows of the frame sorted by descending publication count.
srtd_n_publs.head(n=10)

In [None]:
# Adding empty columns to dataframe
# https://stackoverflow.com/questions/16327055/how-to-add-an-empty-column-to-a-dataframe

# Both columns start out as an empty string in every row.
for new_column in ("gender", "score"):
    names[new_column] = ""

In [None]:
# NOTE: this whole cell is a single string literal, so none of the code inside it runs.
# NOTE(review): `for name in names` iterates over a DataFrame, which yields its
# column labels rather than the author names; fix before re-enabling this cell.
'''
# Python Documentation on reading & Writing CSV https://docs.python.org/3.5/library/csv.html

print("Entering names in CSV... ")

with open('data/names.csv', 'w', newline='') as csvfile:
    fieldnames = ['name', 'gender', 'certainty']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

    writer.writeheader()
    for name in names:
        writer.writerow({
            'name': name, 
            'gender': '', 
            'certainty': '0.0'
        })

print("Entered all names in a CSV!")
'''

In [None]:
print("Getting private key... ")
# Get private API Key for NamSor API v2 (contained in txt file)
key = ''

# Import personal key
# Surrounding whitespace (typically the file's trailing newline) is stripped so
# that it does not become part of the key value.
with open("key.txt", "r") as file:
    key = file.read().strip()

# An empty or whitespace-only file counts as "no key".
if key:
    print("Got private key.")
else:
    print("Could not find private key. Please check the file name and make sure you have an API key.")

In [None]:
# Trying out NamSor API v2 to get the gender of a name
# https://www.namsor.com/
# https://v2.namsor.com/NamSorAPIv2/apidoc.html
# using NamSor API v2 Python SDK
# https://github.com/namsor/namsor-python-sdk2
# licensed under GNU Affero General Public License v3.0

# Alternatives? https://genderize.io/ -> But only first name!

# Following script taken from https://github.com/namsor/namsor-python-sdk2 "Getting Started" 
# and adapted to keep key private and remove unnecessary lines.
# It tests the connection to the NamSor API

print("Testing NamSor API v2 connection...")

import openapi_client
from openapi_client.rest import ApiException

# Configure API key authorization: api_key
# `key` is the value read from key.txt in an earlier cell; it is stored under the
# 'X-API-KEY' entry of the configuration's api_key mapping.
configuration = openapi_client.Configuration()
configuration.api_key['X-API-KEY'] = key

In [None]:
# Build the AdminApi client on top of the shared configuration
api_client = openapi_client.ApiClient(configuration)
admin_api_instance = openapi_client.AdminApi(api_client)

try:
    # Ask the service for the current API usage.
    api_response = admin_api_instance.api_usage()
except ApiException as e:
    print("Exception when calling AdminApi->api_usage: %s\n" % e)
else:
    # Reached only when the call above did not raise.
    print(api_response)
    print("NamSor API v2 connection successfull!")

In [None]:
print("Getting gender of a name for testing... ")

import random

# Pick one row position at random. randrange excludes its upper bound, so the
# value is always a valid position in `names` (the unqualified `randint` used
# before was never imported and allowed len(names) itself).
randomInt = random.randrange(len(names))
# The author names are the row labels of `names`, so the label at that position
# is the name to test: a single value, not a one-element list.
testname = names.index[randomInt]

print("Chose to test {}. Continuing...".format(testname))

In [None]:
# Build the PersonalApi client on top of the shared configuration
personal_api_client = openapi_client.ApiClient(configuration)
pers_api_instance = openapi_client.PersonalApi(personal_api_client)

In [None]:
try:
    api_response = pers_api_instance.gender_full(testname)
except ApiException as e:
    # Name the endpoint that was actually called.
    print("Exception when calling PersonalApi->gender_full: %s\n" % e)
else:
    # Report a result only when the call succeeded; after a failure
    # `api_response` would be undefined or left over from an earlier cell.
    print(api_response)
    print(type(api_response))
    print("Name {} is {} with a chance of {}.".format(testname, api_response.likely_gender, abs(api_response.gender_scale)))

In [None]:
# Testing the API for a batch of names
# Take the first ten rows of the name table as the test batch.
tst_names = names.iloc[:10]

In [None]:
# Batch request object from the SDK, constructed here with no arguments.
batch_personal_name_in = openapi_client.BatchPersonalNameIn() # BatchPersonalNameIn | A list of personal names (optional)

In [None]:
# NOTE(review): `list_of_names` is not assigned in any cell visible here, so this
# cell raises NameError unless the name is defined elsewhere. Confirm.
list_of_names