In [None]:
import datetime
import xml.etree.ElementTree as ET
# Load the XML file at data/clean_dblp.xml fully into memory and report
# wall-clock timestamps before and after, since parsing may take a while.
file = 'data/clean_dblp.xml'
parser = ET.XMLParser()

started_at = datetime.datetime.now()
print("Starting to parse XML file at {} ...".format(started_at))
tree = ET.parse(file, parser=parser)
finished_at = datetime.datetime.now()
print("Finished parsing XML file at {} ! ".format(finished_at))

# len(root) counts the direct children of the document root.
root = tree.getroot()
n_entries = len(root)
print("Found {} entries! ".format(n_entries))

In [None]:
# Tally, per author name, how many <author> elements carry that name.
# Result shape: {name: {'name': name, 'n_publs': count}}, in first-seen order.
names_complete = {}

print("Starting to extract author names...")

for entry in root:
    # Each <author> child of an entry adds one to that author's count.
    for author_elem in entry.findall("author"):
        author_name = author_elem.text
        record = names_complete.get(author_name)
        if record is None:
            # First sighting of this name: start its counter at one.
            names_complete[author_name] = {'name': author_name, 'n_publs': 1}
        else:
            record['n_publs'] += 1

print("Finished extracting author names! Found {} entries.".format(len(names_complete)))

In [None]:
import pandas as pd

# Build one row per author (indexed by the dictionary key) and append two
# placeholder columns, both initialised to constant strings.
# Dict to dataframe: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.from_dict.html
print("Converting dictionary of names to a dataframe...")
names = (
    pd.DataFrame.from_dict(names_complete, orient='index', dtype=None)
    .assign(likely_gender="not determined", score="0")
)
print("Dataframe completed! Here are random entries:")

In [None]:
# Preview up to 10 randomly chosen rows of the author table.
# DataFrame.sample draws without replacement by default and raises ValueError
# when n exceeds the number of rows, so cap the request at the frame's length.
sample = names.sample(n=min(10, len(names)))

print(sample)

In [None]:
# Write the author table to disk; the index is written as the first column.
# Save Dataframe to CSV: https://riptutorial.com/pandas/example/19502/create-random-dataframe-and-write-to--csv
print("Saving dataframe as CSV...")
names.to_csv(path_or_buf="data/names.csv")
print("Names saved as CSV!")

In [None]:
# Order authors from highest to lowest publication count. sort_values returns
# a new frame, so `names` keeps its original row order.
print("Sorting names by the amount of publications ...")
srtd_n_publs = names.sort_values(by="n_publs", ascending=False)

In [None]:
print("The most publishing authors are ...")
# https://stackoverflow.com/questions/15006298/how-to-preview-a-part-of-a-large-pandas-dataframe-in-ipython-notebook
# Left as the cell's final bare expression so the notebook renders the
# first ten rows of the sorted frame.
srtd_n_publs.head(n=10)

In [None]:
# Persist the frame sorted by publication count. Write `srtd_n_publs`, not
# `names`: `names` is still in its original order, so saving it here would
# only produce a second copy of data/names.csv.
print("Saving sorted dataframe as CSV...")
srtd_n_publs.to_csv("data/names_srtd.csv")
print("Sorted names saved as CSV!")