In [None]:
# STEP 2
# In this notebook, the names of all authors are gathered from 'data/clean_dblp.xml',
# together with a count of how often each name occurs as an author.
# The resulting names are saved to 'data/names.csv'.

In [None]:
# >>> Import and parse clean_dblp.xml

import datetime
import xml.etree.ElementTree as ET
# Parse the XML input into an in-memory tree and expose its root element as `root`.
# An XMLParser created with default settings is passed explicitly to ET.parse below.
parser = ET.XMLParser()
    
# Relative path, so it is resolved against the notebook's working directory.
file = 'data/clean_dblp.xml'

# A timestamp is printed before and after ET.parse so the duration of the
# parse can be read from the cell output.
time = datetime.datetime.now()
print("Starting to parse XML file at {} ...".format(time))
tree = ET.parse(file, parser=parser)
time = datetime.datetime.now()
print("Finished parsing XML file at {} ! ".format(time))
# `root` is the element iterated by the author-extraction cell;
# len(root) is the number of its direct children.
root = tree.getroot()
print("Found {} entries! ".format(len(root)))

In [None]:
# >>> Get all authors and count their occurrences in a dictionary keyed by author name
def count_author_publications(entries):
    """Count how often each author name occurs across the given entries.

    Args:
        entries: Iterable of XML elements (here: the children of the parsed
            root). Only the direct <author> children of each entry are read.

    Returns:
        dict mapping author name -> {'name': <name>, 'n_publs': <count>}.
        A name is counted once per <author> element it appears in.
    """
    counts = {}
    for entry in entries:
        for author in entry.findall("author"):
            name = author.text
            # Element.text is None for an empty <author/> element (or one that
            # starts with a child element). Such authors carry no usable name
            # and would otherwise become a None key, i.e. a bogus nameless row
            # in the resulting dataframe / CSV.
            if not name:
                continue
            record = counts.get(name)
            if record is None:
                counts[name] = {'name': name, 'n_publs': 1}
            else:
                record['n_publs'] += 1
    return counts


print("Starting to extract author names...")

# Keyed by author name; consumed by the dataframe-conversion cell.
names_complete = count_author_publications(root)

print("Finished extracting author names! Found {} entries.".format(len(names_complete)))

In [None]:
# >>> Convert the dictionary of author records to a dataframe
# Dict to dataframe: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.from_dict.html

import pandas as pd

print("Converting dictionary of names to a dataframe...")

# One row per author (the dictionary keys become the index), extended with two
# placeholder columns: a gender label and a score.
# Note that the score placeholder is the string "0", not a number.
names = pd.DataFrame.from_dict(names_complete, orient='index').assign(
    likely_gender="not determined",
    score="0",
)

# Show only the first ten rows as a preview.
print("Dataframe completed! Here are entries: {}".format(names.head(10)))

In [None]:
# >>> Save Dataframe to names.csv
# Save Dataframe to CSV: https://riptutorial.com/pandas/example/19502/create-random-dataframe-and-write-to--csv

# Write the author dataframe to disk; the index is written as the first,
# unlabeled CSV column (pandas default).
print("Saving dataframe as CSV...")
names.to_csv(path_or_buf="data/names.csv")
print("Names saved as CSV!")