In [None]:
# STEP 2
# This notebook gathers the names of all authors from 'data/clean_dblp.xml'
# and saves the resulting names to 'data/names.csv'.

In [1]:
# >>> Load clean_dblp.xml into an in-memory element tree

import datetime
import xml.etree.ElementTree as ET

file = 'data/clean_dblp.xml'
parser = ET.XMLParser()

# Timestamp the start so the printed log shows how long parsing takes
time = datetime.datetime.now()
print("Starting to parse XML file at {} ...".format(time))

# Create an empty tree, then read the whole document into it
tree = ET.ElementTree()
tree.parse(file, parser=parser)

# Timestamp the end of parsing
time = datetime.datetime.now()
print("Finished parsing XML file at {} ! ".format(time))

# The number of direct children of the root is reported as the entry count
root = tree.getroot()
print("Found {} entries! ".format(len(root)))

Starting to parse XML file at 2019-04-24 09:36:03.631106 ...
Finished parsing XML file at 2019-04-24 09:36:58.595046 ! 
Found 4544385 entries! 


In [2]:
# >>> Count publications per author and collect the counts in a dictionary
# Resulting structure: {name: {'name': name, 'n_publs': count}}, one inner
# dictionary per distinct author name (the key is repeated inside the record
# so that it also becomes a regular column when converted to a dataframe).
names_complete = {}

print("Starting to extract author names...")

for child in root:
    # Every <author> tag directly below an entry counts as one publication for that name
    for author in child.findall("author"):
        name = author.text

        # Element.text is None when the tag carries no leading text (e.g. an
        # empty <author/>). Skip missing or blank names so that no None/blank
        # key ends up in the result.
        if name is None or not name.strip():
            continue

        entry = names_complete.get(name)
        if entry is None:
            # First time this name is seen
            names_complete[name] = {
                'name': name,
                'n_publs': 1
            }
        else:
            entry['n_publs'] += 1

print("Finished extracting author names! Found {} entries.".format(len(names_complete)))

Starting to extract author names...
Finished extracting author names! Found 2306418 entries.


In [3]:
# >>> Build a dataframe from the per-author dictionary
# Reference for DataFrame.from_dict: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.from_dict.html

import pandas as pd

print("Converting dictionary of names to a dataframe...")

# With orient='index' every key of names_complete becomes a row label and the
# keys of the inner dictionaries become columns. Two placeholder columns are
# appended with the same constant value in every row.
names = pd.DataFrame.from_dict(names_complete, orient='index', dtype=None).assign(
    likely_gender="not determined",
    score="0",
)

# Print the first ten rows as a quick look at the result
print("Dataframe completed! Here are entries: {}".format(names.head(10)))

Converting dictionary of names to a dataframe...
Dataframe completed! Here are entries:                                         name  n_publs   likely_gender score
'Maseka Lesaoana            'Maseka Lesaoana        2  not determined     0
(David) Jing Dai            (David) Jing Dai        1  not determined     0
(Max) Zong-Ming Cheng  (Max) Zong-Ming Cheng        2  not determined     0
(Sophy) Shu-Jiun Chen  (Sophy) Shu-Jiun Chen        2  not determined     0
(Zhou) Bryan Bai            (Zhou) Bryan Bai        2  not determined     0
A Clara Kanmani              A Clara Kanmani        1  not determined     0
A Lun                                  A Lun        1  not determined     0
A Min Tjoa                        A Min Tjoa      211  not determined     0
A S Akshaya                      A S Akshaya        1  not determined     0
A'ang Subiyakto              A'ang Subiyakto        2  not determined     0


In [4]:
# >>> Write the dataframe to data/names.csv
# Reference for saving a dataframe as CSV: https://riptutorial.com/pandas/example/19502/create-random-dataframe-and-write-to--csv

print("Saving dataframe as CSV...")
# No index argument is given, so pandas' default applies and the row labels
# are written as the first column of the file.
names.to_csv(path_or_buf="data/names.csv")
print("Names saved as CSV!")

Saving dataframe as CSV...
Names saved as CSV!
