In [1]:
import datetime
import xml.etree.ElementTree as ET
# Load the whole XML file into an in-memory element tree.
# `root` is the top-level element; its direct children are the entries
# counted in the final message.
file = 'data/clean_dblp.xml'
parser = ET.XMLParser()

# Log a timestamp immediately before and after the parse so the two
# messages bracket how long it took.
time = datetime.datetime.now()
print("Starting to parse XML file at {} ...".format(time))

tree = ET.parse(file, parser=parser)

time = datetime.datetime.now()
print("Finished parsing XML file at {} ! ".format(time))

root = tree.getroot()
print("Found {} entries! ".format(len(root)))

Starting to parse XML file at 2019-04-11 11:42:00.761575 ...
Finished parsing XML file at 2019-04-11 11:42:56.931092 ! 
Found 4544385 entries! 


In [17]:
# Tally how often each author name occurs across all entries.
# names_complete maps author text -> {'name': <text>, 'n_publs': <count>}.
names_complete = {}

print("Starting to extract author names...")

for entry in root:
    # Only direct <author> children of the entry are considered.
    for author_elem in entry.findall("author"):
        author_name = author_elem.text
        record = names_complete.get(author_name)
        if record is None:
            # First occurrence of this name: create its record.
            names_complete[author_name] = {
                'name': author_name,
                'n_publs': 1
            }
        else:
            record['n_publs'] += 1

print("Finished extracting author names! Found {} entries.".format(len(names_complete)))

Starting to extract author names and publications...
Finished extracting author names! Found 2306418 entries.


In [25]:
import pandas as pd

# Turn the per-author dictionary into a dataframe: one row per dictionary
# key (orient='index'), one column per field of the inner records.
# Reference: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.from_dict.html
print("Converting dictionary of names to a dataframe...")
names = pd.DataFrame.from_dict(names_complete, orient='index').assign(
    # Placeholder columns, filled with the same constant for every row.
    likely_gender="not determined",
    score="0",
)
print("Dataframe completed! Here are random entries:")

Converting dictionary of names to a dataframe...
Dataframe completed! Here are random entries:


In [26]:
# Preview ten randomly chosen rows of the names dataframe.
# A fixed random_state makes the preview reproducible: without it, every
# kernel restart / re-run would print a different set of rows.
sample = names.sample(n=10, random_state=42)

print(sample)

                                        name  n_publs   likely_gender score
Sui Wei                              Sui Wei        6  not determined     0
Servando Espejo              Servando Espejo        6  not determined     0
Rajakrishnan Rajkumar  Rajakrishnan Rajkumar       11  not determined     0
L. Y. Wang                        L. Y. Wang        3  not determined     0
Xiubin Zhuang                  Xiubin Zhuang        1  not determined     0
Woo-Cheol Cho                  Woo-Cheol Cho        2  not determined     0
Nancy M. Salbach            Nancy M. Salbach        1  not determined     0
Shun-Te Tseng                  Shun-Te Tseng        3  not determined     0
Matthew Kauer                  Matthew Kauer        2  not determined     0
Xun Hu                                Xun Hu        3  not determined     0


In [27]:
# Write the names dataframe to disk as CSV (the index is written as the first column).
# Reference: https://riptutorial.com/pandas/example/19502/create-random-dataframe-and-write-to--csv
print("Saving dataframe as CSV...")
names.to_csv(path_or_buf="data/names.csv")
print("Names saved as CSV!")

Saving dataframe as CSV...
Names saved as CSV!


In [28]:
# Order authors from most to fewest publications; sort_values returns a
# new dataframe, so `names` itself keeps its original order.
print("Sorting names by the amount of publications ...")
srtd_n_publs = names.sort_values(by="n_publs", ascending=False)

Sorting names by the amount of publications ...


In [29]:
# Show the first ten rows of the sorted dataframe, i.e. the authors with
# the highest publication counts.
# Reference: https://stackoverflow.com/questions/15006298/how-to-preview-a-part-of-a-large-pandas-dataframe-in-ipython-notebook
print("The most publishing authors are ...")
srtd_n_publs.head(n=10)

The most publishing authors are ...


Unnamed: 0,name,n_publs,likely_gender,score
H. Vincent Poor,H. Vincent Poor,1694,not determined,0
Mohamed-Slim Alouini,Mohamed-Slim Alouini,1307,not determined,0
Philip S. Yu,Philip S. Yu,1253,not determined,0
Wei Wang,Wei Wang,1200,not determined,0
Wei Zhang,Wei Zhang,1189,not determined,0
Lajos Hanzo,Lajos Hanzo,1183,not determined,0
Wen Gao 0001,Wen Gao 0001,1174,not determined,0
Wei Li,Wei Li,1138,not determined,0
Yu Zhang,Yu Zhang,1098,not determined,0
Yang Liu,Yang Liu,1092,not determined,0


In [None]:
# Write the dataframe sorted by publication count to its own CSV file.
# The sorted frame is `srtd_n_publs`; `names` is the unsorted frame, and
# writing it here would produce a file whose contents do not match its
# name or the log messages.
print("Saving sorted dataframe as CSV...")
srtd_n_publs.to_csv("data/names_srtd.csv")
print("Sorted names saved as CSV!")