In [None]:
# STEP 8: CREATE DATAFRAME OF PUBLICATIONS
# In this notebook, we create the dataframe that we will work with for our analysis of gender equality.
# For that, the publications and the necessary info are extracted from 'clean_dblp.xml'
# and completed with information on gender from 'names_cat_i2.csv'.

In [None]:
# >>> Import necessary data

# Import the publication data
import datetime
import xml.etree.ElementTree as ET
file = 'data/clean_dblp.xml'
parser = ET.XMLParser()

# Print a timestamp right before and right after parsing, so the time spent
# on loading the XML file can be read off the cell output.
time = datetime.datetime.now()
print("Starting to parse XML file at {} ...".format(time))

tree = ET.parse(source=file, parser=parser)

time = datetime.datetime.now()
print("Finished parsing XML file at {} ! ".format(time))

# The direct children of the root element are the entries counted here.
root = tree.getroot()
print("Found {} entries! ".format(len(root)))

In [None]:
# Import the categorized names from 'names_cat_i2.csv'
import pandas

print("Importing categorized names that were improved twice before... ")

# Load only the columns that are needed and index the rows by author name.
# drop=False keeps "name" available as a regular column in addition to the index.
# Setting index & accessing cells: https://pythonhow.com/accessing-dataframe-columns-rows-and-cells/
names = pandas.read_csv(
    "data/names_cat_i2.csv",
    usecols=["name", "n_publs", "likely_gender", "score"],
).set_index("name", drop=False)

print("Names imported. They look like this: {}".format(names.head(10)))

In [None]:
# >>> Set the min. reliable score (found out in last step)
# A name's gender classification is only counted as male/female when its
# score is greater than or equal to this threshold.
score_min = 5

In [None]:
import numpy as np

# >>> Get all publications and fill with info

# Get all gender related info
# The gender inequality index r tells us how much equalizing would be needed so that there is an equal number of male and
# female authors. For example, if there are 4 authors in total and 3 of them are male, 1 male author would need to be
# replaced by a female author for the ratio between men and women to be equal.
# It is calculated like so: 50% (the half) - 25% (percentage of women) = 25% (inequality index)

def get_gender_picture(n):
    """Summarise the gender make-up of one list of author names.

    Every name is looked up in the module-level ``names`` dataframe (indexed
    by name). An author is counted as male or female only if the "score" of
    the name is at least ``score_min``; all remaining authors are counted as
    unknown.

    Parameters
    ----------
    n : list
        The author names of a single publication.

    Returns
    -------
    dict
        'males', 'females', 'unknowns'
            Number of authors in each group (floats).
        'ratio'
            Gender inequality index in percent: half of the authors with a
            known gender minus the female authors, relative to the authors
            with a known gender. It is 0.0 when no author has a known gender.
        'contribs_f', 'contribs_m', 'contribs_u'
            Share of the publication attributed to each group, where every
            author is worth 1 / len(n). All three are 0 for an empty list.

    Raises
    ------
    KeyError
        If a name is not part of the index of ``names``.
    """
    m_max = 0.0
    f_max = 0.0
    for name in n:
        score = names.at[name, "score"]
        gender = names.at[name, "likely_gender"]
        # Classifications below the minimum score are not trusted; such
        # authors end up in the "unknown" group.
        if score >= score_min:
            if gender == "male":
                m_max += 1.0
            elif gender == "female":
                f_max += 1.0

    sm_total = len(n)
    sm_max = m_max + f_max  # authors with a known gender
    u_min = sm_total - sm_max

    # Inequality index: stays at 0.0 when no gender is known, which also
    # avoids a division by zero.
    r_max = 0.0
    if sm_max > 0:
        half_rel = sm_max / 2.0
        abs_m_to_be_replaced_by_f = half_rel - f_max
        r_max = abs_m_to_be_replaced_by_f / sm_max * 100

    # Contributions: derive the author count from the argument itself instead
    # of a variable that happens to exist in the notebook's global scope, so
    # the function gives the right result no matter where it is called from.
    worth_contrib = (1.0 / sm_total) if (sm_total > 0) else 0

    return {
        'males': m_max,
        'females': f_max,
        'unknowns': u_min,
        'ratio': r_max,

        'contribs_f': f_max * worth_contrib,
        'contribs_m': m_max * worth_contrib,
        'contribs_u': u_min * worth_contrib
    }

In [None]:
# Get all publications
# Builds one dictionary entry per child of the XML root, keyed by the entry's
# 'key' attribute, holding the bibliographic fields and the gender statistics
# of its authors.
publications_complete = {}

print("Starting to extract publications...")

for child in root:
    # Extract: key, category of publication, publtype
    key = child.get('key')
    pub_cat = child.tag
    pub_type = child.get('publtype')

    # Optionally extract: title, year of publication.
    # Both are reset to None for every entry, so an entry without a <title>
    # or <year> never inherits the value of the previous entry.
    title_elem = child.find("title")
    title = title_elem.text if title_elem is not None else None

    year_elem = child.find("year")
    year = year_elem.text if year_elem is not None else None

    # Get authors
    current_names = [author.text for author in child.findall("author")]

    # Get gender info
    # n_authors is a module-level name in this notebook that code outside this
    # cell can read, so it is assigned before the call below.
    n_authors = len(current_names)

    gender_pic = get_gender_picture(current_names)

    # Add publication entry
    publications_complete[key] = {
        'key': key,
        'pub_cat': pub_cat,
        'pub_type': pub_type,
        'title': title,
        'year': year,

        'authors': current_names,
        'n_authors': n_authors,

        'n_males': gender_pic['males'],
        'n_females': gender_pic['females'],
        'n_unknown': gender_pic['unknowns'],
        'ratio': gender_pic['ratio'],

        'n_contribs_females': gender_pic['contribs_f'],
        'n_contribs_males': gender_pic['contribs_m'],
        'n_contribs_unknown': gender_pic['contribs_u']
    }

print("Finished extracting publications and calculating all needed information! Found {} entries.".format(len(publications_complete)))

In [None]:
# >>> Turn the collected publications into a dataframe
# With orient='index', every key of the dictionary becomes a row label and
# the inner dictionary of that key supplies the columns of the row.
# Dict to dataframe: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.from_dict.html
print("Converting dictionary of publications to a dataframe...")
publications = pandas.DataFrame.from_dict(publications_complete, orient='index')
preview = publications.head(7)
print("Dataframe completed! Here are some entries: {}".format(preview))

In [None]:
# >>> Write the dataframe to disk
# Save Dataframe to CSV: https://riptutorial.com/pandas/example/19502/create-random-dataframe-and-write-to--csv
print("Saving dataframe as CSV...")
output_path = "data/publications_cor3.csv"
publications.to_csv(output_path)
print("Publications saved as CSV!")