In [None]:
# HYPOTHESIS TESTING
# Preparing the Data

In [None]:
# Hypothesis 1: 
# Overall, there are more male than female authors. 

# Hypothesis 2: 
# In total, more has been written by men than by women.

In [2]:
# >>> Import
# Load the gender-categorized author names from 'data/names_cat_i2.csv'.
import pandas

print("Importing names... ")

# Only the four columns used by the analysis are read. The frame is indexed by
# author name while the 'name' column itself is kept (drop=False).
# Setting index & accessing cells: https://pythonhow.com/accessing-dataframe-columns-rows-and-cells/
names = pandas.read_csv(
    "data/names_cat_i2.csv",
    usecols=["name", "n_publs", "likely_gender", "score"],
).set_index("name", drop=False)

# Preview the first five rows.
print("Names imported. They look like this: {}".format(names.head(5)))

Importing names... 
Names imported. They look like this:                                         name likely_gender  n_publs  score
name                                                                      
'Maseka Lesaoana            'Maseka Lesaoana        female        2      8
(David) Jing Dai            (David) Jing Dai          male        1      7
(Max) Zong-Ming Cheng  (Max) Zong-Ming Cheng          male        2      8
(Sophy) Shu-Jiun Chen  (Sophy) Shu-Jiun Chen        female        2      7
(Zhou) Bryan Bai            (Zhou) Bryan Bai          male        2      6


In [3]:
print("Importing publications... ")

# Columns to load from the publications CSV, grouped by meaning:
# identification / metadata, author counts, and the "_min" variants of the
# per-gender counts that get_min_contribs() reads.
cols = [
    'key', 'pub_cat', 'pub_type', 'title', 'year',
    'authors', 'n_authors',
    'n_males', 'n_females', 'n_unknown', 'ratio',
    'n_males_min', 'n_females_min', 'n_unknown_max', 'ratio_min',
]

# The frame is indexed by publication key while the 'key' column is kept (drop=False).
# Setting index & accessing cells: https://pythonhow.com/accessing-dataframe-columns-rows-and-cells/
publs = pandas.read_csv(
    "data/publications_cor1.csv",
    usecols=cols,
).set_index("key", drop=False)

# Preview the first five rows.
print("Publications imported. They look like this: {}".format(publs.head(5)))

Importing publications... 


  interactivity=interactivity, compiler=compiler, result=result)


Publications imported. They look like this:                                                                  key  \
key                                                                    
books/acm/0082477                                  books/acm/0082477   
books/acm/Kim95                                      books/acm/Kim95   
books/acm/kim95/AnnevelinkACFHK95  books/acm/kim95/AnnevelinkACFHK95   
books/acm/kim95/Blakeley95                books/acm/kim95/Blakeley95   
books/acm/kim95/BreitbartGS95          books/acm/kim95/BreitbartGS95   

                                        pub_cat pub_type  \
key                                                        
books/acm/0082477                          book      NaN   
books/acm/Kim95                            book      NaN   
books/acm/kim95/AnnevelinkACFHK95  incollection      NaN   
books/acm/kim95/Blakeley95         incollection      NaN   
books/acm/kim95/BreitbartGS95      incollection      NaN   

                              

In [None]:
# >>> Convert results into dataframe
# Builds a DataFrame with one row per key of the `publications_complete` dict (orient='index').
# NOTE(review): `publications_complete` is not defined in any cell visible in this notebook,
# so this cell relies on state created elsewhere — TODO confirm its origin, otherwise a
# fresh "Restart & Run All" will stop here with a NameError.
# Dict to dataframe: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.from_dict.html
print("Converting dictionary of publications to a dataframe...")
publications = pandas.DataFrame.from_dict(publications_complete, orient='index', dtype=None)
print("Dataframe completed! Here are some entries: {}".format(publications[:10]))

In [None]:
# >>> Save results
# Writes the `publications` DataFrame to 'data/publications_cor2.csv' (index included, pandas default).
# NOTE(review): a later visualization cell rebinds the name `publications` to a plain list,
# so this cell only works when run before those cells.
# Save Dataframe to CSV: https://riptutorial.com/pandas/example/19502/create-random-dataframe-and-write-to--csv
print("Saving dataframe as CSV...")
publications.to_csv("data/publications_cor2.csv")
print("Publications saved as CSV!")

In [13]:
def get_min_contribs(gender, data=None):
    """Return the minimum amount of contributions for a gender.

    Every publication counts as one contribution that is split evenly among
    its authors, so each row adds ``n_<gender>_min / n_authors`` to the total.

    Parameters
    ----------
    gender : str
        Either ``'males'`` or ``'females'``; selects the ``n_<gender>_min`` column.
    data : pandas.DataFrame, optional
        Frame providing the ``n_authors`` and ``n_<gender>_min`` columns.
        When omitted, the notebook-level ``publs`` frame is used (looked up at
        call time).

    Returns
    -------
    float
        Sum of the per-publication shares; ``0.0`` for an empty frame.

    Raises
    ------
    ValueError
        If ``gender`` is neither ``'males'`` nor ``'females'``.
    """
    # Compare by value. The previous `is not` check compared object identity,
    # which rejects equal strings that are not the very same object (e.g. a
    # string built at runtime) and is a SyntaxWarning on Python 3.8+.
    if gender not in ('males', 'females'):
        raise ValueError("Gender must be 'females' or 'males' (sorry ._.).")

    if data is None:
        data = publs

    # Vectorized replacement of the row-by-row iterrows() loop.
    shares = data['n_{}_min'.format(gender)] / data['n_authors']

    # skipna=False keeps the loop's semantics: a missing value (or a 0/0 row)
    # makes the total NaN instead of being silently dropped.
    return float(shares.sum(skipna=False))

In [14]:
def get_max_contribs(gender, data=None):
    """Return the maximum amount of contributions for a gender.

    Every publication counts as one contribution that is split evenly among
    its authors, so each row adds ``n_<gender> / n_authors`` to the total.

    Parameters
    ----------
    gender : str
        Either ``'males'`` or ``'females'``; selects the ``n_<gender>`` column.
    data : pandas.DataFrame, optional
        Frame providing the ``n_authors`` and ``n_<gender>`` columns.
        When omitted, the notebook-level ``publs`` frame is used (looked up at
        call time).

    Returns
    -------
    float
        Sum of the per-publication shares; ``0.0`` for an empty frame.

    Raises
    ------
    ValueError
        If ``gender`` is neither ``'males'`` nor ``'females'``.
    """
    # Compare by value. The previous `is not` check compared object identity,
    # which rejects equal strings that are not the very same object (e.g. a
    # string built at runtime) and is a SyntaxWarning on Python 3.8+.
    if gender not in ('males', 'females'):
        raise ValueError("Gender must be 'females' or 'males' (sorry ._.).")

    if data is None:
        data = publs

    # Vectorized replacement of the row-by-row iterrows() loop.
    shares = data['n_{}'.format(gender)] / data['n_authors']

    # skipna=False keeps the loop's semantics: a missing value (or a 0/0 row)
    # makes the total NaN instead of being silently dropped.
    return float(shares.sum(skipna=False))

In [11]:
# The calculations would take too long if not executed in parallel
# Parallel Programming in Python: https://docs.python.org/3.3/library/multiprocessing.html
# How to return a value with a thread: https://stackoverflow.com/questions/6893968/how-to-get-the-return-value-from-a-thread-in-python

# `multiprocessing` itself must be imported: importing only ThreadPool does not
# bind the package name, so cpu_count() used to fail with a NameError that the
# bare `except:` swallowed, leaving the pool at a single worker every time.
import multiprocessing
from multiprocessing.pool import ThreadPool

try:
    pool_size = multiprocessing.cpu_count()
except NotImplementedError:
    # cpu_count() raises NotImplementedError when the count cannot be determined.
    pool_size = 1

# The context manager tears the pool down when the block exits, so every result
# is collected inside the block.
# NOTE(review): these are threads, not processes; whether they actually run
# concurrently depends on how much of the work releases the GIL.
with ThreadPool(pool_size) as pool:
    f_min_async = pool.apply_async(get_min_contribs, ["females"])
    m_min_async = pool.apply_async(get_min_contribs, ["males"])
    f_max_async = pool.apply_async(get_max_contribs, ["females"])
    m_max_async = pool.apply_async(get_max_contribs, ["males"])

    # .get() blocks until the task finishes and re-raises any exception from it.
    n_contribs_f_min = f_min_async.get()
    n_contribs_m_min = m_min_async.get()
    n_contribs_f_max = f_max_async.get()
    n_contribs_m_max = m_max_async.get()

In [12]:
# Display the minimum female contribution total fetched from the thread pool.
n_contribs_f_min

1426.116900217142

In [None]:
# >>> Testing
# Calculate statistics

# Row selectors on `names`. The "min" figures only use names with score > 7,
# the "max" figures use every name with score >= 5.
is_female = names['likely_gender'] == 'female'
is_male = names['likely_gender'] == 'male'
score_above_7 = names['score'] > 7
score_at_least_5 = names['score'] >= 5

# Number of authors per gender (a True in the combined mask is one author row).
n_authors_f_min = (score_above_7 & is_female).sum()
n_authors_m_min = (score_above_7 & is_male).sum()
n_authors_f_max = (score_at_least_5 & is_female).sum()
n_authors_m_max = (score_at_least_5 & is_male).sum()

# Contributions: each publication is split evenly among its authors.
n_contribs_f_min = get_min_contribs('females')
n_contribs_m_min = get_min_contribs('males')
n_contribs_f_max = get_max_contribs('females')
n_contribs_m_max = get_max_contribs('males')

# "v2": plain sum of each selected author's publication count, so a
# publication with several selected authors is counted once per author.
n_contribs_f_min_v2 = names.loc[score_above_7 & is_female, 'n_publs'].sum()
n_contribs_m_min_v2 = names.loc[score_above_7 & is_male, 'n_publs'].sum()
n_contribs_f_max_v2 = names.loc[score_at_least_5 & is_female, 'n_publs'].sum()
n_contribs_m_max_v2 = names.loc[score_at_least_5 & is_male, 'n_publs'].sum()

In [None]:
# Logging results
# Reading note: in the ratio lines, "At maximum" is paired with the ratio of the
# *_min figures and "At minimum" with the ratio of the *_max figures; in the
# recorded run below the *_min ratio is the larger of the two.

# Authors and fractional contributions per gender
print(
    "At minimum, {} female authors made {} contributions and {} male authors made {} contributions".format(
        n_authors_f_min, n_contribs_f_min, n_authors_m_min, n_contribs_m_min
    )
)
print(
    "At maximum, {} female authors made {} contributions and {} male authors made {} contributions".format(
        n_authors_f_max, n_contribs_f_max, n_authors_m_max, n_contribs_m_max
    )
)
print("")

# Male-to-female ratio of fractional contributions
print(
    "At maximum, men contributed {} times more than women".format(
        n_contribs_m_min / n_contribs_f_min
    )
)
print(
    "At minimum, men contributed {} times more than women".format(
        n_contribs_m_max / n_contribs_f_max
    )
)
print("")

# Authors and summed publication counts ("v2") per gender
print(
    "At minimum, {} female authors contributed to {} publications and {} male authors contributed to {} publications (publications can be counted several times)".format(
        n_authors_f_min, n_contribs_f_min_v2, n_authors_m_min, n_contribs_m_min_v2
    )
)
print(
    "At maximum, {} female authors contributed to {} publications and {} male authors contributed to {} publications (publications can be counted several times)".format(
        n_authors_f_max, n_contribs_f_max_v2, n_authors_m_max, n_contribs_m_max_v2
    )
)
print("")

# Male-to-female ratio of summed publication counts
print(
    "At maximum, men contributed to publications {} times more than women (publications can be counted several times)".format(
        n_contribs_m_min_v2 / n_contribs_f_min_v2
    )
)
print(
    "At minimum, men contributed to publications {} times more than women (publications can be counted several times)".format(
        n_contribs_m_max_v2 / n_contribs_f_max_v2
    )
)
print("")

# Male-to-female ratio of author counts
print(
    "There are at maximum {} times more male than female authors.".format(
        n_authors_m_min / n_authors_f_min
    )
)
print(
    "There are at minimum {} times more male than female authors.".format(
        n_authors_m_max / n_authors_f_max
    )
)

# Output of an earlier run, kept verbatim as a string literal.
'''
At minimum, 19869 female authors made 1426.116900217142 contributions and 539864 male authors made 66361.64460112316 contributions
At maximum, 144198 female authors made 10384.877476045609 contributions and 1112117 male authors made 133206.39590599976 contributions

At maximum, men contributed 46.53310299528662 times more than women
At minimum, men contributed 12.826958836371613 times more than women

At minimum, 19869 female authors contributed to 86725 publications and 539864 male authors contributed to 3525581 publications (publications can be counted several times)
At maximum, 144198 female authors contributed to 593208 publications and 1112117 male authors contributed to 7120339 publications (publications can be counted several times)

At maximum, men contributed to publications 40.65241856442779 times more than women (publications can be counted several times)
At minimum, men contributed to publications 12.003106836050762 times more than women (publications can be counted several times)

There are at maximum 27.17117117117117 times more male than female authors.
There are at minimum 7.712430130792383 times more male than female authors.

'''

In [None]:
# >>> Save progress
# Collect every statistic under the single outer key 1; the inner keys are the
# variable names of the statistics.
stats = {
    1: dict(
        # author counts
        n_authors_f_min=n_authors_f_min,
        n_authors_m_min=n_authors_m_min,
        n_authors_f_max=n_authors_f_max,
        n_authors_m_max=n_authors_m_max,
        # fractional contributions
        n_contribs_f_min=n_contribs_f_min,
        n_contribs_m_min=n_contribs_m_min,
        n_contribs_f_max=n_contribs_f_max,
        n_contribs_m_max=n_contribs_m_max,
        # summed publication counts
        n_contribs_f_min_v2=n_contribs_f_min_v2,
        n_contribs_m_min_v2=n_contribs_m_min_v2,
        n_contribs_f_max_v2=n_contribs_f_max_v2,
        n_contribs_m_max_v2=n_contribs_m_max_v2,
    )
}

In [None]:
# Convert results into dataframe
# With the default column orientation, the outer key of `stats` becomes the
# single column and the statistic names become the row index.
# Dict to dataframe: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.from_dict.html
print("Converting dictionary of stats to a dataframe...")
stats_df = pandas.DataFrame.from_dict(stats, orient='columns')
print("Dataframe completed! Here are some entries: {}".format(stats_df.head(5)))

In [None]:
# Persist the statistics table to 'data/stats_authors_contribs_by_gender.csv'.
# Save Dataframe to CSV: https://riptutorial.com/pandas/example/19502/create-random-dataframe-and-write-to--csv
print("Saving dataframe as CSV...")
stats_df.to_csv(path_or_buf="data/stats_authors_contribs_by_gender.csv")
print("Stats saved as CSV!")

In [None]:
# Shows the colour returned by the colormap for index 1.
# NOTE(review): as far as visible here, `cmap` is first assigned in the
# visualization-preparation cell below, so on a fresh kernel this cell raises a
# NameError when cells run top to bottom — consider moving or removing it.
cmap(1)

In [None]:
# >>> Visualization
# Preparation: shared ring width and colours used by the nested pie charts below.

import matplotlib.pyplot as plt
import numpy as np  # NOTE(review): `np` is not used in any cell visible here

# Width of each ring of the nested pie charts
size = 0.3

# Colors for the pie chart
cmap = plt.get_cmap("tab20c")
cmap_2 = plt.get_cmap("tab20b")
text = cmap(16) # colour of the percentage labels
very_outer_colors = [cmap(9), cmap_2(10)] # Publications ring (outermost)
outer_colors = cmap([1,5]) # Contributions ring (middle)
inner_colors = [cmap(13), cmap_2(13)] # Authors ring (innermost)

In [None]:
# Visualization of minimum values
# Draws three concentric rings on one axes, each split female/male:
# publications (outermost), contributions (middle) and authors (innermost).
# Nested pie plot: https://matplotlib.org/gallery/pie_and_polar_charts/nested_pie.html#sphx-glr-gallery-pie-and-polar-charts-nested-pie-py
# Labelling a pie chart: https://matplotlib.org/gallery/pie_and_polar_charts/pie_and_donut_labels.html#sphx-glr-gallery-pie-and-polar-charts-pie-and-donut-labels-py
# Placing multiple legends: https://matplotlib.org/tutorials/intermediate/legend_guide.html

fig, ax = plt.subplots()

# Values are ordered [female, male] to match `genders`.
# NOTE(review): `publications` is also the name of the DataFrame built in an
# earlier cell; this assignment rebinds it to a plain list.
genders = ['female', 'male']
authors = [n_authors_f_min, n_authors_m_min] 
publications = [n_contribs_f_min_v2, n_contribs_m_min_v2]
contributions = [n_contribs_f_min, n_contribs_m_min]

# Publications
wedges_p, texts_p, autotexts_p = ax.pie(publications, radius=1.6-size, 
                                        colors=very_outer_colors, autopct='%1.1f%%', 
                                        pctdistance=1.2,
                                        wedgeprops=dict(width=size, edgecolor='w'), 
                                        textprops=dict(color=text))
l_p = ax.legend(wedges_p, genders, bbox_to_anchor=(1.25, 0.5), title="Publications")
# Re-add the legend as an artist so that the next ax.legend() call does not replace it.
plt.gca().add_artist(l_p)

# Contributions
wedges_c, texts_c, autotexts_c = ax.pie(contributions, radius=1, 
                                        colors=outer_colors, autopct='%1.1f%%', 
                                        pctdistance=0.82,
                                        wedgeprops=dict(width=size, edgecolor='w'), 
                                        textprops=dict(color=text))
l_c = ax.legend(wedges_c, genders, bbox_to_anchor=(1.25, 0.75), title="Contributions")
plt.gca().add_artist(l_c)

# Authors
wedges_a, texts_a, autotexts_a = ax.pie(authors, radius=1-size, 
                                        colors=inner_colors, autopct='%1.1f%%', # form of percentages
                                        pctdistance=0.3, # how far away are percentages from center?
                                        wedgeprops=dict(width=size, edgecolor='w'), # color of edges
                                        textprops=dict(color=text)) # color of percentages
l_a = ax.legend(wedges_a, genders, bbox_to_anchor=(1.25, 1), title="Authors")
plt.gca().add_artist(l_a) # Add legend

# NOTE(review): the title is identical to the one of the maximum-values chart.
plt.title("How many authors and how many publications \n are there in total per gender?\n")

plt.show()

In [None]:
# Visualization of maximum values
# Draws three concentric rings on one axes, each split female/male:
# publications (outermost), contributions (middle) and authors (innermost).
# Nested pie plot: https://matplotlib.org/gallery/pie_and_polar_charts/nested_pie.html#sphx-glr-gallery-pie-and-polar-charts-nested-pie-py
# Labelling a pie chart: https://matplotlib.org/gallery/pie_and_polar_charts/pie_and_donut_labels.html#sphx-glr-gallery-pie-and-polar-charts-pie-and-donut-labels-py
# Placing multiple legends: https://matplotlib.org/tutorials/intermediate/legend_guide.html

fig, ax = plt.subplots()

# Values are ordered [female, male] to match `genders`.
# NOTE(review): `publications` is also the name of the DataFrame built in an
# earlier cell; this assignment rebinds it to a plain list.
genders = ['female', 'male']
authors = [n_authors_f_max, n_authors_m_max] 
publications = [n_contribs_f_max_v2, n_contribs_m_max_v2]
contributions = [n_contribs_f_max, n_contribs_m_max]

# Publications
wedges_p, texts_p, autotexts_p = ax.pie(publications, radius=1.6-size, 
                                        colors=very_outer_colors, autopct='%1.1f%%', 
                                        pctdistance=1.2,
                                        wedgeprops=dict(width=size, edgecolor='w'), 
                                        textprops=dict(color=text))
l_p = ax.legend(wedges_p, genders, bbox_to_anchor=(1.25, 0.5), title="Publications")
# Re-add the legend as an artist so that the next ax.legend() call does not replace it.
plt.gca().add_artist(l_p)

# Contributions
wedges_c, texts_c, autotexts_c = ax.pie(contributions, radius=1, 
                                        colors=outer_colors, autopct='%1.1f%%', 
                                        pctdistance=0.82,
                                        wedgeprops=dict(width=size, edgecolor='w'), 
                                        textprops=dict(color=text))
l_c = ax.legend(wedges_c, genders, bbox_to_anchor=(1.25, 0.75), title="Contributions")
plt.gca().add_artist(l_c)

# Authors
wedges_a, texts_a, autotexts_a = ax.pie(authors, radius=1-size, 
                                        colors=inner_colors, autopct='%1.1f%%', # form of percentages
                                        pctdistance=0.3, # how far away are percentages from center?
                                        wedgeprops=dict(width=size, edgecolor='w'), # color of edges
                                        textprops=dict(color=text)) # color of percentages
l_a = ax.legend(wedges_a, genders, bbox_to_anchor=(1.25, 1), title="Authors")
plt.gca().add_artist(l_a) # Add legend

# NOTE(review): the title is identical to the one of the minimum-values chart.
plt.title("How many authors and how many publications \n are there in total per gender?\n")

plt.show()

In [None]:
# >>> Evaluation
# Hypotheses 1 and 2 are supported by both the minimum and the maximum estimates.
# Overall, there are more male than female authors. In total, more has been written by men than by women.