In [None]:
# HYPOTHESIS TESTING

In [None]:
# Hypothesis 1: 
# Overall, there are more male than female authors. 

# Hypothesis 2: 
# In total, more has been written by men than by women.

In [None]:
# >>> Preparation
# Load the gender-categorized author table from 'data/names_cat_i1.csv'.
import pandas

# Only these columns are read from the CSV.
NAMES_COLUMNS = ["name", "n_publs", "likely_gender", "score"]

print("Importing names... ")

# Setting index & accessing cells: https://pythonhow.com/accessing-dataframe-columns-rows-and-cells/
# drop=False keeps "name" available as a regular column in addition to being the index.
names = (
    pandas.read_csv("data/names_cat_i1.csv", usecols=NAMES_COLUMNS)
    .set_index("name", drop=False)
)

# Show the first five rows as a quick sanity check of the import.
print("Names imported. They look like this: {}".format(names.head(5)))

In [None]:
# >>> Testing
# Aggregate the author table once per gender label.
by_gender = names.groupby("likely_gender")
nauthors_by_gender = by_gender.size()  # Amount of authors per gender
n_publs_by_gender = by_gender.agg({'n_publs': 'sum'})  # Amount of publications per gender

# Look every number up by its label instead of by position: groupby() sorts its
# keys, so positions 0/1 only mean female/male as long as no other label sorts
# before them. A missing label now raises a KeyError instead of silently
# reporting the wrong group.
n_female_authors = nauthors_by_gender.at['female']
n_male_authors = nauthors_by_gender.at['male']
n_female_publs = n_publs_by_gender.at['female', 'n_publs']
n_male_publs = n_publs_by_gender.at['male', 'n_publs']

print("{} female authors published {} pieces, {} male authors published {} pieces".format(
    n_female_authors, n_female_publs, n_male_authors, n_male_publs))
# Both figures below are plain male/female ratios.
print("Men published {} times more than women".format(n_male_publs / n_female_publs))
print("There are {} times more male than female authors.".format(n_male_authors / n_female_authors))

In [None]:
# >>> Visualization
# Nested donut chart: the inner ring shows the share of authors per gender,
# the outer ring shows the share of publications per gender.

# Nested pie plot: https://matplotlib.org/gallery/pie_and_polar_charts/nested_pie.html#sphx-glr-gallery-pie-and-polar-charts-nested-pie-py
# Labelling a pie chart: https://matplotlib.org/gallery/pie_and_polar_charts/pie_and_donut_labels.html#sphx-glr-gallery-pie-and-polar-charts-pie-and-donut-labels-py
# Placing multiple legends: https://matplotlib.org/tutorials/intermediate/legend_guide.html

import matplotlib.pyplot as plt
import numpy as np

fig, ax = plt.subplots()

size = 0.3  # radial width of each ring

# Colors for the pie chart.
# tab20c has 20 entries (indices 0-19), so the last entry is addressed as 19
# rather than relying on an out-of-range index being clipped.
# Each ring gets three wedge colors; ax.pie cycles through them if there are
# more wedges than colors.
cmap = plt.get_cmap("tab20c")
inner_colors = cmap(np.array([11, 9, 19]))  # Authors (inner ring)
text_inner = cmap(8)
outer_colors = cmap(np.array([3, 1, 19]))  # Publications (outer ring)
text_outer = cmap(0)

# Authors (inner ring)
wedges_a, texts_a, autotexts_a = ax.pie(
    nauthors_by_gender.values, radius=1 - size,
    colors=inner_colors, autopct='%1.1f%%',  # form of percentages
    pctdistance=0.3,  # percentages are drawn inside the hole of the donut
    wedgeprops=dict(width=size, edgecolor='w'),  # color of edges
    textprops=dict(color=text_inner))  # color of percentages
l_a = ax.legend(wedges_a, nauthors_by_gender.index.values, loc="upper right", title="Authors")
# The next ax.legend() call replaces the axes' current legend, so the first
# legend has to be re-attached as a plain artist to stay visible.
ax.add_artist(l_a)

# Publications (outer ring)
wedges_p, texts_p, autotexts_p = ax.pie(
    n_publs_by_gender['n_publs'], radius=1,
    colors=outer_colors, autopct='%1.1f%%',
    pctdistance=1.2,  # percentages are drawn outside the chart
    wedgeprops=dict(width=size, edgecolor='w'),
    textprops=dict(color=text_outer))
# Label the publication wedges from the publication aggregate itself. This
# legend is the axes' current legend and needs no extra add_artist() call.
l_p = ax.legend(wedges_p, n_publs_by_gender.index.values, loc="lower right", title="Publications")

ax.set_title("How many authors and how many publications \n are there per gender?")

plt.show()

In [None]:
# >>> Evaluation
# Hypotheses 1 and 2 are both supported by the data.
# Overall, there are more male than female authors. In total, more has been written by men than by women.