In [None]:
# Hypothesis 1: 
# Overall, there are more male than female authors. 

# Hypothesis 2: 
# In total, more has been written by men than by women.

# Hypothesis 3:
# A male author is more productive than a female author.

In [None]:
# >>> Preparation
# Import categorized 'names_cat.csv'
import pandas

print("Importing names... ")

# Read only the columns this analysis uses, then index the rows by author name.
# drop=False keeps 'name' available as a regular column as well as the index.
# Setting index & accessing cells: https://pythonhow.com/accessing-dataframe-columns-rows-and-cells/
names = (
    pandas.read_csv(
        "data/names_cat.csv",
        usecols=["name", "n_publs", "likely_gender", "score"],
    )
    .set_index("name", drop=False)
)

# Show the first five rows as a quick sanity check of the import.
print("Names imported. They look like this: {}".format(names.head(5)))

In [None]:
# Split the authors into one frame per inferred gender.
# Rows whose 'likely_gender' is neither 'female' nor 'male' end up in neither frame.
f = names.loc[names['likely_gender'].eq('female')]
m = names.loc[names['likely_gender'].eq('male')]

In [None]:
# >>> Hypothesis 1 and 2

# Hypothesis 1: 
# Overall, there are more male than female authors. 

# Hypothesis 2: 
# In total, more has been written by men than by women.

In [None]:
# Explore authors and publications per gender (absolute values).
#
# Both results are indexed by the 'likely_gender' label. Every number printed
# below is therefore looked up by label ('female' / 'male') instead of by
# position: a positional lookup silently reports the wrong group as soon as the
# sorted group keys do not start with exactly 'female', 'male' (for example
# when another category sorts before them or one of the two is missing).
nauthors_by_gender = names.groupby("likely_gender").size() # Amount of authors per gender
n_publs_by_gender = names.groupby("likely_gender").agg({'n_publs':'sum'}) # Amount of publications per gender

print("{} female authors published {} pieces, {} male authors published {} pieces".format(
    nauthors_by_gender.at['female'], n_publs_by_gender.at['female', 'n_publs'],
    nauthors_by_gender.at['male'], n_publs_by_gender.at['male', 'n_publs']))

# Ratios male / female, for publications and for authors respectively.
print("Men published {} times more than women".format(
    n_publs_by_gender.at['male', 'n_publs'] / n_publs_by_gender.at['female', 'n_publs']))
print("There are {} times more male than female authors.".format(
    nauthors_by_gender.at['male'] / nauthors_by_gender.at['female']))

In [None]:
# Visualization
import matplotlib.pyplot as plt

In [None]:
# Exploratory plot: author name on the x-axis, inferred gender on the y-axis.
# NOTE(review): one x position per author; presumably only readable for small inputs.
plt.plot(names.loc[:, 'name'], names.loc[:, 'likely_gender'])

In [None]:
# Exploratory plot: number of publications on the x-axis, inferred gender on the y-axis.
plt.plot(names.loc[:, 'n_publs'], names.loc[:, 'likely_gender'])

In [None]:
# Visualize absolute values of authors and publications by gender as a pie chart

# Nested pie plot: https://matplotlib.org/gallery/pie_and_polar_charts/nested_pie.html#sphx-glr-gallery-pie-and-polar-charts-nested-pie-py
# Labelling a pie chart: https://matplotlib.org/gallery/pie_and_polar_charts/pie_and_donut_labels.html#sphx-glr-gallery-pie-and-polar-charts-pie-and-donut-labels-py
# Placing multiple legends: https://matplotlib.org/tutorials/intermediate/legend_guide.html

import numpy as np

fig, ax = plt.subplots()

size = 0.3 # Radial width of each ring

# Colors for the pie chart.
# tab20c has 20 entries (indices 0-19). The last entry is addressed as 19 here;
# the previous index 20 was out of range and only resolved to that same color
# through the colormap's out-of-range ("over") fallback.
cmap = plt.get_cmap("tab20c")
outer_colors = cmap([11, 9, 19]) # Publications (outer ring)
text_outer = cmap(8)
inner_colors = cmap([3, 1, 19]) # Authors (inner ring)
text_inner = cmap(0)

# Authors: inner ring, drawn with the inner color set so that the variable
# names, their comments and the ring they are applied to all agree.
wedges_a, texts_a, autotexts_a = ax.pie(nauthors_by_gender.values, radius=1-size,
                                        colors=inner_colors, autopct='%1.1f%%', # form of percentages
                                        pctdistance=0.3, # how far away are percentages from center?
                                        wedgeprops=dict(width=size, edgecolor='w'), # color of edges
                                        textprops=dict(color=text_inner)) # color of percentages
l_a = ax.legend(wedges_a, nauthors_by_gender.index.values, loc="upper right", title="Authors")
# A second ax.legend() call replaces the Axes' current legend, so the first
# legend has to be re-added as a plain artist to stay visible.
ax.add_artist(l_a)

# Publications: outer ring, labelled from its own index.
wedges_p, texts_p, autotexts_p = ax.pie(n_publs_by_gender['n_publs'], radius=1,
                                        colors=outer_colors, autopct='%1.1f%%',
                                        pctdistance=1.2,
                                        wedgeprops=dict(width=size, edgecolor='w'),
                                        textprops=dict(color=text_outer))
# This is the last legend created, so it already is the Axes' active legend and
# must not be added a second time.
l_p = ax.legend(wedges_p, n_publs_by_gender.index.values, loc="lower right", title="Publications")


plt.title("How many authors and how many publications \n are there per gender?")

plt.show()

In [None]:
# Evaluation: Hypotheses 1 and 2 are correct.

In [None]:
# >>> Hypothesis 3

# A male author is more productive than a female author.

In [None]:
# Relative values: publications per author, overall and split by gender.

# Overall mean and median of the number of publications per author
mean = names.loc[:, 'n_publs'].mean()
median = names.loc[:, 'n_publs'].median()

# Same statistics per gender; each result is a one-column frame ('n_publs')
# indexed by the 'likely_gender' label.
n_publs_mean_by_gender = names.groupby("likely_gender")[["n_publs"]].mean()
n_publs_median_by_gender = names.groupby("likely_gender")[["n_publs"]].median()

# Mean number of publications for female and for male authors
f_mean = n_publs_mean_by_gender.loc["female", "n_publs"]
m_mean = n_publs_mean_by_gender.loc["male", "n_publs"]

# Median number of publications for female and for male authors
f_median = n_publs_median_by_gender.loc["female", "n_publs"]
m_median = n_publs_median_by_gender.loc["male", "n_publs"]


print(
    "On average, female authors publish {} pieces each, while male authors publish {} pieces each."
    .format(f_mean, m_mean)
)
print(
    "The median for the amount of pieces published by female authors is {} pieces, while the median for the amount of pieces published by male authors is {} pieces."
    .format(f_median, m_median)
)

In [None]:
# Visualization: overall vs. per-gender mean and median of publications per author.
# Dashed lines show the overall values, 'x' markers the per-gender values;
# red is used for means and blue for medians.
gender_axis = ['male', 'female']

plt_mean = plt.plot(gender_axis, [mean] * 2, 'r--', alpha=0.5, label='Overall Mean')
plt_g_mean = plt.plot(gender_axis, [m_mean, f_mean], 'rx', label='Mean by Gender')

plt_median = plt.plot(gender_axis, [median] * 2, 'b--', alpha=0.5, label='Overall Median')
plt_g_median = plt.plot(gender_axis, [m_median, f_median], 'bx', label='Median by Gender')

plt.title("How many publications are there on average per gender?")
plt.xlabel('Gender')
plt.ylabel('Amount of publications')

plt.legend(loc="center")
plt.show()

In [None]:
# Evaluation: Hypothesis 3 is correct.