In [None]:
# Import categorized 'names_cat.csv'
import pandas

# Read only the columns this analysis uses from the categorized names file.
print("Importing names... ")
wanted_columns = ["name", "n_publs", "likely_gender", "score"]
names = pandas.read_csv("data/names_cat.csv", usecols=wanted_columns)

# Use the "name" column as the row index, but keep it as a regular column too (drop=False).
# Background on set_index and cell access:
# https://pythonhow.com/accessing-dataframe-columns-rows-and-cells/
names = names.set_index("name", drop=False)

# Print the first ten rows as a quick sanity check of the import.
preview = names.head(10)
print("Names imported. They look like this: {}".format(preview))

In [None]:
# Summary statistics over the whole table (describe() reports numeric columns by default).
overall_summary = names.describe()
overall_summary

In [None]:
# Same summary, restricted to rows whose likely_gender is 'female'.
is_female = names['likely_gender'] == 'female'
names[is_female].describe()

In [None]:
# Same summary, restricted to rows whose likely_gender is 'male'.
is_male = names['likely_gender'] == 'male'
names[is_male].describe()

In [None]:
import matplotlib.pyplot as plt

In [None]:
# >>> Authors and publications per gender: absolute values
# Group once and derive both aggregates from the same grouping.
by_gender = names.groupby("likely_gender")
# Series: number of rows in each likely_gender group.
nauthors_by_gender = by_gender.size()
# One-column DataFrame ('n_publs'): sum of n_publs within each group.
n_publs_by_gender = by_gender[["n_publs"]].sum()

In [None]:
# Read the author counts by label, the same way the publication totals are read.
# The groupby result is ordered by group key, so positions 0 and 1 are 'female'
# and 'male' only when no other likely_gender value exists or sorts before them;
# label lookup reports the right groups regardless of what else is in the column.
n_female_authors = nauthors_by_gender.at['female']
n_male_authors = nauthors_by_gender.at['male']
female_publs = n_publs_by_gender.at['female', 'n_publs']
male_publs = n_publs_by_gender.at['male', 'n_publs']
print("{} female authors published {} pieces, {} male authors published {} pieces".format(n_female_authors, female_publs, n_male_authors, male_publs))

In [None]:
# Ratio of summed publications: 'male' group over 'female' group.
male_total = n_publs_by_gender.at['male', 'n_publs']
female_total = n_publs_by_gender.at['female', 'n_publs']
publs_ratio = male_total / female_total
print("Men published {} times more than women".format(publs_ratio))

In [None]:
# Ratio of group sizes: 'male' over 'female'.
# Looked up by label rather than by position so the ratio stays correct even if
# likely_gender contains further values that change the order of the groups.
authors_ratio = nauthors_by_gender.at['male'] / nauthors_by_gender.at['female']
print("There are {} times more male than female authors.".format(authors_ratio))

In [None]:
# Visualize totals: two semi-transparent bar series drawn on the same axes,
# one for group sizes (blue) and one for summed publications (green).
author_labels = nauthors_by_gender.index.values
author_counts = nauthors_by_gender.values
plt.bar(author_labels, author_counts, color='b', alpha=0.5, label='Amount of authors by gender')

publ_labels = n_publs_by_gender.index.values
publ_totals = n_publs_by_gender['n_publs']
plt.bar(publ_labels, publ_totals, color='g', alpha=0.5, label='Amount of publications by gender')

plt.title("How many authors are there per gender vs how many publications?")
plt.legend(loc='upper left')
plt.show()

In [None]:
# Relative values: mean of n_publs within each likely_gender group,
# kept as a one-column DataFrame ('n_publs') indexed by likely_gender.
n_publs_mean_by_gender = names.groupby("likely_gender")[["n_publs"]].mean()

In [None]:
# Report the per-group means computed above for the 'female' and 'male' groups.
female_mean = n_publs_mean_by_gender.at['female', 'n_publs']
male_mean = n_publs_mean_by_gender.at['male', 'n_publs']
print("On average, female authors publish {} pieces each, while male authors publish {} pieces each.".format(female_mean, male_mean))

In [None]:
# Bar chart of the mean n_publs per likely_gender group.
mean_labels = n_publs_mean_by_gender.index.values
mean_values = n_publs_mean_by_gender['n_publs']
plt.bar(mean_labels, mean_values, color='g', alpha=0.5, label='Average amount of publications by gender')
plt.title("How many publications are there on average per gender?")
plt.legend(loc='upper left')
plt.show()

In [None]:
# One point per row: likely_gender on the x axis, n_publs on the y axis.
gender_column = names['likely_gender']
publs_column = names['n_publs']
plt.scatter(gender_column, publs_column)

In [None]:
# Distribution of publication counts: for each distinct n_publs value, the number
# of rows that have it.
# Fixes three errors that stopped these cells from running: the misspelled
# `groubby`, the result being bound to `m_publs_dist` although it is read back as
# `n_publs_dist`, and `.plt()`, which is not a Series method (plotting is `.plot()`).
# Computing and plotting live in one cell so the plot never depends on a stale
# or missing variable.
n_publs_dist = names.groupby('n_publs').size()
n_publs_dist.plot()