In [None]:
# Hypothesis Testing
# 6: Over time, the share of (contributions made by) female authors stays the same

In [None]:
# >>> Preparation
import pandas

In [None]:
print("Importing publications... ")
# Columns to read from the CSV: identifying/descriptive fields first, then the
# author counts, then the contribution counts.
cols = [
    'key', 'pub_cat', 'pub_type', 'title', 'year', 'authors',
    'n_authors', 'n_males', 'n_females', 'n_unknown', 'ratio',
    'n_contribs_females', 'n_contribs_males', 'n_contribs_unknown',
]

# Read only the listed columns and index the rows by the "key" column in one
# chained step; drop=False keeps "key" available as a regular column as well.
# Setting index & accessing cells: https://pythonhow.com/accessing-dataframe-columns-rows-and-cells/
publs = (
    pandas.read_csv("data/publications_cor3.csv", usecols=cols)
    .set_index("key", drop=False)
)
print("Publications imported. They look like this: {}".format(publs[:5]))

In [None]:
# Bucket the publications by their "year" value.
publs_by_year_group = publs.groupby(by="year")

# Per year, count the non-null entries of every column; the "key" count is
# used further down as the number of publications recorded for that year.
publs_sum_by_year = publs_by_year_group.count()

In [None]:
# Determine the range of years considered valid for the analysis.
# Lower bound: the earliest year whose "key" count exceeds 1000.
min_year = (
    publs_sum_by_year
    .loc[lambda yearly_counts: yearly_counts['key'] > 1000]
    .index.values.min()
)  # 1966
# Upper bound is fixed by hand.
# To Adjust: We only have 4 months of 2019 with the dataset of April 2019
max_year = 2018

In [None]:
# Keep only the publications whose year lies within [min_year, max_year];
# Series.between includes both endpoints by default.
publs_lim = publs[publs['year'].between(min_year, max_year)]

In [None]:
# Get data: yearly totals of the count columns.
# numeric_only=True restricts the aggregation to numeric columns. Without it,
# recent pandas versions also try to "sum" the non-numeric columns read from
# the CSV (presumably text such as key/title/authors), which either
# concatenates strings per year or fails on mixed content. Only the numeric
# totals are used by the cells that follow.
publs_lim_grouped = publs_lim.groupby("year").sum(numeric_only=True)
# Expose the group key (the index) as an ordinary "year" column for the
# correlation and plotting cells.
publs_lim_grouped["year"] = publs_lim_grouped.index.values.tolist()

In [None]:
# Yearly percentage of contributions by female authors, relative to all
# contributions (female + male + unknown).
publs_lim_grouped["fem_share_contribs"] = (
    publs_lim_grouped["n_contribs_females"]
    .div(
        publs_lim_grouped["n_contribs_females"]
        + publs_lim_grouped["n_contribs_males"]
        + publs_lim_grouped["n_contribs_unknown"]
    )
    .mul(100)
)

In [None]:
# Yearly percentage of female authors, relative to the total author count.
publs_lim_grouped["fem_share_authors"] = (
    publs_lim_grouped["n_females"]
    .div(publs_lim_grouped["n_authors"])
    .mul(100)
)

In [None]:
# >>> Calculation
from scipy import stats

print("Calculating Spearman's Correlation Coefficient...")
# Rank correlation between the year and the yearly share of female authors.
publs_lim_cor = stats.spearmanr(
    a=publs_lim_grouped["year"],
    b=publs_lim_grouped["fem_share_authors"],
)
print("Done!")
print(publs_lim_cor)
# Caveat: with only 53 yearly observations (fewer than roughly 500) the reported
# p value is not reliable, see
# https://docs.scipy.org/doc/scipy-0.14.0/reference/generated/scipy.stats.spearmanr.html

In [None]:
# >>> Visualization
import matplotlib.pyplot as plt

# One scatter series per measure, both plotted against the year:
# the share of female authors and the share of contributions by female authors.
plt.scatter(publs_lim_grouped["year"], publs_lim_grouped["fem_share_authors"], label="Female authors", color="xkcd:dark grey")
plt.scatter(publs_lim_grouped["year"], publs_lim_grouped["fem_share_contribs"], label="Contributions by female authors", color="xkcd:grey")
plt.xlabel("Year")
plt.ylabel("Share in %")
plt.legend()
# Build the year range in the title from min_year / max_year (the bounds used
# to filter the data) instead of hard-coding it, so the title stays correct
# when the dataset or max_year is adjusted. int() avoids a trailing ".0"
# should the year values have been parsed as floats.
plt.title("Share of female authors per year ({} - {})".format(int(min_year), int(max_year)))

In [None]:
# >>> Interpretation:
# The hypothesis is rejected. The share of (contributions made by) female authors grows slightly.