In [None]:
# Hypothesis Testing
# Hypothesis 6 (H6): Over time, the share of female authors (and of contributions made by female authors) stays the same.

In [None]:
# >>> Preparation
import pandas

In [None]:
# Load only the publication columns this notebook works with.
print("Importing publications... ")
cols = [
    'key', 'pub_cat', 'pub_type', 'title', 'year', 'authors',
    'n_authors', 'n_males', 'n_females', 'n_unknown', 'ratio',
    'n_contribs_females', 'n_contribs_males', 'n_contribs_unknown',
]

# The publication key becomes the row index; drop=False keeps it available
# as a regular column as well (it is counted per year further down).
# Setting index & accessing cells: https://pythonhow.com/accessing-dataframe-columns-rows-and-cells/
publs = pandas.read_csv("data/publications_cor3.csv", usecols=cols).set_index("key", drop=False)

# Show the first five rows as a quick sanity check of the import.
print("Publications imported. They look like this: {}".format(publs.head(5)))

In [None]:
# Bucket the publications by their publication year.
publs_by_year_group = publs.groupby(by="year")

# Per-year count of non-null entries in every column. The 'key' column of the
# result is used below as the number of publications per year (assuming every
# publication row carries a key).
publs_sum_by_year = publs_by_year_group.count()

In [None]:
# Minimum sample size, computed as S = z^2 * d * (1 - d) / e^2.
# References for the formula:
# https://www.wikihow.com/Calculate-Sample-Size
# https://www.qualtrics.com/de/erlebnismanagement/research-core/online-stichproben/?rid=langMatch&prevsite=en&newsite=de&geo=&geomatch=
#
# NOTE(review): presumably z is the z-score (2.58 is the usual table value for
# a 99 % confidence level), e the margin of error and d the assumed standard
# deviation / proportion -- confirm against the linked references.
z, e, d = 2.58, 0.05, 0.5

S = (z * z * d * (1 - d)) / (e * e)

In [None]:
# Determine the range of years that is considered valid for the analysis.

# Upper bound is fixed by hand.
# To Adjust: We only have 4 months of 2019 with the dataset of April 2019
max_year = 2018

# Lower bound: the earliest year whose publication count exceeds the sample
# size S (author's note: this evaluated to 1962).
# NOTE(review): only the earliest qualifying year is looked up; later years are
# not re-checked against S -- confirm that this is intended.
min_year = publs_sum_by_year.loc[publs_sum_by_year['key'] > S].index.values.min()

In [None]:
# Keep only the publications whose year lies in [min_year, max_year] (both ends inclusive).
publs_lim = publs[publs['year'].between(min_year, max_year)]

In [None]:
# Display summary statistics for the publications restricted to the valid year range.
publs_lim.describe()

In [None]:
# Aggregate the per-publication counts into one row per year.
#
# numeric_only=True is passed explicitly: without it the outcome depends on the
# installed pandas version. Older releases silently dropped the text columns
# (key, title, authors, ...), pandas 1.5 warns about it, and pandas 2.x tries
# to "sum" them as well (string concatenation, or a TypeError on mixed values).
# Only the numeric count columns are needed for the shares computed below.
publs_lim_grouped = publs_lim.groupby("year").sum(numeric_only=True)

# After the groupby the year only lives in the index; expose it as a regular
# column too, so it can be passed to the correlation and plotting calls.
publs_lim_grouped["year"] = publs_lim_grouped.index.values.tolist()

In [None]:
# Per year: percentage of all contributions (female + male + unknown) made by female authors.
publs_lim_grouped["fem_share_contribs"] = (
    publs_lim_grouped["n_contribs_females"]
    .div(
        publs_lim_grouped["n_contribs_females"]
        + publs_lim_grouped["n_contribs_males"]
        + publs_lim_grouped["n_contribs_unknown"]
    )
    .mul(100)
)

In [None]:
# Per year: percentage of all contributions (female + male + unknown) made by male authors.
publs_lim_grouped["m_share_contribs"] = (
    publs_lim_grouped["n_contribs_males"]
    .div(
        publs_lim_grouped["n_contribs_females"]
        + publs_lim_grouped["n_contribs_males"]
        + publs_lim_grouped["n_contribs_unknown"]
    )
    .mul(100)
)

In [None]:
# Per year: percentage of all contributions (female + male + unknown) made by authors of unknown gender.
publs_lim_grouped["u_share_contribs"] = (
    publs_lim_grouped["n_contribs_unknown"]
    .div(
        publs_lim_grouped["n_contribs_females"]
        + publs_lim_grouped["n_contribs_males"]
        + publs_lim_grouped["n_contribs_unknown"]
    )
    .mul(100)
)

In [None]:
# Per year: female authors as a percentage of the summed author count.
publs_lim_grouped["fem_share_authors"] = (
    publs_lim_grouped["n_females"]
    .div(publs_lim_grouped["n_authors"])
    .mul(100)
)

In [None]:
# Per year: male authors as a percentage of the summed author count.
publs_lim_grouped["m_share_authors"] = (
    publs_lim_grouped["n_males"]
    .div(publs_lim_grouped["n_authors"])
    .mul(100)
)

In [None]:
# Per year: authors of unknown gender as a percentage of the summed author count.
publs_lim_grouped["u_share_authors"] = (
    publs_lim_grouped["n_unknown"]
    .div(publs_lim_grouped["n_authors"])
    .mul(100)
)

In [None]:
# >>> Calculation
from scipy import stats

# Rank correlation between the year and that year's share of female authors.
print("Calculating Spearman's Correlation Coefficient...")
publs_lim_cor = stats.spearmanr(
    a=publs_lim_grouped["year"],
    b=publs_lim_grouped["fem_share_authors"],
)
print("Done!")
print(publs_lim_cor)
# Caveat (author's note): the p-value cannot be trusted here, because there are
# only 53 observations and the scipy documentation treats p-values as reliable
# only for datasets larger than about 500:
# https://docs.scipy.org/doc/scipy-0.14.0/reference/generated/scipy.stats.spearmanr.html

In [None]:
# Rank correlation between the year and that year's share of male authors.
# Note: this overwrites the result of the previous correlation cell.
print("Calculating Spearman's Correlation Coefficient...")
publs_lim_cor = stats.spearmanr(
    a=publs_lim_grouped["year"],
    b=publs_lim_grouped["m_share_authors"],
)
print("Done!")
print(publs_lim_cor)

In [None]:
# Rank correlation between the year and that year's share of authors of unknown gender.
# Note: this overwrites the result of the previous correlation cell.
print("Calculating Spearman's Correlation Coefficient...")
publs_lim_cor = stats.spearmanr(
    a=publs_lim_grouped["year"],
    b=publs_lim_grouped["u_share_authors"],
)
print("Done!")
print(publs_lim_cor)

In [None]:
# >>> Visualization
import os

import matplotlib.pyplot as plt
import matplotlib as mpl

# Global plot settings. 'pgf.rcfonts' = False keeps matplotlib from setting up
# fonts from the rc parameters in the exported .pgf file.
mpl.rcParams['pgf.rcfonts'] = False
mpl.rcParams['font.size'] = 11.0

# savefig() does not create missing directories, so make sure the target exists.
os.makedirs('graphs/h6', exist_ok=True)

# Draw on an explicit, fresh figure instead of pyplot's implicit "current
# figure", so the chart never picks up lines left over from an earlier plot.
fig, ax = plt.subplots()
ax.plot(publs_lim_grouped["year"], publs_lim_grouped["fem_share_authors"], label="Female authors", color="black")
ax.plot(publs_lim_grouped["year"], publs_lim_grouped["fem_share_contribs"], label="Contributions by female authors", color="grey")

ax.set_xlabel("Year")
ax.set_ylabel("Share in %")
ax.legend()

# Export the chart both as PDF and as PGF.
fig.savefig('graphs/h6/share_women_per_year_2.pdf')
fig.savefig('graphs/h6/share_women_per_year_2.pgf')

In [None]:
import os

# savefig() does not create missing directories, so make sure the target exists.
os.makedirs('graphs/h6', exist_ok=True)

# Start a fresh figure explicitly. With bare plt.plot() the lines would be added
# to whatever figure is currently active, which outside the inline notebook
# backend is still the previous chart.
fig, ax = plt.subplots()
ax.plot(publs_lim_grouped["year"], publs_lim_grouped["fem_share_authors"], label="Female authors", color="black")
ax.plot(publs_lim_grouped["year"], publs_lim_grouped["u_share_authors"], label="Authors of unknown gender \n (score < 5)", color="grey")

ax.set_xlabel("Year")
ax.set_ylabel("Share in %")
ax.legend()

# Export the chart both as PDF and as PGF.
fig.savefig('graphs/h6/share_women_and_unknown_per_year.pdf')
fig.savefig('graphs/h6/share_women_and_unknown_per_year.pgf')

In [None]:
# >>> Interpretation:
# The hypothesis is rejected. The share of (contributions made by) female authors grows slightly.