In [None]:
# HYPOTHESIS TESTING

In [None]:
# Hypothesis 2:
# Teams of collaborating authors consist of an equal number of men and women.

In [None]:
# >>> Preparation
import pandas

# Load only the columns this notebook needs from the publications CSV.
print("Importing publications... ")

# The column names are grouped purely for readability; together they form the
# same list, in the same order, that is handed to read_csv.
descriptive_cols = ['key', 'pub_cat', 'pub_type', 'title', 'year', 'authors']
count_cols = ['n_authors', 'n_males', 'n_females', 'n_unknown', 'ratio']
contrib_cols = ['n_contribs_females', 'n_contribs_males', 'n_contribs_unknown']
cols = descriptive_cols + count_cols + contrib_cols

# Index the frame by "key" while keeping "key" as a regular column (drop=False).
# Setting index & accessing cells: https://pythonhow.com/accessing-dataframe-columns-rows-and-cells/
publs = pandas.read_csv("../_data/publications.csv", usecols=cols).set_index("key", drop=False)

# Show the first row as a quick sanity check of the import.
first_row = publs[:1]
print("Publications imported. They look like this: {}".format(first_row))

In [None]:
# Keep the publications with more than one author (teams) ...
has_multiple_authors = publs['n_authors'] > 1
team_publs = publs.loc[has_multiple_authors]

# ... and of those, only the rows whose n_unknown differs from n_authors,
# i.e. teams that are not made up entirely of authors counted as unknown.
not_entirely_unknown = team_publs["n_unknown"].ne(team_publs["n_authors"])
not_all_unknown = team_publs.loc[not_entirely_unknown]

In [None]:
# Per-publication difference (male minus female) used as input for the
# statistical test that follows.
# NOTE(review): the message speaks of men and women per team, but the columns
# subtracted are n_contribs_males / n_contribs_females rather than
# n_males / n_females -- confirm this is the intended pair.
print("Calculating differences between amount of men and women per team for Sign Test...")

male_contribs = not_all_unknown["n_contribs_males"]
female_contribs = not_all_unknown["n_contribs_females"]

# Keep only the raw values; the publication index is not needed from here on.
diff = male_contribs.sub(female_contribs).values

print("{} differences calculated.".format(len(diff)))

In [None]:
# >>> Testing
# Wilcoxon signed-rank test on the per-publication differences.
# scipy.stats.wilcoxon implements the signed-rank test (it ranks the absolute
# differences), which is a different procedure from a plain sign test, so the
# messages in this cell name the test that is actually executed.
# https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.wilcoxon.html
from scipy.stats import wilcoxon

print("Executing Wilcoxon signed-rank test...")

# Called with a single sample, the test checks whether the differences are
# distributed symmetrically about zero. W is the test statistic, p the p-value.
W, p = wilcoxon(diff)

print("Test returned W={} and two-sided p-value={}".format(W, p))

In [None]:
import os

import matplotlib as mpl
import matplotlib.pyplot as plt

# Plot settings: do not let the pgf backend set fonts from rcParams, and use
# an 11pt base font size.
mpl.rcParams['pgf.rcfonts'] = False
mpl.rcParams['font.size'] = 11.0

print("Visualizing Gender Inequality Index Histogram...")

# Shared path stem of the exported files; the extension is appended per format.
graph_stem = '../_graphs/h2/gender_inequality_index_hist_not_all_unknown'

# savefig does not create missing folders, so make sure the target exists
# (otherwise a fresh checkout fails with FileNotFoundError).
os.makedirs(os.path.dirname(graph_stem), exist_ok=True)

# Draw on an explicit figure/axes pair instead of whatever figure pyplot
# currently considers active, so a leftover figure cannot leak into the export.
fig, ax = plt.subplots()
not_all_unknown["ratio"].hist(ax=ax, color="grey", bins=20)
ax.set_xlabel("Gender Inequality Index")
ax.set_ylabel("Amount of publications")

# Export the same figure once per output format.
for extension in ('pdf', 'pgf'):
    fig.savefig('{}.{}'.format(graph_stem, extension))

print("Graphs saved at {}".format(graph_stem))

In [None]:
# >>> Evaluation
# Hypothesis 2 is rejected at the 5% significance level because p < 0.05.