In [None]:
# Hypothesis Testing
# 6: Over time, the gender distribution in teams of authors became more equal

In [None]:
# >>> Preparation
import pandas

In [None]:
# Load the publication records, restricted to the columns this analysis uses.
print("Importing publications... ")
cols = [
    'key', 'pub_cat', 'pub_type', 'title', 'year', 'authors',
    'n_authors', 'n_males', 'n_females', 'n_unknown', 'ratio',
    'n_contribs_females', 'n_contribs_males', 'n_contribs_unknown',
]

# Index the rows by publication key while keeping "key" as a regular column
# (drop=False), so it is still available after group-by/count operations.
# Setting index & accessing cells: https://pythonhow.com/accessing-dataframe-columns-rows-and-cells/
publs = (
    pandas.read_csv("data/publications_cor2.csv", usecols=cols)
    .set_index("key", drop=False)
)
print("Publications imported. They look like this: {}".format(publs.head(5)))

In [None]:
# >>> Correlation Matrix
# Creation
# Spearman rank correlation between all numeric columns.
# The numeric/bool columns are selected explicitly: since pandas 2.0,
# DataFrame.corr no longer drops non-numeric columns silently
# (numeric_only defaults to False) and raises on them instead.
publs_cor = publs.select_dtypes(include=["number", "bool"]).corr(method='spearman')

In [None]:
# Visualization: annotated heatmap of the correlation matrix `publs_cor`.
# visualize correlation matrix https://stackoverflow.com/questions/29432629/plot-correlation-matrix-using-pandas
# annotated https://matplotlib.org/gallery/images_contours_and_fields/image_annotated_heatmap.html#sphx-glr-gallery-images-contours-and-fields-image-annotated-heatmap-py
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import cm

# Diverging red/blue colormap quantized to 10 levels.
# plt.get_cmap replaces cm.get_cmap, which was deprecated in matplotlib 3.7
# and removed in 3.9. (The variable name is historical; the map is 'RdBu'.)
autumn = plt.get_cmap('RdBu', 10)

# draw a big matrix: https://stackoverflow.com/questions/332289/how-do-you-change-the-size-of-figures-drawn-with-matplotlib
fig, ax = plt.subplots(figsize=(20, 10))

# Pin the colour scale to the full correlation range so that 0 sits on the
# neutral midpoint of the diverging colormap, and show the scale.
im = ax.imshow(publs_cor, cmap=autumn, vmin=-1, vmax=1)
fig.colorbar(im, ax=ax)

# One tick per column, labelled with the column name, on both axes.
labels = publs_cor.columns.values
positions = np.arange(len(labels))
ax.set_xticks(positions)
ax.set_xticklabels(labels)
ax.set_yticks(positions)
ax.set_yticklabels(labels)

# Slanted x labels, anchored at their right end so they line up with the ticks.
plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor")

# Write the rounded coefficient into every cell (x = column j, y = row i).
for i in range(len(labels)):
    for j in range(len(labels)):
        # round https://www.tutorialspoint.com/python/number_round.htm
        ax.text(j, i, round(publs_cor.iloc[i, j], 2), ha="center", va="center", color="k")

ax.set_title("Correlation Matrix")

plt.show()

# We see that the year does not seem to correlate with the gender equality index ("ratio"): The spearman coefficient is -0.04

In [None]:
# Correlation matrix for relevant years only
# Creation

# Bucket the publications by their publication year.
publs_by_year_group = publs.groupby(by="year")

# Per-year count of non-null entries in each column; the "key" column of this
# frame is used below as the number of publications per year.
publs_sum_by_year = publs_by_year_group.count()

In [None]:
# Determine the range of years considered valid for the analysis.
# Lower bound: the earliest year that has more than 1000 publications.
min_publs_per_year = 1000
has_enough_publs = publs_sum_by_year['key'] > min_publs_per_year
min_year = publs_sum_by_year.loc[has_enough_publs].index.values.min() # 1966
# Upper bound -- To Adjust: We only have 4 months of 2019 with the dataset of April 2019
max_year = 2018

In [None]:
# Keep only publications whose year lies in [min_year, max_year] (both inclusive).
publs_lim = publs[(publs['year'] >= min_year) & (publs['year'] <= max_year)]

# Spearman correlation matrix of the year-limited data.
# Numeric/bool columns are selected explicitly: since pandas 2.0,
# DataFrame.corr no longer drops non-numeric columns silently
# (numeric_only defaults to False) and raises on them instead.
publs_lim_cor = publs_lim.select_dtypes(include=["number", "bool"]).corr(method='spearman') # correlation matrix

In [None]:
# Visualization: annotated heatmap of the year-limited correlation matrix `publs_lim_cor`.
# visualize correlation matrix https://stackoverflow.com/questions/29432629/plot-correlation-matrix-using-pandas
# annotated https://matplotlib.org/gallery/images_contours_and_fields/image_annotated_heatmap.html#sphx-glr-gallery-images-contours-and-fields-image-annotated-heatmap-py
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import cm

# 'autumn' colormap quantized to 10 levels.
# plt.get_cmap replaces cm.get_cmap, which was deprecated in matplotlib 3.7
# and removed in 3.9.
autumn = plt.get_cmap('autumn', 10)

# draw a big matrix: https://stackoverflow.com/questions/332289/how-do-you-change-the-size-of-figures-drawn-with-matplotlib
fig, ax = plt.subplots(figsize=(20, 10))

# Pin the colour scale to the full correlation range [-1, 1] so colours mean
# the same thing regardless of the values present, and show the scale.
im = ax.imshow(publs_lim_cor, cmap=autumn, vmin=-1, vmax=1)
fig.colorbar(im, ax=ax)

# One tick per column, labelled with the column name, on both axes.
labels = publs_lim_cor.columns.values
positions = np.arange(len(labels))
ax.set_xticks(positions)
ax.set_xticklabels(labels)
ax.set_yticks(positions)
ax.set_yticklabels(labels)

# Slanted x labels, anchored at their right end so they line up with the ticks.
plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor")

# Write the rounded coefficient into every cell (x = column j, y = row i).
for i in range(len(labels)):
    for j in range(len(labels)):
        # round https://www.tutorialspoint.com/python/number_round.htm
        ax.text(j, i, round(publs_lim_cor.iloc[i, j], 2), ha="center", va="center", color="k")


ax.set_title("Correlation Matrix")

plt.show()

# We see that the year does not seem to correlate with the gender equality index ("ratio")
# even if the years that have too few publications are taken out: The spearman coefficient is -0.04

In [None]:
# >>> Closer look at the distribution of the equality index
# Any publications: summary statistics of the equality index over the full dataset
publs.loc[:, 'ratio'].describe()

In [None]:
# Same summary statistics, restricted to the year-limited dataset
publs_lim.loc[:, 'ratio'].describe()

In [None]:
# Publications written by 2 or more authors,
# once for the full dataset and once for the year-limited one
only_coop_publs = publs.loc[publs['n_authors'].gt(1)]
only_coop_publs_lim = publs_lim.loc[publs_lim['n_authors'].gt(1)]

In [None]:
# Complement of the co-authored subsets: publications with at most one author
non_coop_publs = publs.loc[publs['n_authors'].le(1)]
non_coop_publs_lim = publs_lim.loc[publs_lim['n_authors'].le(1)]

In [None]:
# Overlaid histograms of the equality index for co-authored publications:
# the full dataset versus the year-limited dataset.

# NOTE(review): `bins` is computed but was never passed to hist(), so both
# histograms use the default binning -- confirm whether bins=bins was intended.
bins = int(abs(publs['ratio'].max()) + abs(publs['ratio'].min()))

fig, ax = plt.subplots()
only_coop_publs['ratio'].hist(ax=ax, label="Publications written in cooperation (any year)", alpha=0.5)
# The year-limited subset contains only min_year..max_year; the previous label
# ("including years before 1966") stated the opposite.
only_coop_publs_lim['ratio'].hist(
    ax=ax,
    label="Publications written in cooperation ({}-{} only)".format(min_year, max_year),
    alpha=0.5,
)
ax.set_xlabel("Equality Index")
ax.legend()
ax.set_title("Histogram of Equality Index")
plt.show()
# The equality index only makes sense if the publication was written in cooperation.
# We see that there are over 3 Million cases where the equality index is over 50. 
# There are 1 Million cases where the equality index is 0, meaning perfect equality.
# This could be because of the huge amount of unknown authors.
# There are some few cases between 0 and 50 and very few where there are more women than men in a team of authors.
# Taking out those years with few publications has no influence.

In [None]:
# >>> Display development of ratio over time

In [None]:
# How many publications by year?
# Group the year-limited publications by year ...
valid_publs_by_year_group = publs_lim.groupby(by="year")
# ... and count the non-null entries of every column within each year.
valid_publs_sum_by_year = valid_publs_by_year_group.count()

In [None]:
# Per-year mean, median and population standard deviation of the equality index.
# Each result is a DataFrame indexed by year with a single 'ratio' column.
from statistics import mode, pstdev
valid_ratio_mean_by_year = valid_publs_by_year_group.agg({'ratio':'mean'})
valid_ratio_median_by_year = valid_publs_by_year_group.agg({'ratio':'median'})
# Population standard deviation (ddof=0) computed by pandas instead of calling
# the pure-Python statistics.pstdev once per group: vectorised, and it skips
# missing values the same way the mean and median above do.
valid_ratio_std_by_year = valid_publs_by_year_group[['ratio']].std(ddof=0)
#ratio_mode_by_year = valid_publs_by_year_group.agg({'ratio':mode})

In [None]:
# Look up matplotlib's "tab20c" colormap by name.
cmap = plt.get_cmap(name="tab20c")

In [None]:
# Visualization: equality-index statistics (left axis) against the number of
# publications per year (right axis).
#https://matplotlib.org/gallery/api/two_scales.html

fig, ax1 = plt.subplots()

ax1.set_xlabel('Year')
ax1.set_ylabel('Equality Index')

# Left axis: one line per yearly statistic of the equality index.
for stats, colour, caption in (
    (valid_ratio_median_by_year, 'g', "Median Equality Index"),
    (valid_ratio_mean_by_year, 'b', "Mean Equality Index"),
    (valid_ratio_std_by_year, 'turquoise', "Standard Deviation of Equality Index"),
):
    ax1.plot(stats.index.values, stats['ratio'].values, color=colour, label=caption)

ax1.tick_params(axis='y')

# set y limits for a subplot: https://stackoverflow.com/questions/15858192/how-to-set-xlim-and-ylim-for-a-subplot-in-matplotlib
ax1.set_ylim([0,55])

ax1.legend(loc="center left")

# Right axis: a second y-scale sharing the same x-axis (the x-label is already set on ax1).
ax2 = ax1.twinx()

ax2.set_ylabel('Publications')
ax2.plot(
    valid_publs_sum_by_year.index.values,
    valid_publs_sum_by_year['key'].values,
    color='r',
    label="Amount of Publications",
)
ax2.tick_params(axis='y')

# Title and second legend are attached to the twin axes.
ax2.set_title("Development of Equality Index vs. Amount of Publications")
ax2.legend(loc="lower right")
plt.show()

In [None]:
# How many authors by year? (sum of n_authors over each year's publications)
valid_publs_authors_by_year = valid_publs_by_year_group[['n_authors']].sum()

In [None]:
# How many female authors by year? (sum of n_females over each year's publications)
valid_publs_f_by_year = valid_publs_by_year_group[['n_females']].sum()

In [None]:
# How many male authors by year? (sum of n_males over each year's publications)
valid_publs_m_by_year = valid_publs_by_year_group[['n_males']].sum()

In [None]:
# How many unknown authors by year? (sum of n_unknown over each year's publications)
valid_publs_u_by_year = valid_publs_by_year_group[['n_unknown']].sum()

In [None]:
# Equality-index statistics (left axis) against yearly author counts (right axis).
#https://matplotlib.org/gallery/api/two_scales.html

fig, ax1 = plt.subplots()

ax1.set_xlabel('Year')
ax1.set_ylabel('Equality Index')

# Left axis: one line per yearly statistic of the equality index.
for stats, colour, caption in (
    (valid_ratio_median_by_year, 'g', "Median Equality Index"),
    (valid_ratio_mean_by_year, 'b', "Mean Equality Index"),
    (valid_ratio_std_by_year, 'turquoise', "Standard Deviation of Equality Index"),
):
    ax1.plot(stats.index.values, stats['ratio'].values, color=colour, label=caption)

ax1.tick_params(axis='y')

# set y limits for a subplot: https://stackoverflow.com/questions/15858192/how-to-set-xlim-and-ylim-for-a-subplot-in-matplotlib
ax1.set_ylim([0,55])

ax1.legend(loc="center left")

# Right axis: a second y-scale sharing the same x-axis (the x-label is already set on ax1).
ax2 = ax1.twinx()

ax2.set_ylabel('Authors')
# One line per author count: total, female, male, unknown gender.
for counts, column, colour, caption in (
    (valid_publs_authors_by_year, 'n_authors', 'xkcd:orange', "Amount of Authors"),
    (valid_publs_f_by_year, 'n_females', 'xkcd:rose', "Amount of Female Authors"),
    (valid_publs_m_by_year, 'n_males', 'xkcd:peach', "Amount of Male Authors"),
    (valid_publs_u_by_year, 'n_unknown', 'xkcd:salmon', "Amount of Authors of Unknown Gender"),
):
    ax2.plot(counts.index.values, counts[column].values, color=colour, label=caption)
ax2.tick_params(axis='y')

# Title and second legend are attached to the twin axes.
ax2.set_title("Development of Equality Index vs. Amount of Authors")
ax2.legend(loc="lower left")
plt.show()

In [None]:
# >>> Evaluation
# The gender equality does not correlate with the year of the publication.
# It stagnates around the same value, looking at median and mean. The std also stays similar. 
# In other words: Yes, we see a slight increase in the number of female authors; however, the total number of 
# authors also rises, so the gender equality index does not improve.