In [None]:
# HYPOTHESIS TESTING

In [None]:
# Hypothesis 4:
# Teams of collaborating authors mostly consist of more men than women.

# Hypothesis 5:
# Bigger teams are more diverse.

In [None]:
# >>> Preparation
import pandas

# Columns of the publications file that are loaded for the hypothesis tests.
cols = [
    'key', 'pub_cat', 'pub_type', 'title', 'year', 'authors',
    'n_authors', 'n_males', 'n_females', 'n_unknown', 'ratio',
    'n_contribs_females', 'n_contribs_males', 'n_contribs_unknown',
]

print("Importing publications... ")

# Read the selected columns and index the frame by publication key in one step.
# drop=False keeps "key" available as a regular column as well.
# Setting index & accessing cells: https://pythonhow.com/accessing-dataframe-columns-rows-and-cells/
publs = (
    pandas
    .read_csv("data/publications_cor2.csv", usecols=cols)
    .set_index("key", drop=False)
)

# Show the first five rows as a quick sanity check of the import.
print("Publications imported. They look like this: {}".format(publs.head(5)))

In [None]:
# Keep only publications written by a team, i.e. by more than one author.
team_publs = publs.loc[publs['n_authors'].gt(1)]

In [None]:
# Outlier calculation
import numpy as np

print("Calculating from what point on a team is an outlier, considering its size...")

# First and third quartile of the team sizes, computed in a single call.
lower, higher = np.percentile(team_publs["n_authors"], [25, 75])
interquart_range = higher - lower

# Upper outer fence: third quartile plus three interquartile ranges.
outer_fence = higher + interquart_range * 3
print("The outer fence is {}".format(outer_fence))

In [None]:
# Keep only regular-sized teams: publications whose team size does not exceed the outer fence.
small_teams = team_publs.loc[team_publs["n_authors"].le(outer_fence)]

In [None]:
# >>> Testing
# H4
# Relative values
# Publications Contributed to

# Finding the mode of a series: https://stackoverflow.com/questions/10797819/finding-the-mode-of-a-list
from statistics import mode

print("Calculating the mode...")

# Most frequent ratio over every small-team publication.
mode_all = mode(small_teams["ratio"])

# Same statistic, restricted to publications in which the number of unclassified
# authors differs from the team size, i.e. at least one author was classified.
not_all_unknown = small_teams.loc[small_teams["n_unknown"].ne(small_teams["n_authors"])]
mode_not_all_unknown = mode(not_all_unknown["ratio"])

print("The mode for all is {}.".format(mode_all))
print(
    "The mode for those where at least one author could be classified is {}.".format(
        mode_not_all_unknown
    )
)

In [None]:
# Best-case scenario of gender equality for women: every unclassified author is counted as female.
def new_ratio(n, f, u):
    """Return the ratio of a team when all unclassified authors are counted as women.

    Parameters:
        n: total number of authors in the team.
        f: number of authors classified as female.
        u: number of authors that could not be classified.

    Returns:
        (n / 2 - (f + u)) / n * 100, i.e. the share of the team (in percent)
        by which the women fall short of half the team. 0 means parity,
        positive values mean fewer women than half, negative values more.
    """
    women_at_most = f + u
    missing_for_parity = n / 2.0 - women_at_most
    return missing_for_parity / n * 100

print("Calculating ratios for the case that all unclassified authors are women...")

# One best-case ratio per publication. The three count columns are walked in
# lockstep, which avoids materialising a Series for every row as iterrows() does.
ratios = [
    new_ratio(n, f, u)
    for n, f, u in zip(
        small_teams["n_authors"],
        small_teams["n_females"],
        small_teams["n_unknown"],
    )
]

print("Done!")

# The notebook imports the library as `pandas`; the alias `pd` is never defined,
# so the previous `pd.DataFrame(...)` raised a NameError.
ratios_df = pandas.DataFrame({'ratio': ratios})
ratios_df.describe()

In [None]:
# Collect the best-case ratios in a one-column frame and summarise their distribution.
ratios_df = pandas.DataFrame(data=dict(ratio=ratios))
ratios_df.describe()

In [None]:
# Most frequent best-case ratio across the small-team publications.
mode(ratios_df.loc[:, "ratio"])

In [None]:
# Histogram of the best-case ratios (all unclassified authors counted as female).
# matplotlib is imported in this cell because `plt` is used here before the
# notebook's only other pyplot import, which sits in a later cell; without it a
# fresh top-to-bottom run stops here with a NameError.
import matplotlib.pyplot as plt

ratios_df["ratio"].hist(color="orange", bins=20)
plt.title("Histogram of gender inequality index of publications\n contributed to by more than one author\n (assuming all unclassified authors are female).")
plt.xlabel("Gender Inequality Index")
plt.ylabel("Amount of publications")

In [None]:
# Visualization
# Histogram of the ratio, leaving out publications where none of the authors could be classified.
import matplotlib.pyplot as plt

print("Visualizing Gender Inequality Index Histogram without publications, where no author could be classified...")

# Drawn fully transparent (alpha=0), so it is not visible itself.
# NOTE(review): presumably this only fixes the axes to the range of all team publications - confirm.
team_publs["ratio"].hist(bins=20, color="orange", alpha=0)

# The visible histogram: publications with at least one classified author.
not_all_unknown["ratio"].hist(bins=20, color="orange")

plt.title("Histogram of gender inequality index of publications\n contributed to by more than one author\n where at least one author was classified by gender.")
plt.xlabel("Gender Inequality Index")
plt.ylabel("Amount of publications")

In [None]:
# Histogram of the ratio including publications whose authors are all of unknown gender.
print("Visualizing Gender Inequality Index Histogram...")

# NOTE(review): both histograms are opaque and share the same colour, so the
# second one cannot be told apart from the first - confirm this overlay is intended.
team_publs["ratio"].hist(bins=20, color="orange")
not_all_unknown["ratio"].hist(bins=20, color="orange")

plt.title("Histogram of gender inequality index of publications\n contributed to by more than one author\n")
plt.xlabel("Gender Inequality Index")
plt.ylabel("Amount of publications")

In [None]:
# >>> Evaluation
# Hypothesis 4 is correct. 

In [None]:
# >>> Testing
# H5 Bigger teams are more diverse.

In [None]:
# Group the small-team publications by team size for the per-size statistics.
team_publs_by_n_authors = small_teams.groupby(by="n_authors")

In [None]:
# Calculation of distribution statistics.
def m(ratio):
    '''Return the mode of ``ratio``, or ``None`` if no mode can be determined.

    ``statistics.mode`` raises ``StatisticsError`` for empty data and, on
    Python versions before 3.8, also when there is more than one mode. Only
    that error is turned into ``None``; any other exception (for example a
    ``TypeError`` for non-iterable input, or ``KeyboardInterrupt``) propagates
    instead of being silently swallowed.

    Parameters:
        ratio: iterable of hashable values, e.g. one group's "ratio" column.

    Returns:
        The most common value, or ``None`` when ``statistics.mode`` cannot
        determine one.
    '''
    # Imported locally so the function does not rely on an import made in another cell.
    from statistics import StatisticsError, mode

    try:
        return mode(ratio)
    except StatisticsError:
        return None

In [None]:
print("Calculating distribution statistics: mode, mean, std, median - per team size...")

# Each statistic is aggregated per team size; the group key (the frame's index)
# is then copied into an explicit "n_authors" column.
team_ratio_mode = (
    team_publs_by_n_authors
    .agg({'ratio': m})
    .assign(n_authors=lambda stat: stat.index.values.tolist())
)

team_ratio_mean = (
    team_publs_by_n_authors
    .agg({'ratio': 'mean'})
    .assign(n_authors=lambda stat: stat.index.values.tolist())
)

team_ratio_std = (
    team_publs_by_n_authors
    .agg({'ratio': 'std'})
    .assign(n_authors=lambda stat: stat.index.values.tolist())
)

team_ratio_median = (
    team_publs_by_n_authors
    .agg({'ratio': 'median'})
    .assign(n_authors=lambda stat: stat.index.values.tolist())
)
print("Done!")

In [None]:
print("Calculating Spearman's Correlation Coefficient...")
# small_teams also holds text columns (key, title, authors, ...). Older pandas
# silently ignored them in corr(); pandas >= 2.0 raises on them instead. The
# numeric and boolean columns are therefore selected explicitly, which behaves
# the same on every pandas version.
small_teams_cor = (
    small_teams
    .select_dtypes(include=["number", "bool"])
    .corr(method="spearman")
)
print("Done!")
# Correlation of every numeric column with the team size.
print(small_teams_cor["n_authors"])
'''No correlation found.'''

In [None]:
# >>> Visualization
print("Visualizing distribution statistics per team size as well as individual gender inequality ratio values.")

# One line per statistic; each frame is indexed by team size, which becomes the x axis.
for stat_table, stat_label, line_color in (
    (team_ratio_mode, "Mode", "green"),
    (team_ratio_median, "Median", "blue"),
    (team_ratio_mean, "Mean", "orange"),
    (team_ratio_std, "Standard Deviation", "red"),
):
    plt.plot(stat_table["ratio"], label=stat_label, color=line_color)

# Individual publications as a very faint scatter behind the summary lines.
plt.scatter(
    small_teams["n_authors"],
    small_teams["ratio"],
    alpha=0.01,
    color="grey",
    label="Gender Inequality Index",
)

plt.title("Are bigger teams more diverse?")
plt.xlabel("Team Size")
plt.ylabel("Gender Inequality Index")
# Anchor a legend outside the plot: https://stackoverflow.com/questions/4700614/how-to-put-the-legend-out-of-the-plot
plt.legend(bbox_to_anchor=(1.5, 1))

In [None]:
# Best-case scenario of gender equality for women: every unclassified author is counted as female.
# Note: this cell defines new_ratio again; an identically-behaving definition exists earlier in the notebook.
def new_ratio(n, f, u):
    """Return the ratio of a team when all unclassified authors are counted as women.

    Parameters:
        n: total number of authors in the team.
        f: number of authors classified as female.
        u: number of authors that could not be classified.

    Returns:
        (n / 2 - (f + u)) / n * 100, i.e. the share of the team (in percent)
        by which the women fall short of half the team. 0 means parity,
        positive values mean fewer women than half, negative values more.
    """
    women_at_most = f + u
    missing_for_parity = n / 2.0 - women_at_most
    return missing_for_parity / n * 100

print("Calculating ratios for the case that all unclassified authors are women...")

# Map each publication's index label to its best-case ratio and its team size.
ratios_2 = {
    index: {
        "ratio": new_ratio(row["n_authors"], row["n_females"], row["n_unknown"]),
        "n_authors": row["n_authors"],
    }
    for index, row in small_teams.iterrows()
}

print("Done!")

# Outer keys become the row index, inner keys the columns.
ratios_df_2 = pandas.DataFrame.from_dict(ratios_2, orient="index")
ratios_df_2.describe()

In [None]:
# Group the best-case ratios by team size for the per-size statistics.
team_publs_by_n_authors_2 = ratios_df_2.groupby(by="n_authors")

In [None]:
print("Calculating distribution statistics: mode, mean, std, median - per team size...")

# Each statistic is aggregated per team size; the group key (the frame's index)
# is then copied into an explicit "n_authors" column.
team_ratio_mode_2 = (
    team_publs_by_n_authors_2
    .agg({'ratio': m})
    .assign(n_authors=lambda stat: stat.index.values.tolist())
)

team_ratio_mean_2 = (
    team_publs_by_n_authors_2
    .agg({'ratio': 'mean'})
    .assign(n_authors=lambda stat: stat.index.values.tolist())
)

team_ratio_std_2 = (
    team_publs_by_n_authors_2
    .agg({'ratio': 'std'})
    .assign(n_authors=lambda stat: stat.index.values.tolist())
)

team_ratio_median_2 = (
    team_publs_by_n_authors_2
    .agg({'ratio': 'median'})
    .assign(n_authors=lambda stat: stat.index.values.tolist())
)
print("Done!")

In [None]:
print("Calculating Spearman's Correlation Coefficient...")
# Rank correlation between the best-case ratio and the team size.
ratios_df_2_cor = ratios_df_2.corr("spearman")
print("Done!")
print(ratios_df_2_cor.loc[:, "n_authors"])
'''No correlation found.'''

In [None]:
# >>> Visualization
print("Visualizing distribution statistics per team size as well as individual gender inequality ratio values.")

# One line per statistic; each frame is indexed by team size, which becomes the x axis.
for stat_table, stat_label, line_color in (
    (team_ratio_mode_2, "Mode", "green"),
    (team_ratio_median_2, "Median", "blue"),
    (team_ratio_mean_2, "Mean", "orange"),
    (team_ratio_std_2, "Standard Deviation", "red"),
):
    plt.plot(stat_table["ratio"], label=stat_label, color=line_color)

# Individual best-case ratios as a very faint scatter behind the summary lines.
plt.scatter(
    ratios_df_2["n_authors"],
    ratios_df_2["ratio"],
    alpha=0.01,
    color="grey",
    label="Gender Inequality Index",
)

plt.title("Are bigger teams more diverse?")
plt.xlabel("Team Size")
plt.ylabel("Gender Inequality Index")
# Anchor a legend outside the plot: https://stackoverflow.com/questions/4700614/how-to-put-the-legend-out-of-the-plot
plt.legend(bbox_to_anchor=(1.5, 1))

In [None]:
# >>> Interpretation:
# Hypothesis 5 is not correct.