In [None]:
# HYPOTHESIS TESTING

In [None]:
# Hypothesis 3:
# A male author is as productive as a female author.

In [None]:
# >>> Preparation
# Load the gender-categorized author names from 'data/names_cat_i3.csv',
# reading only the columns this analysis works with.
import pandas

wanted_columns = ["name", "n_publs", "likely_gender", "score", "n_contribs"]

print("Importing names... ")
# Index the rows by author name while keeping "name" as a regular column
# (drop=False). Background on setting an index and accessing cells:
# https://pythonhow.com/accessing-dataframe-columns-rows-and-cells/
names = pandas.read_csv("data/names_cat_i3.csv", usecols=wanted_columns).set_index("name", drop=False)
print("Names imported. They look like this: {}".format(names.head(5)))

In [None]:
# Import of libraries
from statistics import mode
import numpy
import matplotlib.pyplot as plt

In [None]:
# Outlier calculation
# Computes the outer fence Q3 + 3 * IQR over the per-author contribution
# counts. The next cell keeps only authors whose count does not exceed it.
print("Calculating from what point on an author is an outlier, considering the number of contributions...")
import numpy as np

# Multiplier applied to the interquartile range to obtain the outer fence.
IQR_FENCE_FACTOR = 3

# nanpercentile skips missing counts; plain np.percentile would return NaN as
# soon as one value is missing, which would turn the fence into NaN and make
# every "<= outer_fence" comparison downstream evaluate to False.
lower, higher = np.nanpercentile(names["n_contribs"], [25, 75])
interquart_range = higher - lower
outer_fence = higher + interquart_range * IQR_FENCE_FACTOR
print("The outer fence is {}".format(outer_fence))

In [None]:
# Get data to analyze
# Authors whose contribution count does not exceed the outer fence.
within_fence = names["n_contribs"] <= outer_fence
regular_authors = names.loc[within_fence]

# Of those, the authors with a score of at least 5.
# NOTE(review): presumably "score" expresses how reliable the gender
# classification is — confirm against the categorization step.
has_min_score = regular_authors["score"] >= 5
regular_authors_max = regular_authors.loc[has_min_score]

In [None]:
# >>> Descriptive statistics
# Mean, median and mode of the contribution counts: overall, per gender
# (authors with score >= 5), for authors of unknown gender (score < 5) and for
# "potentially female" authors (labelled female OR score < 5).

# --- All regular authors ---
all_counts = regular_authors["n_contribs"]
n_contribs_mean = all_counts.mean()
n_contribs_median = np.median(all_counts)  # https://docs.scipy.org/doc/numpy-1.13.0/reference/generated/numpy.median.html
n_contribs_mode = mode(all_counts)

# --- Per gender (score >= 5 only) ---
# One groupby object serves all three aggregations. Each summary table also
# gets its index copied into a regular "likely_gender" column.
grouped_by_gender = regular_authors_max.groupby("likely_gender")

n_contribs_mean_by_gender = grouped_by_gender.agg({"n_contribs": "mean"})  # Mean by gender
n_contribs_mean_by_gender["likely_gender"] = n_contribs_mean_by_gender.index.values

n_contribs_median_by_gender = grouped_by_gender.agg({"n_contribs": "median"})  # Median by gender
n_contribs_median_by_gender["likely_gender"] = n_contribs_median_by_gender.index.values

n_contribs_mode_by_gender = grouped_by_gender.agg({"n_contribs": mode})  # Mode by gender
n_contribs_mode_by_gender["likely_gender"] = n_contribs_mode_by_gender.index.values

# Female authors
n_contribs_f_mean = n_contribs_mean_by_gender.loc["female", "n_contribs"]
n_contribs_f_median = n_contribs_median_by_gender.loc["female", "n_contribs"]
n_contribs_f_mode = n_contribs_mode_by_gender.loc["female", "n_contribs"]

# Male authors
n_contribs_m_mean = n_contribs_mean_by_gender.loc["male", "n_contribs"]
n_contribs_m_median = n_contribs_median_by_gender.loc["male", "n_contribs"]
n_contribs_m_mode = n_contribs_mode_by_gender.loc["male", "n_contribs"]

# --- Authors of unknown gender (score < 5) ---
is_unknown = regular_authors["score"] < 5
unknown_counts = regular_authors.loc[is_unknown, "n_contribs"]
n_contribs_u_mean = unknown_counts.mean()      # mean for unknown authors
n_contribs_u_median = unknown_counts.median()  # median for unknown authors
n_contribs_u_mode = mode(unknown_counts)       # mode for unknown authors

# --- Potentially female authors: labelled female, or gender unknown ---
is_female = regular_authors["likely_gender"] == "female"
contribs_fmax = regular_authors[is_female | is_unknown]

fmax_counts = contribs_fmax["n_contribs"]
n_contribs_fmax_mean = fmax_counts.mean()
n_contribs_fmax_median = fmax_counts.median()
n_contribs_fmax_mode = mode(fmax_counts)

# --- Report ---
print("On average, an author of unknown gender makes: mean={}, median={} and mode={} contributions".format(
    n_contribs_u_mean, n_contribs_u_median, n_contribs_u_mode))
print("On average, a female author makes: mean={}, median={} and mode={} contributions".format(
    n_contribs_f_mean, n_contribs_f_median, n_contribs_f_mode))
print("On average, a male author makes: mean={}, median={} and mode={} contributions".format(
    n_contribs_m_mean, n_contribs_m_median, n_contribs_m_mode))
print("On average, a potentially female author makes: mean={}, median={} and mode={} contributions".format(
    n_contribs_fmax_mean, n_contribs_fmax_median, n_contribs_fmax_mode))

In [None]:
# >>> Hypothesis Testing
# Which hypothesis test to use? Inspect the skewness of both samples first.
from scipy.stats import skew

# Contribution counts of the authors labelled female / male.
# NOTE(review): these samples come from regular_authors (any score), whereas
# the per-gender descriptive statistics use regular_authors_max (score >= 5).
# Confirm that including low-score classifications here is intended.
gender_label = regular_authors["likely_gender"]
f = regular_authors.loc[gender_label == "female", "n_contribs"]
m = regular_authors.loc[gender_label == "male", "n_contribs"]

skew_m = skew(m)
skew_f = skew(f)
print("Skewness: m: {} f: {}".format(skew_m, skew_f))
# The skewness shows that the data is not normally distributed, so the
# hypothesis is tested with the Mann-Whitney U test:
# https://www.jstor.org/stable/2236101?read-now=1&refreqid=excelsior%3Ad37490a1fce1c6303bc555f6a6d4638e&seq=1#page_scan_tab_contents
# https://www.methodenberatung.uzh.ch/de/datenanalyse_spss/unterschiede/zentral/mann.html

In [None]:
# Mann Whitney U Test
# Two-sided comparison of the female (f) and male (m) contribution counts.
from scipy.stats import mannwhitneyu

# The raw counts are passed in directly; SciPy does the ranking itself.
U, p = mannwhitneyu(x=f, y=m, alternative="two-sided")
# U = number of times that an m rank comes before an f rank.
# NOTE(review): which sample's U statistic is returned may depend on the
# installed SciPy version — confirm against its documentation.
print("U = {}, p = {}".format(U, p))
# p < 0.05

In [None]:
# TODO: effect size, not computed yet: r = abs(z / math.sqrt(n)) -- see https://www.methodenberatung.uzh.ch/de/datenanalyse_spss/unterschiede/zentral/mann.html

In [None]:
# >>> Visualize
# For every statistic, draw the overall value as a dashed reference line across
# all groups and the per-group values as 'x' markers. The plot order below
# determines the order of the legend entries.
categories = ['male', 'female', 'possibly female', 'unknown']
n_categories = len(categories)

# Mean (red)
plt_mean = plt.plot(categories, [n_contribs_mean] * n_categories,
                    'r--', alpha=0.5, label='Overall Mean')
plt_g_mean = plt.plot(categories,
                      [n_contribs_m_mean, n_contribs_f_mean, n_contribs_fmax_mean, n_contribs_u_mean],
                      'rx', label='Mean by Gender')

# Median (blue)
plt_median = plt.plot(categories, [n_contribs_median] * n_categories,
                      'b--', alpha=0.5, label='Overall Median')
plt_g_median = plt.plot(categories,
                        [n_contribs_m_median, n_contribs_f_median, n_contribs_fmax_median, n_contribs_u_median],
                        'bx', label='Median by Gender')

# Mode (green)
plt_mode = plt.plot(categories, [n_contribs_mode] * n_categories,
                    'g--', alpha=0.5, label='Overall Mode')
plt_g_mode = plt.plot(categories,
                      [n_contribs_m_mode, n_contribs_f_mode, n_contribs_fmax_mode, n_contribs_u_mode],
                      'gx', label='Mode by Gender')

# Labels, title and a legend anchored at (1.1, 1) in axes coordinates.
plt.xlabel('Gender')
plt.ylabel('Amount of contributions made')
plt.title("How many contributions did authors make on average (by gender)?")
plt.legend(bbox_to_anchor=(1.1, 1))
plt.show()

In [None]:
# >>> Evaluation
# Hypothesis 3 is rejected: the Mann-Whitney U test indicates a significant difference (p < 0.05), and according to the diagram a male author is more productive than a female author.