In [None]:
# HYPOTHESIS TESTING

In [None]:
# Hypothesis 4:
# A male author is as productive as a female author.

In [3]:
# >>> Preparation
# Load the gender-categorized author table and index it by author name.
import pandas

print("Importing names... ")

# Only the columns needed for this hypothesis are read from the CSV.
# drop=False keeps "name" available as a regular column next to the index.
# Setting index & accessing cells: https://pythonhow.com/accessing-dataframe-columns-rows-and-cells/
names = pandas.read_csv(
    "../_data/names_cat_i3.csv",
    usecols=["name", "n_publs", "likely_gender", "score", "n_contribs"],
).set_index("name", drop=False)

# Preview the first five rows as a sanity check.
print("Names imported. They look like this: {}".format(names.head(5)))

Importing names... 
Names imported. They look like this:                                         name likely_gender  n_publs  score  \
name                                                                         
'Maseka Lesaoana            'Maseka Lesaoana        female        2      8   
(David) Jing Dai            (David) Jing Dai          male        1      7   
(Max) Zong-Ming Cheng  (Max) Zong-Ming Cheng          male        2      8   
(Sophy) Shu-Jiun Chen  (Sophy) Shu-Jiun Chen        female        2      7   
(Zhou) Bryan Bai            (Zhou) Bryan Bai          male        2      6   

                       n_contribs  
name                               
'Maseka Lesaoana         0.583333  
(David) Jing Dai         0.200000  
(Max) Zong-Ming Cheng    0.297619  
(Sophy) Shu-Jiun Chen    0.642857  
(Zhou) Bryan Bai         1.000000  


In [4]:
# Import of libraries
from statistics import mode
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl

In [5]:
# Get data to analyze
# Only authors whose "score" is at least 5 are kept for the analysis.
regular_authors = names.loc[names["score"] >= 5]

In [6]:
# >>> Descriptive statistics
# Central tendency of "n_contribs": first over all regular authors, then per gender.

# --- Overall ---
n_contribs_mean = regular_authors["n_contribs"].mean()
# https://docs.scipy.org/doc/numpy-1.13.0/reference/generated/numpy.median.html
n_contribs_median = np.median(regular_authors["n_contribs"])
n_contribs_mode = mode(regular_authors["n_contribs"])

# --- Per gender ---
# One groupby object is shared by the three aggregations below.
# Each result is indexed by "likely_gender"; the index is also copied into a
# regular "likely_gender" column.
authors_by_gender = regular_authors.groupby("likely_gender")

n_contribs_mean_by_gender = authors_by_gender.agg({"n_contribs": "mean"}).assign(
    likely_gender=lambda table: table.index.values
)
n_contribs_median_by_gender = authors_by_gender.agg({"n_contribs": "median"}).assign(
    likely_gender=lambda table: table.index.values
)
n_contribs_mode_by_gender = authors_by_gender.agg({"n_contribs": mode}).assign(
    likely_gender=lambda table: table.index.values
)

# Pull the scalar values for each gender out of the aggregated tables.
n_contribs_f_mean = n_contribs_mean_by_gender.loc["female", "n_contribs"]
n_contribs_f_median = n_contribs_median_by_gender.loc["female", "n_contribs"]
n_contribs_f_mode = n_contribs_mode_by_gender.loc["female", "n_contribs"]

n_contribs_m_mean = n_contribs_mean_by_gender.loc["male", "n_contribs"]
n_contribs_m_median = n_contribs_median_by_gender.loc["male", "n_contribs"]
n_contribs_m_mode = n_contribs_mode_by_gender.loc["male", "n_contribs"]

# Report: all authors, then female authors, then male authors.
print(
    "On average, an author makes: mean={}, median={} and mode={} contributions".format(
        n_contribs_mean, n_contribs_median, n_contribs_mode
    )
)
print(
    "On average, a female author makes: mean={}, median={} and mode={} contributions".format(
        n_contribs_f_mean, n_contribs_f_median, n_contribs_f_mode
    )
)
print(
    "On average, a male author makes: mean={}, median={} and mode={} contributions".format(
        n_contribs_m_mean, n_contribs_m_median, n_contribs_m_mode
    )
)


On average, an author makes: mean=2.141247911241926, median=0.5 and mode=0.3333333333333333 contributions
On average, a female author makes: mean=1.3841903867560146, median=0.5 and mode=0.3333333333333333 contributions
On average, a male author makes: mean=2.239408609184296, median=0.5428571428571429 and mode=0.3333333333333333 contributions


In [7]:
# >>> Hypothesis Testing
# Step 1: decide which hypothesis test fits the data.
print("Calculating Skewness to make sure the data is not normally distributed! Else we could use an easier test...")
from scipy.stats import skew

# Split the "n_contribs" values into one sample per gender
# (f = female authors, m = male authors).
f = regular_authors.loc[regular_authors["likely_gender"] == "female", "n_contribs"]
m = regular_authors.loc[regular_authors["likely_gender"] == "male", "n_contribs"]

print("Skewness: m: {} f: {}".format(skew(m), skew(f)))
# Both samples are strongly skewed, i.e. not normally distributed, so the
# hypothesis is tested with the Mann-Whitney U test.
# https://www.jstor.org/stable/2236101?read-now=1&refreqid=excelsior%3Ad37490a1fce1c6303bc555f6a6d4638e&seq=1#page_scan_tab_contents
# https://www.methodenberatung.uzh.ch/de/datenanalyse_spss/unterschiede/zentral/mann.html

Calculating Skewness to make sure the data is not normally distributed! Else we could use an easier test...
Skewness: m: 12.025294019009115 f: 15.87544046365454


In [8]:
# Mann-Whitney U test: compare the female (f) and male (m) "n_contribs" samples.
from scipy.stats import mannwhitneyu

# No manual ranking is needed; SciPy ranks the values itself.
# The test is two-sided because the hypothesis states equality, not a direction.
U, p = mannwhitneyu(x=f, y=m, alternative="two-sided")

# U = number of times that an m rank comes before an f rank.
print("Test returned U = {}, two sided p value= {}".format(U, p))
# Result: p < 0.05

Test returned U = 69084463785.5, two sided p value= 0.0


In [None]:
# >>> Evaluation
# Hypothesis 4 is rejected: the Mann-Whitney U test gives p < 0.05, and according to the descriptive statistics above (mean and median of n_contribs) a male author is more productive than a female author.