In [None]:
# HYPOTHESIS TESTING
# Preparing the Data

In [None]:
# Hypothesis 1: 
# Overall, there are more male than female authors. 

# Hypothesis 2: 
# In total, more has been written by men than by women.

In [None]:
# >>> Import
# Import the categorized names from 'data/names_cat_i2.csv'
import pandas

# Read the categorized names (only the four columns used in this notebook) and
# index the rows by name in a single chained expression.
print("Importing names... ")
names = (
    pandas.read_csv(
        "data/names_cat_i2.csv",
        usecols=["name", "n_publs", "likely_gender", "score"],
    )
    # drop=False keeps "name" as a regular column in addition to being the index.
    # Setting index & accessing cells: https://pythonhow.com/accessing-dataframe-columns-rows-and-cells/
    .set_index("name", drop=False)
)
print("Names imported. They look like this: {}".format(names[:5]))

In [None]:
# Read the publications table, restricted to the columns listed in `cols`,
# and index the rows by publication key.
print("Importing publications... ")
cols = [
    'key', 'pub_cat', 'pub_type', 'title', 'year', 'authors',
    'n_authors', 'n_males', 'n_females', 'n_unknown', 'ratio',
    'n_contribs_females', 'n_contribs_males', 'n_contribs_unknown',
]

publs = (
    pandas.read_csv("data/publications_cor3.csv", usecols=cols)
    # drop=False keeps "key" as a regular column in addition to being the index.
    # Setting index & accessing cells: https://pythonhow.com/accessing-dataframe-columns-rows-and-cells/
    .set_index("key", drop=False)
)
print("Publications imported. They look like this: {}".format(publs[:5]))

In [None]:
# >>> Calculate statistics

In [None]:
# Contribution totals: sum each per-publication contribution column over all
# publications. Each column is summed on its own so its dtype is preserved.
n_contribs_f_max, n_contribs_m_max, n_contribs_u_min = (
    publs[column].sum()
    for column in ('n_contribs_females', 'n_contribs_males', 'n_contribs_unknown')
)

In [None]:
# Contribution totals, V2: sum `n_publs` from the names table instead of the
# publications table. Names with score >= 5 are split by `likely_gender`;
# every name with score < 5 goes into the "unknown" total.
n_contribs_f_max_v2 = names.loc[
    names['score'].ge(5) & names['likely_gender'].eq('female'), 'n_publs'
].sum()
n_contribs_m_max_v2 = names.loc[
    names['score'].ge(5) & names['likely_gender'].eq('male'), 'n_publs'
].sum()
n_contribs_u_min_v2 = names.loc[names['score'].lt(5), 'n_publs'].sum()

In [None]:
# Author head-counts: number of names per group, counted as the non-null
# `likely_gender` entries among the selected rows.
n_authors_f_min = names.loc[
    names['score'].ge(5) & names['likely_gender'].eq('female'), 'likely_gender'
].count()
n_authors_m_min = names.loc[
    names['score'].ge(5) & names['likely_gender'].eq('male'), 'likely_gender'
].count()
# Rows with score < 5 whose `likely_gender` is missing are not included here.
n_authors_u_max = names.loc[names['score'].lt(5), 'likely_gender'].count()

In [None]:
# Best-case scenario (most female authors) for the names with a score below 5.
# Assumption: these names were classified at random, so half of them (n_all / 2)
# are classified correctly. The number of women is largest when every
# female-labelled name is among the correct ones. The remaining correct
# classifications, (n_all / 2) - n_f, are then male-labelled names, and all other
# male-labelled names are treated as misclassified women.
# NOTE(review): if n_f exceeds n_all / 2, n_authors_m_correct becomes negative,
# and an odd n_all yields fractional counts -- confirm this is acceptable.
n_all = names.loc[names['score'].lt(5), "name"].count()
n_f = names.loc[names['score'].lt(5) & names['likely_gender'].eq('female'), "name"].count()
n_m = names.loc[names['score'].lt(5) & names['likely_gender'].eq('male'), "name"].count()

# Male-labelled names assumed to be wrong, i.e. counted as female in this scenario.
n_authors_m_false = n_m - ((n_all / 2) - n_f)

n_authors_m_correct = n_m - n_authors_m_false
n_authors_f_correct = n_f + n_authors_m_false

print(
    "In the best case, in the sample of names with a score < 5, there are {} female and {} male names.".format(
        n_authors_f_correct, n_authors_m_correct
    )
)

In [None]:
# Corrected totals per gender: the counts for names with score >= 5 plus the
# best-case estimate for the names with score < 5.
n_f_cor, n_m_cor = (
    n_authors_f_min + n_authors_f_correct,
    n_authors_m_min + n_authors_m_correct,
)

In [None]:
# >>> Save statistics to dict
# All figures computed above are collected under the single outer key 1.
stats = {
    1: dict(
        # Author counts for names with score >= 5, and the low-score remainder.
        n_authors_f_min=n_authors_f_min,
        n_authors_m_min=n_authors_m_min,
        n_authors_u_max=n_authors_u_max,

        # Author counts including the best-case estimate for low-score names.
        n_authors_f_cor=n_f_cor,
        n_authors_m_cor=n_m_cor,

        # Contribution totals taken from the publications table.
        n_contribs_f_max=n_contribs_f_max,
        n_contribs_m_max=n_contribs_m_max,
        n_contribs_u_min=n_contribs_u_min,

        # Contribution totals taken from the names table (V2).
        n_contribs_f_max_v2=n_contribs_f_max_v2,
        n_contribs_m_max_v2=n_contribs_m_max_v2,
        n_contribs_u_min_v2=n_contribs_u_min_v2,
    )
}

In [None]:
# Build a dataframe from the nested stats dict: the outer key becomes the column
# and the inner keys become the row labels (same result as DataFrame.from_dict
# with its default orient).
# Dict to dataframe: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.from_dict.html
print("Converting dictionary of stats to a dataframe...")
stats_df = pandas.DataFrame(stats)
print("Dataframe completed! Here are some entries: {}".format(stats_df))

In [None]:
# Write the stats table to disk as CSV (the row labels are written as the first column).
# Save Dataframe to CSV: https://riptutorial.com/pandas/example/19502/create-random-dataframe-and-write-to--csv
print("Saving dataframe as CSV...")
stats_df.to_csv(path_or_buf="data/stats_authors_contribs_by_gender_cor1.csv")
print("Stats saved as CSV!")