In [None]:
# STEP 6: TEST DATA QUALITY for corrected data
# Explore Result Quality Part II. a)
# This notebook creates a CSV of summary statistics for each score threshold
# (i.e. the minimum score a name needs in order to be used).

In [None]:
# >>> Preparation
# Load the corrected, categorized names from 'names_cat_i2.csv'
import pandas

print("Importing names... ")
# Rows are indexed by name; drop=False keeps "name" available as a regular column too.
# Setting index & accessing cells: https://pythonhow.com/accessing-dataframe-columns-rows-and-cells/
names = pandas.read_csv(
    "data/names_cat_i2.csv",
    usecols=["name", "n_publs", "likely_gender", "score"],
).set_index("name", drop=False)

# Show the first five rows as a quick sanity check of the import.
print("Names imported. They look like this: {}".format(names.head(5)))

In [None]:
print("Importing uncorrected names... ")
names_old = pandas.read_csv("data/names_cat.csv", usecols=["name", "n_publs", "likely_gender", "score"])

# Rows are indexed by name; drop=False keeps "name" available as a regular column too.
# Setting index & accessing cells: https://pythonhow.com/accessing-dataframe-columns-rows-and-cells/
names_old = names_old.set_index("name", drop = False)
# Preview the frame that was just loaded (names_old), not the corrected `names` frame.
print("Uncorrected names imported. They look like this: {}".format(names_old[:5]))

In [None]:
# >>> Get statistics on the data depending on the threshold
# For every minimum score, the corrected names (`names`) are summarised and
# compared with the uncorrected names (`names_old`) filtered by the same threshold.
# Every "*_dif" value is "corrected minus uncorrected".

# Thresholds to evaluate (0..33 inclusive).
# NOTE(review): presumably 33 is the highest score occurring in the data -- confirm.
SCORE_THRESHOLDS = range(0, 34)


def _split_by_gender(frame, score_min):
    """Return (all, female, male) rows of `frame` whose score is at least `score_min`."""
    kept = frame[frame['score'] >= score_min]
    return (
        kept,
        kept[kept['likely_gender'] == "female"],
        kept[kept['likely_gender'] == "male"],
    )


def _publication_stats(frame):
    """Summarise the `n_publs` column of `frame`: mean, median, sum and number of rows."""
    publs = frame['n_publs']
    return {
        'mean': publs.mean(),
        'median': publs.median(),
        'pubs': publs.sum(),
        'authors': len(publs),
    }


def _share(part, whole):
    """Return part / whole, or NaN when `whole` is zero (e.g. no rows reach the threshold).

    Row counts are plain Python ints, so an unguarded division would raise
    ZeroDivisionError for a threshold that filters out every row.
    """
    return part / whole if whole else float("nan")


names_cor_stats = {}

for score_min in SCORE_THRESHOLDS:
    # Aggregates for the corrected data: all rows, female rows, male rows.
    cur_all, cur_f, cur_m = (
        _publication_stats(part) for part in _split_by_gender(names, score_min)
    )
    # Same aggregates for the uncorrected data, used as the comparison baseline.
    prev_all, prev_f, prev_m = (
        _publication_stats(part) for part in _split_by_gender(names_old, score_min)
    )

    # Female share of publications and of rows (authors) in the corrected data.
    cur_pubs_f_part = _share(cur_f['pubs'], cur_all['pubs'])
    cur_authors_f_part = _share(cur_f['authors'], cur_all['authors'])

    # Key order determines the column order of the resulting table, so keep it stable.
    names_cor_stats[score_min] = {
        'score_min': score_min,

        'pubs_mean': cur_all['mean'],
        'pubs_mean_f': cur_f['mean'],
        'pubs_mean_m': cur_m['mean'],

        'pubs_mean_dif': cur_all['mean'] - prev_all['mean'],
        'pubs_mean_f_dif': cur_f['mean'] - prev_f['mean'],
        'pubs_mean_m_dif': cur_m['mean'] - prev_m['mean'],

        'pubs_median': cur_all['median'],
        'pubs_median_f': cur_f['median'],
        'pubs_median_m': cur_m['median'],

        'pubs_median_dif': cur_all['median'] - prev_all['median'],
        'pubs_median_f_dif': cur_f['median'] - prev_f['median'],
        'pubs_median_m_dif': cur_m['median'] - prev_m['median'],

        'pubs_total': cur_all['pubs'],
        'pubs_f': cur_f['pubs'],
        'pubs_m': cur_m['pubs'],

        'pubs_f_dif': cur_f['pubs'] - prev_f['pubs'],
        'pubs_m_dif': cur_m['pubs'] - prev_m['pubs'],

        'pubs_f_part': cur_pubs_f_part,
        'pubs_f_part_dif': cur_pubs_f_part - _share(prev_f['pubs'], prev_all['pubs']),

        'authors_total': cur_all['authors'],
        'authors_f': cur_f['authors'],
        'authors_m': cur_m['authors'],

        'authors_f_dif': cur_f['authors'] - prev_f['authors'],
        'authors_m_dif': cur_m['authors'] - prev_m['authors'],

        'authors_f_part': cur_authors_f_part,
        'authors_f_part_dif': cur_authors_f_part - _share(prev_f['authors'], prev_all['authors']),
    }

In [None]:
# Turn the per-threshold dicts into a table: one row per outer key (score_min),
# one column per inner key (statistic).
names_stats = pandas.DataFrame.from_dict(
    data=names_cor_stats,
    orient="index",
)

In [None]:
# Write the statistics table to disk as CSV
print("Saving stats to CSV... ")
stats_csv_path = "data/names_score_min_stats_i2.csv"
names_stats.to_csv(stats_csv_path)
print("Stats saved!")