In [None]:
# STEP 4: TEST DATA QUALITY
# Explore Result Quality Part II.
# The results from before are now analyzed in a more structured way.
# This notebook looks at the distribution of the score value in our dataset

In [None]:
# >>> Preparation
# Load the categorized names from 'data/names_cat_i1.csv'.
# Only the four columns needed for the score analysis are read.
import pandas

print("Importing names... ")

# Setting index & accessing cells: https://pythonhow.com/accessing-dataframe-columns-rows-and-cells/
# drop=False keeps "name" as a regular column in addition to using it as the index.
names = pandas.read_csv(
    "data/names_cat_i1.csv",
    usecols=["name", "n_publs", "likely_gender", "score"],
).set_index("name", drop=False)

print("Names imported. They look like this: {}".format(names.head(5)))

In [None]:
# Split the names into two frames by the gender label assigned to each row.
# Rows with any other (or missing) label end up in neither frame.
f = names.loc[names['likely_gender'].eq('female')]
m = names.loc[names['likely_gender'].eq('male')]

In [None]:
# Get bin edges for the histograms.
# Both genders share the SAME unit-width bin edges so that the overlaid
# histograms can be compared bar by bar. Previously each group's own maximum
# score was passed as a bin *count*: that gives the two groups different bin
# widths, and numpy rejects a non-integer bin count (TypeError) whenever the
# maximum score is a float.
import math

all_scores = pandas.concat([f['score'], m['score']]).dropna()

if all_scores.empty:
    # No scores to plot: fall back to matplotlib's default number of bins.
    bins_f = bins_m = 10
else:
    score_lo = math.floor(all_scores.min())
    # Guarantee at least one bin even if every score is identical.
    score_hi = max(math.ceil(all_scores.max()), score_lo + 1)
    # Integer edges score_lo, score_lo + 1, ..., score_hi (one bin per score unit).
    bins_f = bins_m = list(range(score_lo, score_hi + 1))

In [None]:
# >>> Visualization
# Normalized histograms of the score per gender: with density=True the area of
# each histogram integrates to 1, so the two distributions can be compared
# even though the groups differ in size.
import matplotlib.pyplot as plt

fig, ax = plt.subplots()

# `density` expects a boolean. The string "True" only worked by accident,
# because any non-empty string is truthy.
f['score'].hist(ax=ax, density=True, label="Female", bins=bins_f)
m['score'].hist(ax=ax, density=True, alpha=0.5, label="Male", bins=bins_m)

ax.set_xlabel("Score")
ax.set_ylabel("Density")
ax.set_title("Histogram: Which gender has which score how often (normalized)?")
ax.legend();

In [None]:
# Same comparison with raw counts instead of normalized bars.
# Both series are drawn onto the current axes; the male histogram is
# semi-transparent so the female one stays visible underneath.
count_ax = plt.gca()
f['score'].hist(ax=count_ax, bins=bins_f, label="Female")
m['score'].hist(ax=count_ax, bins=bins_m, label="Male", alpha=0.5)

count_ax.set_xlabel("Score")
count_ax.set_title("Histogram: Which gender has which score how often?")
count_ax.legend()

In [None]:
# >>> Evaluation:
# 1. The score value is not distributed equally for men and women.
# 2. Women's scores are lower than men's scores.
#    We see more female classifications than male classifications when looking at data with a score between 0 and 6.
# 3. The threshold we chose will influence our results. 

# 4. We learn about the NamSor API:
#    the API is more certain when classifying a name as male
#    than it is when classifying a name as female.

# >>> Consequence:
# We shall look into this further by investigating how the threshold we chose influences our results.