In [None]:
# Checking for Outliers in names_cat

# In this notebook we will find out whether there are outliers in our authors data set, how many there are,
# and whether or not to exclude them from further research.

In [None]:
# >>> Preparation
# Import categorized 'names_cat.csv'
import pandas

print("Importing names... ")

# Read only the columns this notebook works with, then index the rows by author
# name. drop=False keeps "name" available as an ordinary column as well.
# Setting index & accessing cells: https://pythonhow.com/accessing-dataframe-columns-rows-and-cells/
names = pandas.read_csv(
    "data/names_cat.csv",
    usecols=["name", "n_publs", "likely_gender", "score"],
).set_index("name", drop=False)

# Preview the first five rows as part of the status message.
print("Names imported. They look like this: {}".format(names.head(5)))

In [None]:
# Split the authors into unknown / known gender, then known into female / male.
# For now the "unknown" group is derived here from the score column - later, the
# dataset read above will already have it calculated.
# NOTE(review): a row whose score is NaN satisfies neither comparison below, so it
# ends up in neither `u` nor `not_u` - confirm that this is intended.
u = names.loc[names['score'].lt(5)]
not_u = names.loc[names['score'].ge(5)]
f = not_u.loc[not_u['likely_gender'].eq('female')]
m = not_u.loc[not_u['likely_gender'].eq('male')]

In [None]:
# >>> Get statistical information on our data sets

In [None]:
# Summary statistics for the rows with score < 5 (gender treated as unknown).
u.describe()

In [None]:
# Summary statistics for the rows with score >= 5 (female and male together).
not_u.describe()

In [None]:
# Summary statistics for the rows with score >= 5 whose likely_gender is 'female'.
f.describe()

In [None]:
# Summary statistics for the rows with score >= 5 whose likely_gender is 'male'.
m.describe()

In [None]:
# >>> Interpreting results in relation to possible outliers
# Looking at the relative values it appears that there could be some outliers,
# considering the huge difference between mean and median (50%) values and the high std.
# In the following, we will explore the outliers

In [None]:
import matplotlib.pyplot as plt

In [None]:
# >>> A closer look at the distribution
# Histogram of n_publs over every row of `names` (no gender filter applied).
# Series.hist returns the Axes it drew on, so the labels are set on that object.
h_n_publs1 = names['n_publs'].hist(alpha=0.5)
h_n_publs1.set_xlabel("Amount of Publications")
h_n_publs1.set_title("Histogram")

In [None]:
# Zoom in: same histogram of all names, x-axis restricted to 0-50 publications.
# NOTE(review): bins=1694 is presumably chosen to give roughly one bin per
# publication count for this data set - confirm against names['n_publs'].max().
# Set axis limits: https://stackoverflow.com/questions/3777861/setting-y-axis-limit-in-matplotlib

h_n_publs2 = names['n_publs'].hist(bins=1694, alpha=0.5)

# Work on the current Axes directly instead of going through pyplot helpers.
axes = plt.gca()
axes.set_xlim(left=0, right=50)
axes.set_xlabel("Amount of Publications")
axes.set_title("Histogram")

In [None]:
# Zoom in further: same histogram of all names, x-axis restricted to 0-10 publications.
# NOTE(review): bins=1694 is presumably chosen to give roughly one bin per
# publication count for this data set - confirm against names['n_publs'].max().
# Set axis limits: https://stackoverflow.com/questions/3777861/setting-y-axis-limit-in-matplotlib

h_n_publs3 = names['n_publs'].hist(bins=1694, alpha=0.5)

# Work on the current Axes directly instead of going through pyplot helpers.
axes = plt.gca()
axes.set_xlim(left=0, right=10)
axes.set_xlabel("Amount of Publications")
axes.set_title("Histogram")

In [None]:
# Zoom in (with limits): overlay the normalised n_publs histograms of the male,
# female and unknown groups, x-axis restricted to 0-10 publications.
# Set axis limits: https://stackoverflow.com/questions/3777861/setting-y-axis-limit-in-matplotlib

# One set of unit-width bin edges, derived from the data and shared by all three
# groups, replaces the hard-coded per-group bin counts. With density=True the bar
# height depends on the bin width, so the groups are only comparable when they
# use the same bins; shared edges also line the bars up on one grid.
# Assumes n_publs holds whole-number publication counts - TODO confirm.
publs_all = names['n_publs']
bin_edges = list(range(int(publs_all.min()), int(publs_all.max()) + 2))

h_m_publs1 = m['n_publs'].hist(density=True, alpha=0.5, bins=bin_edges, label="Male")
h_f_publs1 = f['n_publs'].hist(density=True, alpha=0.5, bins=bin_edges, label="Female")
h_u_publs1 = u['n_publs'].hist(density=True, alpha=0.5, bins=bin_edges, label="Unknown")

axes = plt.gca()
axes.set_xlim([0,10])

plt.xlabel("Amount of Publications")
plt.legend(title="Gender")
plt.title("Histogram")

In [None]:
# Zoom in (with limits): overlay the raw-count n_publs histograms of the unknown,
# male and female groups, x-axis restricted to 0-10 publications.
# Set axis limits: https://stackoverflow.com/questions/3777861/setting-y-axis-limit-in-matplotlib

# One set of unit-width bin edges, derived from the data and shared by all three
# groups, replaces the hard-coded per-group bin counts. With shared edges every
# bar covers the same range of publication counts in each group, so the bar
# heights can be compared directly.
# Assumes n_publs holds whole-number publication counts - TODO confirm.
publs_all = names['n_publs']
bin_edges = list(range(int(publs_all.min()), int(publs_all.max()) + 2))

h_u_publs1 = u['n_publs'].hist(alpha=0.5, bins=bin_edges, label="Unknown")
h_m_publs1 = m['n_publs'].hist(alpha=0.5, bins=bin_edges, label="Male")
h_f_publs1 = f['n_publs'].hist(alpha=0.5, bins=bin_edges, label="Female")

axes = plt.gca()
axes.set_xlim([0,10])

plt.xlabel("Amount of Publications")
plt.legend(title="Gender")
plt.title("Histogram")

In [None]:
# >>> Showing possible outliers using boxplot
# One box per gender group, outlier points ("fliers") included.
plt.boxplot([f['n_publs'], m['n_publs'], u['n_publs']])
# boxplot places the boxes at x = 1, 2, 3 by default. The tick labels are set
# explicitly here because the `labels=` keyword of boxplot is deprecated since
# Matplotlib 3.9 (renamed `tick_labels`); plt.xticks works on old and new versions.
plt.xticks([1, 2, 3], ["female", "male", "unknown"])
plt.xlabel("Gender")
plt.ylabel("Amount of Publications")
plt.title("Distribution Statistics: Publications by Gender")
plt.show()
# The boxplot is nearly unrecognizable! Looks like a lot of outliers.

In [None]:
# Not showing possible outliers
# Same boxplot with the outlier points hidden (showfliers=False) and the group
# means marked (showmeans=True), so the boxes themselves become readable.
plt.boxplot([f['n_publs'], m['n_publs'], u['n_publs']], showfliers=False, showmeans=True)
# boxplot places the boxes at x = 1, 2, 3 by default. The tick labels are set
# explicitly here because the `labels=` keyword of boxplot is deprecated since
# Matplotlib 3.9 (renamed `tick_labels`); plt.xticks works on old and new versions.
plt.xticks([1, 2, 3], ["female", "male", "unknown"])
plt.xlabel("Gender")
plt.ylabel("Amount of Publications")
plt.title("Distribution Statistics: Publications by Gender (not showing possible outliers)")
plt.show()

In [None]:
# Interpretation:
# Even though there are outliers, they might not be relevant for our analysis. 
# The amount of contributions by women and men should be calculated differently, 
# so that if in a group of 4 collaborating authors 1 was a woman, that counts as 1/4 contribution,
# not 1 contribution.

# This method of counting contributions was developed by Science-Metrix 
# in their publication "Development of bibliometric indicators to measure women’s contribution to scientific publications" 
# (http://www.science-metrix.com/en/publications/reports#/en/gender-report, May 1 19)