In [None]:
# DATA IMPROVEMENT Part 2
# In this notebook, we determine the minimum score that can be considered reasonably reliable.
# To do so, we explore the data set and look at properties of the data that indicate how reliable the classification is.

In [None]:
# >>> Preparation
import pandas

# Read only the columns this analysis uses from the cleaned names file.
names_csv_path = "data/names_cat_i2.csv"
names_columns = ["name", "n_publs", "likely_gender", "score", "first_name", "last_name"]

print("Importing cleaned names for improvement... ")
names = pandas.read_csv(names_csv_path, usecols=names_columns)
print("Names imported.")

In [None]:
# Use the name string as the row index; drop=False keeps "name" available as a
# regular column as well, which the later cells rely on.
# Background on setting an index and accessing cells:
# https://pythonhow.com/accessing-dataframe-columns-rows-and-cells/
names = names.set_index(keys="name", drop=False)
# Plain-text preview of the first ten rows.
print(names.head(10))

In [None]:
# >> Rows where the last name was identified, but not the first name (names with score < 5)
first_name_missing = names['first_name'].isna()
last_name_present = names['last_name'].notna()
d = names[first_name_missing & last_name_present]
# Findings from inspecting this subset:
# - Most of these names have a score of at most 4. There are 16 851 of them, and at first
#   glance they contain many false positives, e.g. "A. A. S. Mohamed" -> first name = "Mohamed",
#   which is probably wrong.
# - Put bluntly, this means that a score of 4 carries a certainty of 0.
# - If a score of 4 carries a certainty of 0, can we trust a score of 5? The next step looks into that.
# NOTE(review): the original note described this subset as "last name not identified but first
# name identified", which is the opposite of what the filter above selects -- confirm which
# case the numbers refer to.

In [None]:
# >> Can we trust a score of 5?
has_score_five = names['score'].eq(5)
e = names.loc[has_score_five]
# Names with a score of 5 are problematic: some are abbreviated, some are not.
# This concerns 299 615 names.
# The ones that are not abbreviated seem to be classified correctly (manual check on about 100 names).
# Whether the abbreviated ones are classified correctly as well is examined in the next step.

In [None]:
# Select the names that begin with an initial, i.e. a single word character directly
# followed by a dot (e.g. "A. Smith"). Series.str.match anchors the pattern at the start
# of the string, so initials that only appear later in a name are not selected here.
# They mostly have score 2.
# The pattern is a raw string so that \w and \. reach the regex engine unchanged instead of
# being parsed as (invalid) Python string escapes. na=False treats a missing name as a
# non-match, so the boolean mask never contains NaN.
f = names[names['name'].str.match(r'\w\.', na=False)]

In [None]:
# Histogram of the scores of the names that start with an initial, using as many bins as
# the highest score in that subset.
# The bin count is cast to int: the histogram needs an integer number of bins, while
# max() returns a float whenever the score column has a float dtype.
score_bin_count = int(f['score'].max())
ax = f['score'].hist(bins=score_bin_count)
# Label the figure so it can be read on its own; the trailing ";" suppresses the repr output.
ax.set(title="Scores of names starting with an initial", xlabel="score", ylabel="number of names");

In [None]:
# Names starting with an initial that nevertheless received a score above 5.
# Findings:
# - Many abbreviated names with a Russian last name are classified reliably, because
#   Russian last names can tell the gender (unlike stated in Science Metrix: http://www.science-metrix.com/sites/default/files/science-metrix/publications/science-metrix_bibliometric_indicators_womens_contribution_to_science_report.pdf)
#   https://en.wikipedia.org/wiki/List_of_surnames_in_Russia
# - The other names contain an additional name that is not abbreviated.
# - This concerns 10 213 names.
# We can therefore conclude that names with a score of 5 can be classified rather reliably.
# NOTE(review): the filter keeps scores strictly greater than 5, so rows with a score of
# exactly 5 are not shown here -- confirm that this is the intended cut-off.
score_above_five = f['score'].gt(5)
f.loc[score_above_five]

In [None]:
# How many "indirect" abbreviations do we have, i.e. names that begin with a single word
# character followed by whitespace rather than a dot (e.g. "A Smith")? Do they affect the
# reliability of score 5?
# Finding: 74, max score = 6 (2 names)
# The pattern is a raw string so that \w and \s reach the regex engine unchanged instead of
# being parsed as (invalid) Python string escapes. na=False treats a missing name as a
# non-match, so the boolean mask never contains NaN.
g = names[names['name'].str.match(r'\w\s', na=False)]

In [None]:
# >>Interpretation 
# The minimum score that is reliable is 5. 