In [None]:
import pandas

# Load only the columns used in this analysis from the categorized-names CSV.
print("Importing categorized names... ")
names = pandas.read_csv(
    filepath_or_buffer="data/names_cat.csv",
    usecols=["name", "n_publs", "likely_gender", "score"],
)
print("Names imported.")

In [None]:
# Setting index & accessing cells: https://pythonhow.com/accessing-dataframe-columns-rows-and-cells/
# Index the rows by name; drop=False keeps "name" available as a regular column as well.
names = names.set_index("name", drop=False)
# Preview the first 10 rows as the cell's last expression so the notebook renders
# the frame with its rich display instead of a plain-text print().
names.head(10)

In [None]:
# Go through the names and categorize all with a score of <= 2 as (likely_gender=)uncategorizable. 
# Count how many names these are
# If that count is acceptable, also sort out all names with a score of 2 < score <= 3
# Will Asian names need to be handled separately, or names with one letter? Or is the score categorization sufficient?

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns of `names`.
names.describe()

In [None]:
# From describe we can see that at least 25% of names have a score of 2 or less. 
# At least 50% have a score of at least 4 (median). 
# The minimum score is 0 (as expected), the maximum score is 33. 
# the mean is almost 5 with a std of 3.5

# Some more info we get from describe about the number of publications:
# On average, a person publishes 5.7 pieces (with a std of 18!)
# The record for most publications was set with 1694 pieces, wow!
# At least 25% of authors published at most 1 piece (25th percentile),
# At least 50% published at most 2 pieces (median),
# At least 75% published at most 4 pieces (75th percentile).

In [None]:
# Count the names whose score is 4 or higher.
int((names['score'] >= 4).sum())

In [None]:
# Share of all names with a score of at least 4, computed from the data
# instead of a count pasted in from an earlier cell's output.
# NOTE(review): the previous literal 1359955 is presumed to be that count — confirm.
len(names[names['score'] >= 4]) / len(names)

In [None]:
# From selecting all names with a score lower than or equal to 2, we found:
# 160377 names have a score of 0
# 233923 names have a score of 1
# 280557 names have a score of 2
# Overall, 674857 names have a score lower than or equal to 2.

In [None]:
# Graph
# Grouping and aggregating: https://stackoverflow.com/questions/40480744/pandas-group-by-and-count
# Total number of publications per score value, ordered by score.
n_publs_by_score = (
    names.groupby("score")[["n_publs"]]
    .sum()
    .sort_index()
)

In [None]:
# Number of rows in `names` for each score value.
n_authors_by_score = names.groupby("score").size()

In [None]:
import matplotlib.pyplot as plt

# float to int: https://stackoverflow.com/questions/6569528/python-float-to-int-conversion
# Use roughly one histogram bin per three score points.
bins = int(round(names['score'].max() / 3))
bins

In [None]:
# Histogram of all scores, using the previously computed `bins` count.
hist_score_0 = names['score'].hist(bins=bins)

In [None]:
# Histogram of publication counts, restricted to names with at most 10 publications.
hist_publs_0_max_10 = names.loc[names['n_publs'] <= 10, 'n_publs'].hist(bins=10)

In [None]:
# Histogram of publication counts over all names (default binning).
hist_publs_0 = names['n_publs'].hist()

In [None]:
# Line chart: publications (blue) and authors (green) per score,
# plus a red vertical marker line at score 3.
plt.plot(
    n_publs_by_score.index.values,
    n_publs_by_score['n_publs'],
    color='b',
    label='Amount of publications by score',
)
plt.plot(
    n_authors_by_score.index.values,
    n_authors_by_score.values,
    color='g',
    label='Amount of authors by score',
)
plt.plot(
    [3, 3],
    [0, n_publs_by_score['n_publs'].max()],
    color='r',
    linestyle='-',
    linewidth=2,
    label="Score = 3",
)
plt.legend(loc='best')
plt.show()

In [None]:
# Same data as the line chart, drawn as unconnected horizontal-dash markers,
# plus a red vertical marker line at score 3.
plt.plot(
    n_publs_by_score.index.values,
    n_publs_by_score['n_publs'],
    color='b',
    marker='_',
    linestyle='None',
    label='Amount of publications by score',
)
plt.plot(
    n_authors_by_score.index.values,
    n_authors_by_score.values,
    color='g',
    marker='_',
    linestyle='None',
    label='Amount of authors by score',
)
plt.plot(
    [3, 3],
    [0, n_publs_by_score['n_publs'].max()],
    color='r',
    linestyle='-',
    linewidth=2,
    label="Score = 3",
)
plt.legend(loc='best')
plt.show()

In [None]:
# Overlaid semi-transparent bars: publications (blue) and authors (green) per score,
# plus a red vertical marker line at score 3.
plt.bar(
    n_publs_by_score.index.values,
    n_publs_by_score['n_publs'],
    color='b',
    alpha=0.5,
    label='Amount of publications by score',
)
plt.bar(
    n_authors_by_score.index.values,
    n_authors_by_score.values,
    color='g',
    alpha=0.5,
    label='Amount of authors by score',
)
plt.plot(
    [3, 3],
    [0, n_publs_by_score['n_publs'].max()],
    color='r',
    linestyle='-',
    linewidth=2,
    label="Score = 3",
)
plt.legend(loc='best')
plt.show()

In [None]:
# Get first name and last name from full name with NamSor
# Calculate the gender + score for the full name for those names with a score less than 7
# Save the new result (gender and score) in the df
# Compare the results. 
# To test if this makes sense: 
# Get sample of names with score less than 7
# send to api
# save results
# row: gender changed?: bool
# how much score changed?: int
# plot