In [None]:
# DRAFT

In [None]:
import pandas

print("Importing names to test for improvement... ")
# Only the columns listed here are read from the categorised-names file.
columns_to_load = [
    "name",
    "n_publs",
    "likely_gender",
    "score",
    "clean_name",
    "first_name",
    "last_name",
    "likely_gender_2",
    "score_2",
    "gender_dif",
    "score_dif",
]
names = pandas.read_csv("data/names_cat.csv", usecols=columns_to_load)
print("Names imported.")

In [None]:
# Index the frame by the "name" column while also keeping it as a regular column.
# Reference on setting an index & accessing cells:
# https://pythonhow.com/accessing-dataframe-columns-rows-and-cells/
names = names.set_index(keys="name", drop=False)
# Show the first ten rows as a quick sanity check.
print(names.head(10))

In [None]:
print("Getting private key... ")
# Get private API Key for NamSor API v2 (contained in txt file)
key = ''

# Import personal key
try:
    with open("key.txt", "r") as file:
        # Strip surrounding whitespace: a trailing newline in the file would
        # otherwise become part of the key that is handed to the API client.
        key = file.read().strip()
except FileNotFoundError:
    # Keep the empty default so the explanatory message below is printed
    # instead of a traceback.
    pass

if len(key) > 0:
    print("Got private key.")
else:
    print("Could not find private key. Please check the file name and make sure you have an API key.")

In [None]:
# Setting up NamSor API v2 to get the gender of a name
# https://www.namsor.com/
# https://v2.namsor.com/NamSorAPIv2/apidoc.html
# using NamSor API v2 Python SDK
# https://github.com/namsor/namsor-python-sdk2
# licensed under GNU Affero General Public License v3.0

# Following script partly taken from https://github.com/namsor/namsor-python-sdk2 "Getting Started" 
# and adapted to keep key private and remove unnecessary lines.

print("Setting up NamSor API v2 connection settings...")

import openapi_client
from openapi_client.rest import ApiException

# Build the client configuration and register the private key under 'X-API-KEY'.
configuration = openapi_client.Configuration()
configuration.api_key['X-API-KEY'] = key

# Wrap the configuration in an API client and create the PersonalApi instance from it.
api_client = openapi_client.ApiClient(configuration)
pers_api_instance = openapi_client.PersonalApi(api_client)

In [None]:
# Work on a copy so the imported `names` frame itself is not modified.
names_to_consider = names.copy()
total_names = len(names_to_consider)
print("Will be parsing and reconsidering {} names.".format(total_names))

In [None]:
# Formatting the names using the API's models
def createPersonalNameIn(name_entry):
    """Build a PersonalNameIn model from one name record.

    name_entry is a mapping with the keys 'name' (used as the model id)
    and 'clean_name' (used as the name that is sent to the API).
    """
    entry_id = name_entry['name']
    full_name = name_entry['clean_name']
    return openapi_client.PersonalNameIn(id=entry_id, name=full_name)

In [None]:
# Now parsing the full names into first and last name, sending in
# one batch at a time and saving the result answer by answer.
# If the API calling gets interrupted:
# 1. check that no names got lost: ((len(names_to_consider)-len(result)) == len(names_stack))
# 2. If True: Restart only the code in the while loop.

# 1000 names per request is the API limit given by NamSor.
batch_size = 1000
start, end = 0, batch_size
result = list()

# One {'name': ..., 'clean_name': ...} dict per row of the frame.
name_columns = names_to_consider[['name', 'clean_name']]
names_stack = name_columns.to_dict(orient='records')

print(names_stack[:10])

In [None]:
# Disabled: the whole batch-parsing loop below is wrapped in a bare
# triple-quoted string, so none of it executes.
# NOTE(review) before re-enabling:
# - the leftover (< batch_size) names are only sent from inside the while loop,
#   so a stack that starts out smaller than batch_size would never be sent;
# - on ApiException the stack is left unchanged and the loop tries the same
#   batch again, with no retry limit;
# - the error text mentions gender_full_batch although parse_name_batch is called.
'''while (len(names_stack) >= batch_size):
    try:
        current_batch = list(map(createPersonalNameIn, names_stack[start:end])) # create batch of names in correct format
        batch_personal_name_in = openapi_client.BatchPersonalNameIn(personal_names=current_batch) # convert batch to correct format
        api_response = pers_api_instance.parse_name_batch(batch_personal_name_in=batch_personal_name_in) # call API
        result = result + api_response.personal_names # save result
        
        del names_stack[start:end] # delete the names that have already been categorized from the stack
        
        # categorize remaining names if they are less than a batch size
        if(len(names_stack) < batch_size and len(names_stack) > 0):
            current_batch = list(map(createPersonalNameIn, names_stack)) # create the batch of remaining names
            batch_personal_name_in = openapi_client.BatchPersonalNameIn(personal_names=current_batch)
            api_response = pers_api_instance.parse_name_batch(batch_personal_name_in=batch_personal_name_in)
            result = result + api_response.personal_names
            names_stack = [] # empty the stack
        
        print("Batch of names analyzed")
    except ApiException as e:
        print("Exception when calling PersonalApi: gender_full_batch: %s\n" % e)


print("All batches analyzed.")'''

In [None]:
# Formatting the names using the API's models
import math

def createParsedPersonalNameIn(names_entry):
    """Build a FirstLastNameIn model from one parsed name record.

    names_entry is a mapping with the keys 'name' (used as the model id),
    'first_name' and 'last_name'.

    Returns None when the first or the last name is missing. Records built
    from a pandas frame carry missing cells as NaN rather than None, so the
    check uses pandas.isna, which treats both as missing.
    """
    first_name = names_entry['first_name']
    last_name = names_entry['last_name']
    if pandas.isna(first_name) or pandas.isna(last_name):
        return None
    return openapi_client.FirstLastNameIn(id=names_entry['name'], first_name=first_name, last_name=last_name)

In [None]:
# Now classifying the gender from the already parsed first and last names, sending in
# one batch at a time and saving the result answer by answer.
# If the API calling gets interrupted:
# 1. check that no names got lost: ((len(names_to_consider)-len(result)) == len(names_stack))
# 2. If True: Restart only the code in the while loop.

batch_size = 1000 #1000 is the API limit given by NamSor
start = 0
end = batch_size
result = []

# Build one {'name', 'first_name', 'last_name'} dict per row, leaving out rows
# without a first or last name. Missing cells read from the CSV are NaN, not
# None, so they are removed with dropna (which covers both) instead of an
# `is not None` test that NaN would pass.
names_stack = (
    names_to_consider[['name', 'first_name', 'last_name']]
    .dropna(subset=['first_name', 'last_name'])
    .to_dict('records')
)

In [None]:
# Call API to find out gender but ALSO pass info on country/origin
# Disabled: everything below is a bare triple-quoted string, so it does not execute.
# NOTE(review) before re-enabling:
# - the final print formats len(results), but the list filled here is named `result`;
# - on ApiException inside the while loop the stack is left unchanged and the
#   same batch is tried again, with no retry limit;
# - the error text mentions gender_full_batch although gender_batch is called.
'''while (len(names_stack) >= batch_size):
    try:
        current_batch = list(map(createParsedPersonalNameIn, names_stack[start:end])) # create batch of names in correct format
        batch_first_last_name_in = openapi_client.BatchFirstLastNameIn(personal_names=current_batch) # convert batch to correct format
        api_response = pers_api_instance.gender_batch(batch_first_last_name_in=batch_first_last_name_in) # call API
        result = result + api_response.personal_names # save result
        
        del names_stack[start:end] # delete the names that have already been categorized from the stack
        
        # categorize remaining names if they are less than a batch size
        if(len(names_stack) < batch_size and len(names_stack) > 0):
            current_batch = list(map(createParsedPersonalNameIn, names_stack)) # create the batch of remaining names
            batch_first_last_name_in = openapi_client.BatchFirstLastNameIn(personal_names=current_batch)
            api_response = pers_api_instance.gender_batch(batch_first_last_name_in=batch_first_last_name_in)
            result = result + api_response.personal_names
            names_stack = [] # empty the stack
        
        print("Batch of names analyzed")
    except ApiException as e:
        print("Exception when calling PersonalApi: gender_full_batch: %s\n" % e)

if(len(names_stack) != 0):
    try:
        current_batch = list(map(createParsedPersonalNameIn, names_stack)) # create the batch of remaining names
        batch_first_last_name_in = openapi_client.BatchFirstLastNameIn(personal_names=current_batch)
        api_response = pers_api_instance.gender_batch(batch_first_last_name_in=batch_first_last_name_in)
        result = result + api_response.personal_names
        names_stack = [] # empty the stack
        
        print("Batch of names analyzed")
    except ApiException as e:
        print("Exception when calling PersonalApi: gender_full_batch: %s\n" % e)

print("All batches analyzed. Returned {} results.".format(len(results)))'''

In [None]:
# Write the API results (list of openapi_client.models.personal_name_gendered_out.PersonalNameGenderedOut)
# into the names dataframe, one row per result, looked up by the result id (= the 'name' index).
print('Filling the results into the names dataframe...')
if not result:
    # With an empty result list the loop below does nothing at all.
    print('No API results to merge (result is empty); no columns were added or updated.')

for oapi_el in result:
    new_gender = oapi_el.likely_gender
    new_score = round(oapi_el.score)
    names_to_consider.at[oapi_el.id, 'likely_gender_3'] = new_gender
    names_to_consider.at[oapi_el.id, 'score_3'] = new_score
    # True when the new gender differs from the originally stored one.
    names_to_consider.at[oapi_el.id, 'gender_dif_2'] = (names_to_consider.at[oapi_el.id, 'likely_gender'] != new_gender)
    # Positive when the new (rounded) score is higher than the original score.
    names_to_consider.at[oapi_el.id, 'score_dif_2'] = new_score - names_to_consider.at[oapi_el.id, 'score']

print('Dataframe completed with API results. Here is a sample: {}'.format(names_to_consider[:50]))

In [None]:
print("Saving test names...")
# Write the frame, including its index, to the test output file.
test_names_path = "data/names_improvement_test.csv"
names_to_consider.to_csv(test_names_path)
print("Test names saved!")

In [None]:
import matplotlib.pyplot as plt

# Scatter of the gender-changed flag (x) against the new rounded score (y).
plt.scatter(
    x=names_to_consider['gender_dif_2'],
    y=names_to_consider['score_3'],
)

In [None]:
# Histogram (7 bins) of score_3, the rounded score written from the API results.
names_to_consider['score_3'].hist(bins=7)

In [None]:
# Histogram (3 bins) of the `score` column that was loaded from names_cat.csv.
names_to_consider['score'].hist(bins=3)

In [None]:
# Histogram (default bins) of score_dif_2, i.e. score_3 minus score.
names_to_consider['score_dif_2'].hist()

In [None]:
# Group the rows by score difference first and gender-changed flag second.
names_by_score_dif = names_to_consider.groupby(by=['score_dif_2', 'gender_dif_2'])

In [None]:
# Count the entries per (score_dif_2, gender_dif_2) combination; the 'score'
# column is only used as the column to count.
# The grouping is rebuilt from names_to_consider here, so re-running this cell
# gives the same result instead of aggregating an already aggregated frame.
names_by_score_dif = (
    names_to_consider
    .groupby(['score_dif_2', 'gender_dif_2'])
    .agg({'score': 'count'})
)

In [None]:
# Split the (score_dif_2, gender_dif_2) index tuples into two parallel lists.
score_diffs = [index_pair[0] for index_pair in names_by_score_dif.index.values]
gender_diffs = [index_pair[1] for index_pair in names_by_score_dif.index.values]

In [None]:
# Number of entries (y) per score difference (x), coloured by the gender-changed flag.
plt.scatter(
    x=score_diffs,
    y=names_by_score_dif['score'],
    c=gender_diffs,
    alpha=0.7,
)

In [None]:
# Summary statistics (DataFrame.describe) of names_to_consider, kept in a
# variable and shown as the cell output.
descriptive_statistics_improved_names_sample2 = names_to_consider.describe()
descriptive_statistics_improved_names_sample2

In [None]:
# Write the describe() table to its own CSV file. The progress messages now
# name the statistics (they previously repeated the "test names" wording of the
# earlier save although a different file is written here).
print("Saving test statistics...")
descriptive_statistics_improved_names_sample2.to_csv("data/names_improvement_test_statistics.csv")
print("Test statistics saved!")

In [None]:
# Keep only the rows whose gender_dif_2 flag is True.
gender_changed_mask = names_to_consider['gender_dif_2'] == True
gender_changed = names_to_consider.loc[gender_changed_mask]

In [None]:
# Summary statistics of the rows where gender_dif_2 is True.
gender_changed.describe()

In [None]:
# Of the rows with a changed gender, keep those whose score went up.
score_went_up_mask = gender_changed['score_dif_2'] > 0
gender_changed_more_certain = gender_changed.loc[score_went_up_mask]

In [None]:
# Summary statistics of the rows where the gender changed and score_dif_2 > 0.
gender_changed_more_certain.describe()

In [None]:
# Keep every row whose score difference is positive, regardless of gender change.
score_improved_mask = names_to_consider['score_dif_2'] > 0
score_improved = names_to_consider.loc[score_improved_mask]

In [None]:
# Summary statistics of the rows with score_dif_2 > 0.
score_improved.describe()

In [None]:
# The gender assumption changed for only 100 of 1000 entries.
# (If a gender change is only considered valid when the new score is higher than the old score, this is the case for 44 of them.)

# The score improves on average by 0.48 with a std of 1.21. 
# The median improvement is 0, the 25% quartile is 0, the 75% quartile is 1.
# The score is at maximum improved by 7 and in the worst case decreased by 2.