In [None]:
# DATA IMPROVEMENT Part I
# Get the first and last name of each name that has a score of 4 or less
# so we can improve the results using a more precise API endpoint

In [None]:
# >>> Preparation
import pandas

print("Importing names for improvement... ")
# Load only the four columns this notebook works with.
names = pandas.read_csv(
    "../../_data/names_cat.csv",
    usecols=[
        "name",
        "n_publs",
        "likely_gender",
        "score",
    ],
)
print("Names imported.")

In [None]:
# Setting index & accessing cells: https://pythonhow.com/accessing-dataframe-columns-rows-and-cells/
# Index the frame by name while keeping "name" as an ordinary column too
# (drop=False), then preview the first ten rows.
names = names.set_index(keys="name", drop=False)
print(names.head(10))

In [None]:
print("Getting private key... ")
# Get private API Key for NamSor API v2 (contained in txt file)
key = ''

# Import personal key.
# The content is stripped because text files usually end with a newline;
# unstripped, that newline would become part of the X-API-KEY value and a
# whitespace-only file would be mistaken for a valid key.
try:
    with open("../../key.txt", "r") as file:
        key = file.read().strip()
except FileNotFoundError:
    # Show the helpful hint, then re-raise so that the notebook still stops here
    # instead of continuing with an empty key.
    print("Could not find private key. Please check the file name and make sure you have an API key.")
    raise

if key:
    print("Got private key.")
else:
    print("Could not find private key. Please check the file name and make sure you have an API key.")

In [None]:
# Setting up NamSor API v2 to get the gender of a name
# https://www.namsor.com/
# https://v2.namsor.com/NamSorAPIv2/apidoc.html
# using NamSor API v2 Python SDK
# https://github.com/namsor/namsor-python-sdk2
# licensed under GNU Affero General Public License v3.0

# Following script partly taken from https://github.com/namsor/namsor-python-sdk2 "Getting Started" 
# and adapted to keep key private and remove unnecessary lines.

print("Setting up NamSor API v2 connection settings...")

# SDK package plus its exception type; ApiException is what the batch loop
# further down in this notebook catches around the API calls.
import openapi_client
from openapi_client.rest import ApiException

# Configure API key authorization: api_key
# `key` is the string read from ../../key.txt in the previous cell.
configuration = openapi_client.Configuration()
configuration.api_key['X-API-KEY'] = key
# create an instance of the API class
# (pers_api_instance is the object parsename_batch() calls parse_name_batch on)
pers_api_instance = openapi_client.PersonalApi(openapi_client.ApiClient(configuration))

In [None]:
# Cleaning functions
import re

def clean(x):
    """Pair a raw string with its cleaned form.

    Returns a pandas Series with two entries: 'name' holds x unchanged and
    'clean_name' holds x with every run of digits removed and trailing
    whitespace stripped.
    """
    # Strip numbers from a string: https://stackoverflow.com/questions/16849109/strip-out-numbers-from-a-string
    digits_removed = re.compile(r'\d+').sub('', x)
    # Remove white space at end from string: https://stackoverflow.com/questions/2372573/how-do-i-remove-whitespace-from-the-end-of-a-string-in-python
    tidy = digits_removed.rstrip()

    return pandas.Series(data=[x, tidy], index=['name', 'clean_name'])

In [None]:
def clean_names(names):
    """Add a 'clean_name' column to a frame of names.

    Applies clean() to the 'name' column, re-indexes the resulting
    name/clean_name table by 'name', and left-joins it onto `names`
    index-to-index. Returns the merged frame.
    """
    lookup = names['name'].apply(clean).set_index('name')
    merged = names.merge(lookup, left_index=True, right_index=True, how='left')
    return merged

In [None]:
# Find out which names to reconsider for improving results
# Rows with a score below 5 are the ones sent for re-parsing.
names_to_split = clean_names(names.loc[names['score'] < 5])
print("Will be parsing and reconsidering {} names.".format(names_to_split.shape[0]))

In [None]:
# Formatting the names using the API's models
def createPersonalNameIn(name_entry):
    """Build an openapi_client.PersonalNameIn from one name record.

    The record's 'name' value becomes the id and its 'clean_name' value
    becomes the name sent to the API.
    """
    # NOTE(review): same body as format_full_clean_name; no caller is visible
    # in this notebook.
    original, cleaned = name_entry['name'], name_entry['clean_name']
    return openapi_client.PersonalNameIn(id=original, name=cleaned)

In [None]:
# Formatting the names using the API's models
def format_full_clean_name(name_entry):
    """Format one name record for the API.

    Takes a mapping with 'name' and 'clean_name' entries and returns a
    PersonalNameIn whose id is the 'name' value and whose name is the
    'clean_name' value.
    """
    fields = {
        'id': name_entry['name'],
        'name': name_entry['clean_name'],
    }
    return openapi_client.PersonalNameIn(**fields)

In [None]:
def format_full_clean_names(li):
    """Format several name records for the API.

    Runs format_full_clean_name over every element of `li` and returns the
    results as a list, in the same order.
    """
    return [format_full_clean_name(entry) for entry in li]

In [None]:
def format_full_name_batch(li):
    """Wrap a list of formatted names into a BatchPersonalNameIn.

    `li` is passed as the batch's personal_names; the batch object is returned.
    """
    batch = openapi_client.BatchPersonalNameIn(personal_names=li)
    return batch

In [None]:
def parsename_batch(batch):
    """Send one formatted batch to the API.

    Forwards `batch` to pers_api_instance.parse_name_batch and returns
    whatever that call returns.
    """
    response = pers_api_instance.parse_name_batch(batch_personal_name_in=batch)
    return response

In [None]:
def call_api_parsename_batch(li):
    """Format a list of name records, send them as one batch, return the parsed names.

    Steps: format each record, wrap the formatted names into a batch,
    call the API with that batch, and return the response's personal_names.
    """
    formatted_names = format_full_clean_names(li)
    response = parsename_batch(format_full_name_batch(formatted_names))
    return response.personal_names

In [None]:
batch_size = 1000 #1000 is the API limit given by NamSor
start = 0
end = batch_size
result = []

# One dict per name ({'name': ..., 'clean_name': ...}); the call loop consumes
# this list from the front, batch_size entries at a time.
names_stack = names_to_split[['name', 'clean_name']].to_dict('records')

# Ceiling division: a trailing partial batch still costs one full call, so
# e.g. 2500 names need 3 calls (plain "/" would have reported 2.5).
print('Will need to make {} calls.'.format(-(-len(names_stack) // batch_size)))

In [None]:
# >>> CALL API
# Now parsing the full names into first and last name, sending in
# one batch at a time and saving the result answer by answer.
#
# A batch is removed from names_stack only after its API call succeeded, so
# an interrupted run can be resumed by re-running this cell.
import time  # only needed here, for the pause between retries

max_retries = 5       # consecutive failed calls tolerated before giving up
failed_attempts = 0   # reset after every successful batch

while names_stack:
    try:
        # The slice also covers the final, shorter batch, so no separate
        # "leftover" call is needed after the loop.
        parsed_batch = call_api_parsename_batch(names_stack[start:end])
    except ApiException as e:
        print("Exception when calling PersonalApi: parse_name_batch: {}".format(e))
        failed_attempts += 1
        # Every name must be either still on the stack or already in result.
        no_names_lost = (len(names_to_split) - len(result)) == len(names_stack)
        if not no_names_lost or failed_attempts > max_retries:
            # Inconsistent bookkeeping or a persistent error (e.g. invalid key):
            # stop instead of calling the API in an endless loop.
            raise
        print("No names got lost. Trying again with stack size {}...".format(len(names_stack)))
        time.sleep(2 ** (failed_attempts - 1))  # back off: 1, 2, 4, 8, 16 seconds
        continue

    result = result + parsed_batch
    del names_stack[start:end]
    failed_attempts = 0
    print("Batch of names analyzed. {} names left.".format(len(names_stack)))

print("All batches analyzed.")

In [None]:
# >>> Save results
# Copy the parsed first/last names from the API results (list of
# openapi_client.models.PersonalNameParsedOut) into the names dataframe,
# using each result's id as the row label.
print('Filling the results into the names dataframe...')
for oapi_el in result:
    name = oapi_el.first_last_name
    # If the result carries no parsed-name object (name is None), getattr
    # falls back to None for both parts. This replaces a bare `except:` that
    # would also have hidden unrelated errors and KeyboardInterrupt.
    names.at[oapi_el.id, 'first_name'] = getattr(name, 'first_name', None)
    names.at[oapi_el.id, 'last_name'] = getattr(name, 'last_name', None)

print('Dataframe completed with API results. Here is a sample: {}'.format(names[:5]))

In [None]:
print("Saving parsed names...")
# NOTE(review): the index is written too (index=True is the to_csv default);
# since the frame was indexed with set_index("name", drop=False), the file
# gets the index labelled "name" plus the "name" column. Confirm downstream
# readers expect that before changing it.
names.to_csv(path_or_buf="../../_data/names_cat_i1.csv", index=True)
print("Parsed names saved!")