In [None]:
# DATA IMPROVEMENT Part 1 b)
# In this notebook, the classification of all names with a score < 5 is improved
# by using the API endpoint that takes the first and last name separately (not the full name).
# The first and last names were determined using the API in the previous step.

In [None]:
# >>> Preparation
import pandas

# Load only the columns that are used in this notebook.
print("Importing cleaned names for improvement... ")
needed_columns = ["name", "n_publs", "likely_gender", "score", "first_name", "last_name"]
names = pandas.read_csv("data/names_cat_i1.csv", usecols=needed_columns)
print("Names imported.")

In [None]:
# Setting index & accessing cells: https://pythonhow.com/accessing-dataframe-columns-rows-and-cells/
# Index the rows by name so that single cells can be addressed by name;
# drop=False keeps "name" available as a regular column as well.
names = names.set_index(keys="name", drop=False)
print(names.head(10))

In [None]:
print("Getting private key... ")
# Get private API Key for NamSor API v2 (contained in txt file)
key = ''

# Import personal key. Surrounding whitespace (typically the trailing newline
# that editors append to a text file) is stripped so that only the key itself
# is used for authentication.
try:
    with open("key.txt", "r") as key_file:
        key = key_file.read().strip()
except FileNotFoundError:
    # Leave the key empty so that the check below reports the problem.
    pass

if key:
    print("Got private key.")
else:
    print("Could not find private key. Please check the file name and make sure you have an API key.")

In [None]:
# Setting up NamSor API v2 to get the gender of a name
# https://www.namsor.com/
# https://v2.namsor.com/NamSorAPIv2/apidoc.html
# using NamSor API v2 Python SDK
# https://github.com/namsor/namsor-python-sdk2
# licensed under GNU Affero General Public License v3.0

# Following script partly taken from https://github.com/namsor/namsor-python-sdk2 "Getting Started" 
# and adapted to keep key private and remove unnecessary lines.

print("Setting up NamSor API v2 connection settings...")

import openapi_client
from openapi_client.rest import ApiException

# Authorize requests with the private key (api_key setting 'X-API-KEY').
configuration = openapi_client.Configuration()
configuration.api_key['X-API-KEY'] = key
# Build an API client from the configuration and create the PersonalApi
# instance that is used for the classification calls.
api_client = openapi_client.ApiClient(configuration)
pers_api_instance = openapi_client.PersonalApi(api_client)

In [None]:
# Cleaning function
import re

def clean_name(name):
    """Remove dots, brackets and abbreviations from a name.

    Dots and round brackets are deleted, the name is split at whitespace and
    every part with a single character (an abbreviation such as the "J" left
    over from "J.") is dropped.

    Parameters:
        name: the name to clean. Anything that is not a string (None, NaN,
            numpy floats, pandas.NA, ...) is treated as a missing name.

    Returns:
        The remaining parts joined by single spaces, or None if the name is
        missing or nothing useful is left after cleaning.
    """
    # Missing values do not always arrive as a plain Python float NaN, so every
    # non-string is rejected here instead of failing inside re.sub.
    if not isinstance(name, str):
        return None

    without_punctuation = re.sub(r'[.()]', '', name)
    # split() without arguments also discards leading, trailing and repeated whitespace.
    useful_parts = [part for part in without_punctuation.split() if len(part) > 1]

    if not useful_parts:
        return None

    return ' '.join(useful_parts)

In [None]:
# Run the cleaning function over every first name and store the result back in the column.
cleaned_first_names = names['first_name'].apply(clean_name)
names['first_name'] = cleaned_first_names

In [None]:
# Correct by hand
# Only touch the entry if it exists: assigning through .at with a label that is
# not in the index would silently append a new, otherwise empty row.
manual_fix_name = '(Max) Zong-Ming Cheng'
if manual_fix_name in names.index:
    names.at[manual_fix_name, 'first_name'] = 'Max Zong-Ming'
    names.at[manual_fix_name, 'last_name'] = 'Cheng'
else:
    print("Manual correction skipped: '{}' is not in the data.".format(manual_fix_name))

In [None]:
# Run the cleaning function over every last name and store the result back in the column.
cleaned_last_names = names['last_name'].apply(clean_name)
names['last_name'] = cleaned_last_names

In [None]:
# Formatting the names using the API's models
import math

def format_split_name(names_entry):
    """Wrap one name record in the API's FirstLastNameIn model.

    names_entry is a mapping with the keys 'name', 'first_name' and
    'last_name'; the value of 'name' is passed on as the id of the entry.
    """
    full_name = names_entry['name']
    given_name = names_entry['first_name']
    family_name = names_entry['last_name']
    return openapi_client.FirstLastNameIn(id=full_name, first_name=given_name, last_name=family_name)

In [None]:
def format_split_names(li):
    """Format every name record in li with format_split_name and return the results as a list."""
    return [format_split_name(entry) for entry in li]

In [None]:
def format_split_name_batch(li):
    """Bundle a list of formatted split names into one BatchFirstLastNameIn object."""
    formatted_batch = openapi_client.BatchFirstLastNameIn(personal_names=li)
    return formatted_batch

In [None]:
def splitname_batch(batch):
    """Send a formatted batch of split names to the API's gender_batch call and return its response."""
    api_response = pers_api_instance.gender_batch(batch_first_last_name_in=batch)
    return api_response

In [None]:
def call_api_splitname_batch(li):
    """Classify a list of unformatted name records with one API call.

    The records are formatted, bundled into a batch and sent to the API;
    the personal_names attribute of the API response is returned.
    """
    formatted_batch = format_split_name_batch(format_split_names(li))
    return splitname_batch(formatted_batch).personal_names

In [None]:
# Now classifying the split names (first and last name): the names are sent in
# one batch at a time and the results are collected answer by answer.

batch_size = 1000 #1000 is the API limit given by NamSor
start = 0
end = batch_size
result = []

# Only names whose current classification is too unsure (score < 5) are classified again.
names_to_consider = names[names['score'] < 5]

names_stack = names_to_consider[['name', 'first_name', 'last_name']].to_dict('records')
# Keep only entries with a usable first AND last name. Checking for str (instead
# of "is not None") also drops NaN, which may stand in for a missing value.
names_stack = [
    entry for entry in names_stack
    if isinstance(entry['first_name'], str) and isinstance(entry['last_name'], str)
]

# A partially filled last batch still needs a call of its own, so round up.
print('Will need to make {} calls.'.format(math.ceil(len(names_stack) / batch_size)))

In [None]:
# >>> Call API: Classify pairs of first and last names
# Names are taken off the stack one batch at a time. A batch is only removed
# after it was classified successfully, so no names are lost on an API error.
max_failed_attempts = 3  # consecutive API errors tolerated before giving up
failed_attempts = 0

while names_stack:
    current_batch = names_stack[:batch_size]  # the last batch may be smaller
    try:
        result.extend(call_api_splitname_batch(current_batch))
    except ApiException as e:
        print("Exception when calling PersonalApi: gender_first_last_name_batch: {}".format(e))
        failed_attempts += 1
        if failed_attempts >= max_failed_attempts:
            # Stop instead of retrying the same batch forever
            # (e.g. because of an invalid key or an exhausted quota).
            print("Giving up after {} failed attempts. {} names left unclassified.".format(failed_attempts, len(names_stack)))
            break
        continue

    failed_attempts = 0
    del names_stack[:batch_size]
    print("Batch of names analyzed. {} names left.".format(len(names_stack)))

if names_stack:
    print("Stopped early. Returned {} results.".format(len(result)))
else:
    print("All batches analyzed. Returned {} results.".format(len(result)))

In [None]:
# Write the API results (list of
# openapi_client.models.personal_name_gendered_out.PersonalNameGenderedOut)
# back into the names dataframe, one entry at a time.
print('Filling the results into the names dataframe...')
for classified in result:
    rounded_score = round(classified.score)
    # consider a score of less than 5 as too unsure
    gender_label = classified.likely_gender if rounded_score >= 5 else "unknown"

    # The id of each result is used as the row label in the dataframe.
    names.at[classified.id, 'score'] = rounded_score
    names.at[classified.id, 'likely_gender'] = gender_label
print('Dataframe completed with API results. \n Here is a sample: {}'.format(names.head(5)))

In [None]:
# >>> Save results
# Note: this writes to the same file the names were loaded from and replaces its content.
print("Saving names...")
output_path = "data/names_cat_i1.csv"
names.to_csv(output_path)
print("Names saved!")