In [None]:
# 5. STEP A: IMPROVE DATA
# TEST 1: Test if API that takes first and last name is better than API that takes full name
# In the last step we found out that there are many names with a fairly low score (5 or under)
# Here we are going to test what could help to improve scores.

In [1]:
# >>> Load the categorized names from disk
import pandas

print("Importing categorized names... ")
# Only the four columns used in this test are read. The name becomes the row
# label and, because of drop=False, also stays available as a regular column,
# so single cells can later be addressed by name.
# Background on index-based access: https://pythonhow.com/accessing-dataframe-columns-rows-and-cells/
names = (
    pandas.read_csv("data/names_cat.csv", usecols=["name", "n_publs", "likely_gender", "score"])
    .set_index("name", drop=False)
)
print("Names imported. Here are some: {}".format(names.head(10)))

Importing categorized names... 
Names imported. Here are some:                                         name  n_publs likely_gender  score
name                                                                      
'Maseka Lesaoana            'Maseka Lesaoana        2        female      8
(David) Jing Dai            (David) Jing Dai        1          male      7
(Max) Zong-Ming Cheng  (Max) Zong-Ming Cheng        2          male      2
(Sophy) Shu-Jiun Chen  (Sophy) Shu-Jiun Chen        2        female      7
(Zhou) Bryan Bai            (Zhou) Bryan Bai        2          male      6
A Clara Kanmani              A Clara Kanmani        1        female      4
A Lun                                  A Lun        1          male      1
A Min Tjoa                        A Min Tjoa      211          male      2
A S Akshaya                      A S Akshaya        1          male      2
A'ang Subiyakto              A'ang Subiyakto        2        female      0


In [2]:
# Summary statistics of the numeric columns (n_publs, score) over all imported names.
names.describe()

Unnamed: 0,n_publs,score
count,2306418.0,2306418.0
mean,5.78982,4.9633
std,18.27144,3.506927
min,1.0,0.0
25%,1.0,2.0
50%,2.0,4.0
75%,4.0,7.0
max,1694.0,33.0


In [None]:
'''
        n_publs 	score
count 	2.306418e+06 	2.306418e+06
mean 	5.789820e+00 	4.963300e+00
std 	1.827144e+01 	3.506927e+00
min 	1.000000e+00 	0.000000e+00
25% 	1.000000e+00 	2.000000e+00
50% 	2.000000e+00 	4.000000e+00
75% 	4.000000e+00 	7.000000e+00
max 	1.694000e+03 	3.300000e+01
'''

In [None]:
# >>>  Prepare Test names 

In [3]:
# For a population of n = 2306418, a confidence level of 99% and a margin of error of 4%
# call for a test population of 1040 (according to SurveyMonkey: https://www.surveymonkey.de/mp/sample-size-calculator/).
# For simplicity a test size of 1000 is used and a slightly larger margin of error is accepted.
def chose_names_with_score(n, sample_size=1000, random_state=None):
    """Draw a random test sample of names whose score is at most ``n``.

    Reads the module-level ``names`` dataframe.

    Parameters
    ----------
    n : number
        Upper bound (inclusive) for the ``score`` column.
    sample_size : int, default 1000
        Number of rows to draw. If fewer rows match, all matching rows are
        returned (in random order) instead of raising.
    random_state : optional
        Passed on to ``DataFrame.sample``; set it to make the draw repeatable.

    Returns
    -------
    pandas.DataFrame
        The sampled rows of ``names``.
    """
    candidates = names[names['score'] <= n]
    # DataFrame.sample raises a ValueError when asked for more rows than exist
    # (without replacement), so the request is capped at what is available.
    sample_size = min(sample_size, len(candidates))
    return candidates.sample(sample_size, random_state=random_state)

In [4]:
import re

def clean(x):
    """Pair a name with a variant that has all digits and trailing whitespace removed.

    Takes a string and returns a pandas Series with the entries 'name'
    (the unchanged input) and 'clean_name' (the cleaned variant).
    """
    # Strip numbers from a string: https://stackoverflow.com/questions/16849109/strip-out-numbers-from-a-string
    without_digits = re.sub(r'\d+', '', x)
    # Remove white space at end from string: https://stackoverflow.com/questions/2372573/how-do-i-remove-whitespace-from-the-end-of-a-string-in-python
    cleaned = without_digits.rstrip()

    return pandas.Series({'name': x, 'clean_name': cleaned})

In [5]:
def clean_names(names):
    """Return a copy of ``names`` with an added 'clean_name' column.

    Every value of the 'name' column is passed through ``clean`` and the
    resulting cleaned name is stored in the same row.

    The column is attached by position rather than by merging on the index.
    A merge would multiply rows whenever the same name occurs more than once,
    fail on an empty frame, and produce 'clean_name_x'/'clean_name_y' when the
    function is run on a frame that already has a 'clean_name' column.

    Parameters
    ----------
    names : pandas.DataFrame
        Frame with a 'name' column holding strings.

    Returns
    -------
    pandas.DataFrame
        New frame with the same rows, index and columns plus 'clean_name'.
    """
    clean_column = [clean(full_name)['clean_name'] for full_name in names['name']]
    return names.assign(clean_name=clean_column)

In [6]:
score = 3 # Try this test with different scores!

# Draw the test sample of names at or below the chosen score
sampled_names = chose_names_with_score(score)
print("Will be parsing and reconsidering {} names.".format(len(sampled_names)))

# Add the cleaned variant of every sampled name
names_to_consider = clean_names(sampled_names)
print("Cleaned names. Here are some: {}".format(names_to_consider.head(5)))

Will be parsing and reconsidering 1000 names.
Cleaned names. Here are some:                                                       name  n_publs  \
name                                                                  
Camille Desenclos                        Camille Desenclos        1   
G. Galvin                                        G. Galvin        1   
Zhihong He                                      Zhihong He        7   
Selvaggia Cognetti De Martis  Selvaggia Cognetti De Martis        1   
Heng Zhang 0002                            Heng Zhang 0002        7   

                             likely_gender  score  \
name                                                
Camille Desenclos                   female      2   
G. Galvin                             male      1   
Zhihong He                            male      2   
Selvaggia Cognetti De Martis        female      3   
Heng Zhang 0002                       male      2   

                                                clean_

In [7]:
# Summary statistics of all names at or below the chosen score, i.e. the
# population the test sample was drawn from.
names[names['score'] <= score].describe()

Unnamed: 0,n_publs,score
count,946463.0,946463.0
mean,5.138557,1.700917
std,17.444404,1.059467
min,1.0,0.0
25%,1.0,1.0
50%,1.0,2.0
75%,3.0,3.0
max,1174.0,3.0


In [8]:
''' Score 3
count 	946463.00 	946463.000000
mean 	5.138557 	1.700917
std 	17.444404 	1.059467
min 	1.000000 	0.000000
25% 	1.000000 	1.000000
50% 	1.000000 	2.000000
75% 	3.000000 	3.000000
max 	1174.000 	3.000000
'''

' Score 3\ncount \t946463.00 \t946463.000000\nmean \t5.138557 \t1.700917\nstd \t17.444404 \t1.059467\nmin \t1.000000 \t0.000000\n25% \t1.000000 \t1.000000\n50% \t1.000000 \t2.000000\n75% \t3.000000 \t3.000000\nmax \t1174.000 \t3.000000\n'

In [9]:
# Summary statistics of the drawn test sample, for comparison with the
# statistics of the population it was drawn from.
names_to_consider.describe()

Unnamed: 0,n_publs,score
count,1000.0,1000.0
mean,5.086,1.691
std,12.988275,1.058603
min,1.0,0.0
25%,1.0,1.0
50%,1.0,2.0
75%,4.0,3.0
max,203.0,3.0


In [10]:
''' Score 3
        n_publs 	score
count 	1000.000 	1000.000000
mean 	5.613000 	1.633000
std 	22.792734 	1.060866
min 	1.000000 	0.000000
25% 	1.000000 	1.000000
50% 	1.000000 	2.000000
75% 	3.000000 	3.000000
max 	449.00000 	3.000000
'''

' Score 3\n        n_publs \tscore\ncount \t1000.000 \t1000.000000\nmean \t5.613000 \t1.633000\nstd \t22.792734 \t1.060866\nmin \t1.000000 \t0.000000\n25% \t1.000000 \t1.000000\n50% \t1.000000 \t2.000000\n75% \t3.000000 \t3.000000\nmax \t449.00000 \t3.000000\n'

In [11]:
# Comparing the describe() results for all names at or below the chosen score with those
# for the sampled names to consider, we see that a sample of 1000 test cases
# represents the original population very well.

In [12]:
# >>> Prepare API
# Read the private API key from key.txt so it never appears in the notebook itself.
print("Getting private key... ")
key = ''

try:
    with open("key.txt", "r") as file:
        # Editors usually end the file with a newline; it must not become part
        # of the key that is later sent as an HTTP header.
        key = file.read().strip()
except FileNotFoundError:
    # Keep the empty key so the message below explains what to do instead of
    # stopping with a traceback.
    pass

if(len(key) > 0):
    print("Got private key.")
else:
    print("Could not find private key. Please check the file name and make sure you have an API key.")

Getting private key... 
Got private key.


In [13]:
# The setup below is partly taken from the "Getting Started" section of
# https://github.com/namsor/namsor-python-sdk2, adapted so that the key stays
# private and lines that are not needed here are left out.

print("Setting up NamSor API v2 connection settings...")

import openapi_client
from openapi_client.rest import ApiException

# The key read from key.txt is registered under the 'X-API-KEY' entry
configuration = openapi_client.Configuration()
configuration.api_key['X-API-KEY'] = key

# One client built from that configuration backs the personal-names API object
api_client = openapi_client.ApiClient(configuration)
pers_api_instance = openapi_client.PersonalApi(api_client)

Setting up NamSor API v2 connection settings...


In [14]:
# >>>  Prepare API Calling for Parsing full name into first and last name

In [15]:
# Formatting the names using the API's models
def format_full_clean_name(name_entry):
    """Build the API model for one full name.

    Takes a mapping with the entries 'name' and 'clean_name' and returns a
    PersonalNameIn whose id is the original name and whose name is the
    cleaned name.
    """
    original_name = name_entry['name']
    cleaned_name = name_entry['clean_name']
    return openapi_client.PersonalNameIn(id=original_name, name=cleaned_name)

In [16]:
def format_full_clean_names(li):
    """Format several full clean names at once.

    Takes an iterable of unformatted name entries and returns a list with one
    formatted full name per entry, in the same order.
    """
    return [format_full_clean_name(entry) for entry in li]

In [17]:
def format_full_name_batch(li):
    """Wrap a list of formatted full clean names into one batch object.

    Takes a list of formatted full names and returns a BatchPersonalNameIn
    that carries the list as its personal_names.
    """
    return openapi_client.BatchPersonalNameIn(personal_names=li)

In [18]:
def parsename_batch(batch):
    """Call the API to parse a batch of full names.

    Takes a formatted batch of full names, passes it to parse_name_batch of
    the module-level pers_api_instance and returns the API response unchanged.
    """
    return pers_api_instance.parse_name_batch(batch_personal_name_in=batch)

In [19]:
def call_api_parsename_batch(li):
    """Parse a list of unformatted clean names through the API.

    The entries are formatted one by one, wrapped into a batch and sent to the
    API; the personal_names of the API response are returned.
    """
    formatted_names = format_full_clean_names(li)
    request_batch = format_full_name_batch(formatted_names)
    return parsename_batch(request_batch).personal_names

In [20]:
batch_size = 1000 #1000 is the API limit given by NamSor
start = 0
end = batch_size
result = []

# One record (dict with 'name' and 'clean_name') per name that has to be parsed
names_stack = names_to_consider[['name', 'clean_name']].to_dict('records')

# Ceiling division: a partially filled last batch still costs one whole call,
# and the count is reported as an integer rather than as a float like 1.0 or 1.5.
n_calls = -(-len(names_stack) // batch_size)
print('Will need to make {} calls.'.format(n_calls))

Will need to make 1.0 calls.


In [21]:
# >>> Call API: Get first and last name of full name
# Now parsing the full names into first and last name, sending in
# one batch at a time and saving the result answer by answer.

In [22]:
# Send the stack to the API one batch at a time. A batch is only removed from
# the stack after its results were received, so a failed call loses no names
# and the same batch can simply be sent again.
max_attempts = 3  # consecutive failures tolerated for one batch before giving up
failed_attempts = 0

while (len(names_stack) > 0):
    # The last batch may be smaller than batch_size; slicing handles that
    current_batch = names_stack[:batch_size]
    try:
        parsed = call_api_parsename_batch(current_batch)
    except ApiException as e:
        failed_attempts += 1
        print("Exception when calling PersonalApi: parse_name_batch: {}".format(e))
        if (failed_attempts >= max_attempts):
            # Retrying forever on a persistent error (bad key, exhausted quota)
            # would hang the notebook, so the error is passed on instead.
            raise
        print("No names got lost. Trying again with stack size {}...".format(len(names_stack)))
        continue

    result = result + parsed
    del names_stack[:batch_size]
    failed_attempts = 0
    print("Batch of names analyzed. {} names left.".format(len(names_stack)))

print("All batches analyzed.")

Batch of names analyzed. 0 names left.
All batches analyzed.


In [23]:
# >>> Save results
# Copy the first and last name of every API result (openapi_client.models.PersonalNameParsedOut)
# into the dataframe row that is labelled with the result's id.
print('Filling the results into the names dataframe...')
for oapi_el in result:
    name = oapi_el.first_last_name
    # A result without a parsed name is stored as missing. Checking for it
    # explicitly (instead of a bare except) lets real errors and a keyboard
    # interrupt surface instead of being silently swallowed.
    if name is None:
        first_name, last_name = None, None
    else:
        first_name, last_name = name.first_name, name.last_name
    names_to_consider.at[oapi_el.id, 'first_name'] = first_name
    names_to_consider.at[oapi_el.id, 'last_name'] = last_name

print('Dataframe completed with API results. Here is a sample: {}'.format(names_to_consider[:5]))

Filling the results into the names dataframe...
Dataframe completed with API results. Here is a sample:                                                       name  n_publs  \
name                                                                  
Camille Desenclos                        Camille Desenclos        1   
G. Galvin                                        G. Galvin        1   
Zhihong He                                      Zhihong He        7   
Selvaggia Cognetti De Martis  Selvaggia Cognetti De Martis        1   
Heng Zhang 0002                            Heng Zhang 0002        7   

                             likely_gender  score  \
name                                                
Camille Desenclos                   female      2   
G. Galvin                             male      1   
Zhihong He                            male      2   
Selvaggia Cognetti De Martis        female      3   
Heng Zhang 0002                       male      2   

                          

In [24]:
# >>> Save resulting names in 'names_improved_test_score.csv'
# The chosen score is part of the file name, so runs with different scores
# do not overwrite each other.
print("Saving test names...")
output_path = "data/names_improved_test_{}.csv".format(score)
names_to_consider.to_csv(output_path)
print("Test names saved!")

Saving test names...
Test names saved!


In [25]:
# >>>  Prepare API Calling for classifying first and last name pairs

In [26]:
# Formatting the names using the API's models
import math

def format_split_name(names_entry):
    """Build the API model for one name that was split into first and last name.

    Parameters
    ----------
    names_entry : mapping
        Entry with the keys 'name', 'first_name' and 'last_name'.

    Returns
    -------
    FirstLastNameIn or None
        The formatted split name with the full name as id, or None when the
        first or the last name is missing.
    """
    first_name = names_entry['first_name']
    last_name = names_entry['last_name']
    # Entries taken from a dataframe carry NaN (not None) where a cell was
    # never filled; pandas.isna recognises both kinds of missing value.
    if pandas.isna(first_name) or pandas.isna(last_name):
        return None
    return openapi_client.FirstLastNameIn(id=names_entry['name'], first_name=first_name, last_name=last_name)

In [27]:
def format_split_names(li):
    """Format several split names at once.

    Takes an iterable of unformatted name entries and returns a list with the
    result of format_split_name for every entry, in the same order.
    """
    return [format_split_name(entry) for entry in li]

In [28]:
def format_split_name_batch(li):
    """Wrap a list of formatted split names into one batch object.

    Takes a list of formatted split names and returns a BatchFirstLastNameIn
    that carries the list as its personal_names.
    """
    return openapi_client.BatchFirstLastNameIn(personal_names=li)

In [29]:
def splitname_batch(batch):
    """Call the API to classify a batch of split names.

    Takes a formatted batch of split names, passes it to gender_batch of the
    module-level pers_api_instance and returns the API response unchanged.
    """
    return pers_api_instance.gender_batch(batch_first_last_name_in=batch)

In [30]:
def call_api_splitname_batch(li):
    """Classify a list of unformatted split names through the API.

    The entries are formatted one by one, wrapped into a batch and sent to the
    API; the personal_names of the API response are returned.
    """
    formatted_names = format_split_names(li)
    request_batch = format_split_name_batch(formatted_names)
    return splitname_batch(request_batch).personal_names

In [31]:
batch_size = 1000 #1000 is the API limit given by NamSor
start = 0
end = batch_size
result = []

# One record (dict with 'name', 'first_name' and 'last_name') per name
names_stack = names_to_consider[['name', 'first_name', 'last_name']].to_dict('records')
# Keep only names that were split successfully. pandas.notna is used because a
# missing cell can hold NaN as well as None, and NaN would pass an
# "is not None" check and be sent to the API as a name.
names_stack = [
    entry for entry in names_stack
    if pandas.notna(entry['first_name']) and pandas.notna(entry['last_name'])
]

In [32]:
# >>> Call API: Classify pairs of first and last names
# The stack is sent one batch at a time. A batch is only removed from the
# stack after its results were received, so a failed call loses no names.
max_attempts = 3  # consecutive failures tolerated for one batch before giving up
failed_attempts = 0

while (len(names_stack) > 0):
    # The last batch may be smaller than batch_size; slicing handles that
    current_batch = names_stack[:batch_size]
    try:
        classified = call_api_splitname_batch(current_batch)
    except ApiException as e:
        failed_attempts += 1
        print("Exception when calling PersonalApi: gender_first_last_name_batch: {}".format(e))
        if (failed_attempts >= max_attempts):
            # Stop instead of retrying forever on a persistent error; the
            # results collected so far are kept.
            print("Giving up after {} failed attempts. {} names remain unclassified.".format(failed_attempts, len(names_stack)))
            break
        continue

    result = result + classified
    del names_stack[:batch_size]
    failed_attempts = 0
    print("Batch of names analyzed. {} names left.".format(len(names_stack)))

print("All batches analyzed. Returned {} results.".format(len(result)))

Batch of names analyzed. 0 names left.
All batches analyzed. Returned 999 results.


In [33]:
# >>> Save results
# For every API result, store the second classification next to the first one
# in the dataframe row labelled with the result's id, together with the
# differences between the two.
print('Filling the results into the names dataframe...')
for oapi_el in result:
    # save the alternative gender classification and score
    names_to_consider.at[oapi_el.id, 'likely_gender_2'] = oapi_el.likely_gender
    names_to_consider.at[oapi_el.id, 'score_2'] = round(oapi_el.score)
    # did the gender change with the different API endpoint?
    gender_dif = (names_to_consider.at[oapi_el.id, 'likely_gender'] != names_to_consider.at[oapi_el.id, 'likely_gender_2'])
    # save results
    names_to_consider.at[oapi_el.id, 'gender_dif'] = gender_dif
    # positive values mean the split-name endpoint returned a higher score
    names_to_consider.at[oapi_el.id, 'score_dif'] = (names_to_consider.at[oapi_el.id, 'score_2'] - names_to_consider.at[oapi_el.id, 'score'])

print('Dataframe completed with API results. Here is a sample: {}'.format(names_to_consider[:5]))

Filling the results into the names dataframe...
Dataframe completed with API results. Here is a sample:                                                       name  n_publs  \
name                                                                  
Camille Desenclos                        Camille Desenclos        1   
G. Galvin                                        G. Galvin        1   
Zhihong He                                      Zhihong He        7   
Selvaggia Cognetti De Martis  Selvaggia Cognetti De Martis        1   
Heng Zhang 0002                            Heng Zhang 0002        7   

                             likely_gender  score  \
name                                                
Camille Desenclos                   female      2   
G. Galvin                             male      1   
Zhihong He                            male      2   
Selvaggia Cognetti De Martis        female      3   
Heng Zhang 0002                       male      2   

                          

In [34]:
# >>> Save resulting names in 'names_improved_test_score.csv' again
# Writes to the same path as the earlier save, now including the columns
# added by the second classification.
print("Saving test names...")
csv_path = "data/names_improved_test_{}.csv".format(score)
names_to_consider.to_csv(csv_path)
print("Test names saved!")

Saving test names...
Test names saved!
