In [1]:
import openapi_client
import pandas as pd
import numpy as np
import time
from tqdm import tqdm
import os
import re
import zipfile

In [2]:
# Configure API key authorization: api_key
# More Info: https://api.namsor.com/namsor/faces/viewapikey.xhtml
configuration = openapi_client.Configuration()
# The key is read from the NAMSOR_API_KEY environment variable so the secret never has to be
# stored in the notebook; the placeholder is kept as a fallback when the variable is not set.
configuration.api_key['X-API-KEY'] = os.environ.get('NAMSOR_API_KEY', 'YOUR_API_KEY')

# create an instance of the API class
api_instance = openapi_client.PersonalApi(openapi_client.ApiClient(configuration))

# Prediction of Gender

In this notebook we will predict the gender for the names using the NamSor API. For this purpose we will use two different methods available in the NamSor API: `Gender_Geo` and `Gender`.

The first method (`Gender_Geo`) is more accurate and requires three different inputs for the prediction: **First Name**, **Last Name**, and **Country of Residence**. However, since we don't have access to the **Country of Residence** for all of our names, we use the second method (`Gender`), which requires only two inputs (**First Name**, **Last Name**), to complete the gender prediction for the remaining names.

# 1. Predicting the Genders Using `Gender_Geo` method

## 1.1. Loading raw (first name, last name, country codes) triples

**Note:** For running this section you need to first download the `gs://uspto-data/final_dataset/raw_name_triples.csv` file and add it under the following directory: `./data_namsor/`

In [21]:
# Read the raw (first name, last name, country code) triples and report how many rows were loaded
raw_triples_path = './data_namsor/raw_name_triples.csv'
df_names = pd.read_csv(raw_triples_path, low_memory=False)
n_rows = len(df_names)
print('Number of rows: {:,}'.format(n_rows))
df_names.head()

Number of rows: 3,836,819


Unnamed: 0,name_first,name_last,country_code
0,SARA,MARTINSEN,DK
1,MARIE,TURKINGTON,IE
2,KJELL,JACOBSEN,NO
3,VICTOR,SOLOV'EV,RU
4,VICTORIA,PAULSEN,NO


In [22]:
### Processing the names
# Characters that are stripped from both name columns
special_chars = r'[\*\@\#\!\%\^\&\(\)\$\.\,]+'
for name_col in ['name_first', 'name_last']:
    # Missing names become empty strings, then the special characters are removed.
    # Assigning the whole column back avoids chained indexing (`df.col[mask] = ...`),
    # which pandas flags with SettingWithCopyWarning and which may not modify `df_names`.
    df_names[name_col] = (df_names[name_col]
                          .fillna('')
                          .str.replace(special_chars, '', regex=True))

In [23]:
# Shape of the subset of rows that have no country code
df_names.loc[df_names['country_code'].isna()].shape

(438726, 3)

## 1.2. Requesting the NamSor API for the prediction

In [6]:
### Resuming from the last checkpoint
N = df_names.shape[0]
# Checkpoints are written to this directory by the request loop, so this is also where the
# already-processed indices have to be looked up (the previous './data/...' path never matched).
checkpoint_dir = './data_namsor/namesCountry_gender'
# Make sure the directory exists before the long-running request loop tries to write into it
os.makedirs(checkpoint_dir, exist_ok=True)

# Finding the latest available index, for continuing the requests
files = [root+'/'+file_name for root, directory, file_names in os.walk(checkpoint_dir) for file_name in file_names
         if re.search(r'^(namesCountry\_gender\_check\_\d+)\.csv$', file_name)]
idx_start = -1
for file in files:
    idx_start = max(idx_start, pd.read_csv(file, usecols=['indexing'])['indexing'].max())
# First row that has not been written to a checkpoint yet (0 when no checkpoint exists)
idx_start = int(idx_start) + 1

# batch size and number of requested rows to be saved in each checkpoint file
# (`steps` must be a multiple of `batch_size` for the checkpoint condition to trigger)
batch_size = 100
steps = 20000
error_log = []
# create an instance of the API class
api_instance = openapi_client.PersonalApi(openapi_client.ApiClient(configuration))
df_gender = pd.DataFrame(columns=['indexing','first_name', 'last_name', 'country', 'gender', 'gender_scale', 'score'])

In [8]:
# Requesting the `Gender` information
# Column order of every checkpoint file
result_columns = ['indexing', 'first_name', 'last_name', 'country', 'gender', 'gender_scale', 'score']
checkpoint_path = './data_namsor/namesCountry_gender/namesCountry_gender_check_{:03d}.csv'

start_time = time.perf_counter()
pbar = tqdm(total=(N-idx_start))
# Rows collected since the last checkpoint; kept as plain dicts and converted to a DataFrame only
# when a checkpoint is written, instead of growing a DataFrame row by row with `append`
pending_rows = []

for i in range(idx_start, N, batch_size):
    df_iter = df_names.iloc[i:i+batch_size, :]
    namesCountry_list = [dict(firstName=x, lastName=y, countryIso2=z) for k, (x, y, z) in df_iter.iterrows()]
    batch_first_last_name_geo_in = openapi_client.BatchFirstLastNameGeoIn(namesCountry_list)
    try:
        api_response = api_instance.gender_geo_batch(batch_first_last_name_geo_in=batch_first_last_name_geo_in)
        # The j-th returned name is paired with the j-th row of the batch
        batch_rows = [dict(indexing=df_iter.index[j], first_name=dic.first_name,
                           last_name=dic.last_name, country=df_iter.country_code.iloc[j],
                           gender=dic.likely_gender, gender_scale=dic.gender_scale, score=dic.score)
                      for j, dic in enumerate(api_response.personal_names)]
    except Exception as exc:
        # The whole batch is logged and stored with empty predictions so that it can be retried later.
        # `Exception` (instead of a bare except) keeps Ctrl-C / kernel interrupts working.
        error_log.append(dict(range_idx=(i,i+batch_size), namesCountry_list=namesCountry_list,
                              df_portion=df_iter, error=repr(exc)))
        # One placeholder row per row of the batch; sized by the batch itself, so a shorter
        # final batch no longer causes a length mismatch
        batch_rows = [dict(indexing=idx, first_name=first, last_name=last, country=country,
                           gender=np.nan, gender_scale=np.nan, score=np.nan)
                      for idx, first, last, country in zip(df_iter.index, df_iter.name_first,
                                                           df_iter.name_last, df_iter.country_code)]
    pending_rows.extend(batch_rows)
    # Advance by the number of rows actually handled (the final batch may be shorter)
    pbar.update(len(df_iter))

    if (i+batch_size)%steps==0:
        # Checkpoint: write everything collected since the previous checkpoint and start over
        df_gender = pd.DataFrame(pending_rows, columns=result_columns)
        df_gender.to_csv(checkpoint_path.format(i//steps), encoding='UTF-8', index=False)
        pending_rows = []

# Rows left over after the last full checkpoint
df_gender = pd.DataFrame(pending_rows, columns=result_columns)
df_gender.to_csv(checkpoint_path.format(i//steps+1), encoding='UTF-8', index=False)
pbar.close()
stop_time = time.perf_counter()
print('\nTotal processing time was {:,} hours and {:,} mintues!'.format((stop_time-start_time)//3600, 
                                                                        ((stop_time-start_time)//60)%60))
print('Total number of errors: {:,}'.format(len(error_log)))

 19%|█▊        | 716500/3836819 [48:25<3:09:22, 274.62it/s]IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

 41%|████      | 1569300/3836819 [1:39:14<2:10:48, 288.92it/s]IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

3836900it [3:55:04, 338.19it/s]                             


Total processing time was 3.0 hours and 55.0 mintues!
Total number of errors: 4,436





## 1.3. Iterating Over the Failed Requests

In [15]:
# Collecting the checkpoint files written by the request loop (sorted by file name)
files_list = sorted([root+'/'+file for root, directory, files in 
                     os.walk('./data_namsor/namesCountry_gender') for file in files 
                     if re.search(r'^(namesCountry\_gender\_check\_\d+)\.csv$', file)])
batch_size = 1
error_log = []
# The `df_remained = df_names_gender[...]` filter that used to end this cell was dropped:
# `df_names_gender` is only built by the following cell (so it raised a NameError on a fresh
# kernel), and `df_remained` is recomputed before it is first used by the retry loop.

In [None]:
# Concatenating all checkpoint files into one dataframe
result_columns = ['indexing','first_name', 'last_name', 'country', 'gender', 'gender_scale', 'score']
pbar = tqdm(total=len(files_list))
# Every file is read once and all frames are concatenated in a single call, in `files_list` order
frames = []
for file in files_list:
    frames.append(pd.read_csv(file, low_memory=False))
    pbar.update(1)
pbar.close()
if frames:
    df_names_gender = pd.concat(frames, axis=0, ignore_index=True)
else:
    # pd.concat raises on an empty list; fall back to an empty frame with the expected columns
    df_names_gender = pd.DataFrame(columns=result_columns)
print('Terminated!')

In [4]:
### Iterating over the failed requests

# Getting the result files' names
files_list = sorted([root+'/'+file for root, directory, files in 
                     os.walk('./data_namsor/namesCountry_gender') for file in files 
                     if re.search(r'^(namesCountry\_gender\_check\_\d+)\.csv$', file)])
batch_size = 1 
error_log = []
# A row still needs a prediction when all three result fields are empty; only rows that do
# have a country code are retried here, because `gender_geo` takes the country as an input.
result_fields = ['gender', 'gender_scale', 'score']
df_remained = df_names_gender[(df_names_gender[result_fields].isna().all(axis=1))&(df_names_gender['country'].notna())]
# The concatenated frame is only used to size the progress bar
pbar = tqdm(total=df_remained.shape[0])

for file in files_list:
    df_gender = pd.read_csv(file)
    df_remained = df_gender[(df_gender[result_fields].isna().all(axis=1))&(df_gender['country'].notna())]
    if df_remained.empty:
        # Nothing to retry in this checkpoint file, so it is left untouched
        continue

    for idx, row in df_remained.iterrows():
        # Kept only so that a failed request can be inspected in `error_log`
        namesCountry_list = [dict(firstName=row['first_name'], lastName=row['last_name'], countryIso2=row['country'])]
        try:
            api_response = api_instance.gender_geo(first_name=row['first_name'], last_name= row['last_name'], country_iso2=row['country'])
            data = pd.DataFrame(dict(indexing=row['indexing'],first_name=[row['first_name']], last_name=[row['last_name']], country=[row['country']],
                                              gender=[api_response.likely_gender], gender_scale=[api_response.gender_scale], 
                                              score=[api_response.score]), index=[idx])
            # Overwrite the row in place with the freshly predicted values
            df_gender.loc[idx:idx] = data
        except Exception as exc:
            # `Exception` (instead of a bare except) keeps Ctrl-C / kernel interrupts working
            error_log.append(dict(range_idx=idx, namesCountry_list=namesCountry_list,
                                  df_portion=df_gender.loc[idx:idx], error=repr(exc)))
        # One tick per retried row, whether the request succeeded or failed
        pbar.update(1)
    # Save back the modified file
    df_gender.to_csv(file, encoding='UTF-8',index=False)
pbar.close()

100%|██████████| 4378/4378 [09:21<00:00,  8.57it/s]


# 2. Predicting the Genders Using `Gender` method

In [None]:
# Collecting the checkpoint files (sorted by file name)
files_list = sorted([root+'/'+file for root, directory, files in 
                     os.walk('./data_namsor/namesCountry_gender') for file in files 
                     if re.search(r'^(namesCountry\_gender\_check\_\d+)\.csv$', file)])
# Concatenating all files into one dataframe
result_columns = ['indexing','first_name', 'last_name', 'country', 'gender', 'gender_scale', 'score']
pbar = tqdm(total=len(files_list))
# Every file is read once and all frames are concatenated in a single call, in `files_list` order
frames = []
for file in files_list:
    frames.append(pd.read_csv(file, low_memory=False))
    pbar.update(1)
pbar.close()
if frames:
    df_names_gender = pd.concat(frames, axis=0, ignore_index=True)
else:
    # pd.concat raises on an empty list; fall back to an empty frame with the expected columns
    df_names_gender = pd.DataFrame(columns=result_columns)
print('Terminated!')

In [None]:
# Retrying every row that still has no prediction, this time with the `Gender` method
# (first and last name only, no country)
files_list = sorted([root+'/'+file for root, directory, files in 
                     os.walk('./data_namsor/namesCountry_gender') for file in files 
                     if re.search(r'^(namesCountry\_gender\_check\_\d+)\.csv$', file)])
batch_size = 1
error_log = []
# A row still needs a prediction when all three result fields are empty
result_fields = ['gender', 'gender_scale', 'score']
df_remained = df_names_gender[df_names_gender[result_fields].isna().all(axis=1)]
# The concatenated frame is only used to size the progress bar
pbar = tqdm(total=df_remained.shape[0])

for file in files_list:
    df_gender = pd.read_csv(file)
    df_remained = df_gender[df_gender[result_fields].isna().all(axis=1)]
    if df_remained.empty:
        # Nothing to retry in this checkpoint file, so it is left untouched
        continue
    for idx, row in df_remained.iterrows():
        # Kept only so that a failed request can be inspected in `error_log`
        names_list = [dict(firstName=row['first_name'], lastName=row['last_name'])]
        try:
            api_response = api_instance.gender(first_name=row['first_name'], last_name= row['last_name'])
            data = pd.DataFrame(dict(indexing=row['indexing'],first_name=[row['first_name']], last_name=[row['last_name']], country=[row['country']],
                                              gender=[api_response.likely_gender], gender_scale=[api_response.gender_scale], 
                                              score=[api_response.score]), index=[idx])
            # Overwrite the row in place with the freshly predicted values
            df_gender.loc[idx:idx] = data
        except Exception as exc:
            # `Exception` (instead of a bare except) keeps Ctrl-C / kernel interrupts working
            error_log.append(dict(range_idx=idx, names_list=names_list,
                                  df_portion=df_gender.loc[idx:idx], error=repr(exc)))
        # One tick per retried row, whether the request succeeded or failed
        pbar.update(1)
    # Save back the modified file
    df_gender.to_csv(file, encoding='UTF-8',index=False)
pbar.close()

 63%|██████▎   | 277166/438713 [9:19:44<5:14:36,  8.56it/s]IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

 74%|███████▍  | 325152/438713 [10:56:35<3:42:17,  8.51it/s]IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

 76%|███████▋  | 334727/438713 [11:15:59<3:23:54,  8.50it/s]IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config var

# 3. Creating the Final Table Containing All the Predicted Genders Information

In [53]:
# Re-read the raw name triples and report how many rows were loaded
raw_triples_path = './data_namsor/raw_name_triples.csv'
df_names = pd.read_csv(raw_triples_path, low_memory=False)
n_rows = len(df_names)
print('Number of rows: {:,}'.format(n_rows))
df_names.head(2)

Number of rows: 3,836,819


Unnamed: 0,name_first,name_last,country_code
0,SARA,MARTINSEN,DK
1,MARIE,TURKINGTON,IE


In [54]:
### Concatenating the final prediction results into one dataframe
files_list = sorted([root+'/'+file for root, directory, files in 
                     os.walk('./data_namsor/namesCountry_gender') for file in files 
                     if re.search(r'^(namesCountry\_gender\_check\_\d+)\.csv$', file)])
result_columns = ['indexing','first_name', 'last_name', 'country', 'gender', 'gender_scale', 'score']
pbar = tqdm(total=len(files_list))
# Every file is read once and all frames are concatenated in a single call, in `files_list` order
frames = []
for file in files_list:
    frames.append(pd.read_csv(file, low_memory=False))
    pbar.update(1)
pbar.close()
if frames:
    df_names_gender = pd.concat(frames, axis=0, ignore_index=True)
else:
    # pd.concat raises on an empty list; fall back to an empty frame with the expected columns
    df_names_gender = pd.DataFrame(columns=result_columns)
print('Terminated!')

100%|██████████| 192/192 [00:21<00:00,  4.61it/s]

Terminated!





In [55]:
### Checking the number of remaining NA values
# A prediction counts as invalid when gender, gender_scale and score are all missing
prediction_cols = ['gender', 'gender_scale', 'score']
all_missing = df_names_gender[prediction_cols].isna().all(axis=1)
n_invalid = int(all_missing.sum())
print('Number of invalid gender values: {}'.format(n_invalid))
df_names_gender.head(2)

Number of invalid gender values: 15


Unnamed: 0,indexing,first_name,last_name,country,gender,gender_scale,score
0,0,SARA,MARTINSEN,DK,female,1.0,4.061213
1,1,MARIE,TURKINGTON,IE,female,1.0,4.319883


In [56]:
### Renaming the columns
# Old column name -> new column name for the two name columns of the prediction results
renamed_name_columns = {'first_name': 'name_first_har', 'last_name': 'name_last_har'}
df_names_gender = df_names_gender.rename(renamed_name_columns, axis='columns')
df_names_gender.head(2)

Unnamed: 0,indexing,name_first_har,name_last_har,country,gender,gender_scale,score
0,0,SARA,MARTINSEN,DK,female,1.0,4.061213
1,1,MARIE,TURKINGTON,IE,female,1.0,4.319883


In [52]:
### Sorting the results based on the initial indices
# Put the predictions (ordered by their original row index) side by side with the raw triples
predictions_sorted = df_names_gender.sort_values(by=['indexing']).reset_index(drop=True)
df_temp = pd.concat([df_names, predictions_sorted], axis=1)
# Rows whose raw country code is present but differs from the country stored with the prediction
has_raw_code = df_temp['country_code'].notna()
code_differs = df_temp['country_code'] != df_temp['country']
n_mismatched = df_temp[code_differs & has_raw_code].shape[0]
print('Number of invalid country codes:', n_mismatched)

Number of invalid country codes: 0


In [57]:
### Merging the raw name information into the predicted genders
df_names_gender = df_names_gender.sort_values(by=['indexing']).reset_index(drop=True)
# concat(axis=1) aligns the two frames on their index; with a different number of rows the
# result would silently contain half-empty rows, so the lengths are checked first
assert len(df_names_gender) == len(df_names), 'predictions and raw name triples differ in length'

df_names_gender = pd.concat([df_names, df_names_gender], axis=1)
print('Before rearranging the columns:')
display(df_names_gender.head(2))
# Final column order, selected by name instead of by position; the duplicated `country`
# column (same information as `country_code`) is dropped
final_columns = ['indexing', 'name_first', 'name_last', 'country_code',
                 'name_first_har', 'name_last_har', 'gender', 'gender_scale', 'score']
df_names_gender = df_names_gender[final_columns]
print('After rearranging the columns:')
df_names_gender.head(2)

Before rearranging the columns:


Unnamed: 0,name_first,name_last,country_code,indexing,name_first_har,name_last_har,country,gender,gender_scale,score
0,SARA,MARTINSEN,DK,0,SARA,MARTINSEN,DK,female,1.0,4.061213
1,MARIE,TURKINGTON,IE,1,MARIE,TURKINGTON,IE,female,1.0,4.319883


After rearranging the columns:


Unnamed: 0,indexing,name_first,name_last,country_code,name_first_har,name_last_har,gender,gender_scale,score
0,0,SARA,MARTINSEN,DK,SARA,MARTINSEN,female,1.0,4.061213
1,1,MARIE,TURKINGTON,IE,MARIE,TURKINGTON,female,1.0,4.319883


In [58]:
# Saving the results
#df_names_gender.to_csv('./data_namsor/genderCountry_processed.csv', encoding='utf-8', index=False)

# 4. Creating the BigQuery Table

In [None]:
### Creating the tables' Schema
# NOTE(review): `bigquery` is not imported in the notebook's import cell — presumably
# `from google.cloud import bigquery` is needed before this cell can run; confirm.
# (column name, BigQuery type) pairs, in the column order of the table
name_gender_fields = [
    ('index', 'INTEGER'),
    ('name_first', 'STRING'),
    ('name_last', 'STRING'),
    ('country_code', 'STRING'),
    ('name_first_har', 'STRING'),
    ('name_last_har', 'STRING'),
    ('predicted_gender', 'STRING'),
    ('gender_scale', 'FLOAT'),
    ('score_gender', 'FLOAT'),
]
# Every field is built with the same trailing arguments: 'NULLABLE', None, ()
schema_Name_gen = [bigquery.SchemaField(field_name, field_type, 'NULLABLE', None, ())
                   for field_name, field_type in name_gender_fields]

In [None]:
### Initializing the BigQuery Client and Job Config
client = bigquery.Client()

# Load-job options, applied to the config object one attribute at a time
job_config = bigquery.LoadJobConfig()
load_options = [
    ('skip_leading_rows', 1),
    ('max_bad_records', 300),
    ('source_format', bigquery.SourceFormat.CSV),
    ('schema', schema_Name_gen),
]
for option_name, option_value in load_options:
    setattr(job_config, option_name, option_value)
dataset_ref = client.dataset('final_dataset')

In [None]:
### Running the BigQuery Job
# CSV in Cloud Storage that is loaded into the `15_name_gender` table
source_uri = 'gs://uspto-data/final_dataset/genderCountry_processed.csv'
table_ref = dataset_ref.table('15_name_gender')
load_job = client.load_table_from_uri(source_uris=source_uri, destination=table_ref,
                                      job_id_prefix='lgs-', job_config=job_config)
# Block until the load job has finished
load_job.result()

# Final Notes

In the end, we found out that, since pandas treats the string **"NA"** as a missing value (`NaN`) when it reads a **csv** file, we have lost the country code for the names that are from **Namibia**, whose ISO2 country code is `NA`. Only `23` names are affected, which will not create any problem for our purpose.

In [55]:
# Loading Names with country Codes
# Default NA parsing is switched off (only the two-space string counts as missing),
# so the literal string 'NA' is kept as a value
read_options = dict(low_memory=False, keep_default_na=False, na_values='  ')
df_names = pd.read_csv('./data_namsor/raw_name_triples.csv', **read_options)
n_rows = len(df_names)
print('Number of rows: {:,}'.format(n_rows))
df_names.head(2)

Number of rows: 3,836,819


Unnamed: 0,name_first,name_last,country_code
0,SARA,MARTINSEN,DK
1,MARIE,TURKINGTON,IE


In [56]:
# Shape of the subset of rows whose country code is the literal string 'NA'
df_names.loc[df_names['country_code'] == 'NA'].shape

(23, 3)