In [19]:
#import openapi_client
import pandas as pd
import numpy as np

import json
import requests
import time
from tqdm import tqdm_notebook
import os
import re

from google.cloud import storage
from google.cloud import bigquery

# 1. Prediction Using NamePrism API

**Note:** To run this section, first download the `gs://uspto-data/final_dataset/raw_name_pairs_2002-2012.csv` file and place it under `./data-nameprism/` in the current directory.

## 1.1. Loading the raw (first name, last name) pairs for the prediction

In [None]:
# Read the raw (first name, last name) pairs for the years between 2002 and 2012
data_f = './data-nameprism/'
raw_pairs_path = data_f + 'raw_name_pairs_2002-2012.csv'

df_names = pd.read_csv(raw_pairs_path, low_memory=False)
print('Number of rows: {:,}'.format(len(df_names)))
df_names.head()

In [4]:
### Preprocessing the names: Removing special characters
# The '*_har' columns are always rebuilt from the raw columns, so this cell can be re-run safely.
# NaN names become empty strings so that the .str operations below apply to every row.
df_names['name_first_har'] = df_names['name_first'].fillna('')
df_names['name_last_har'] = df_names['name_last'].fillna('')

# Any run of these special characters is replaced by a single space
special_chars = r'[\*\@\#\!\%\^\&\(\)\$\.\,\?\-\=\+\_\;\:\/\"]+'

for col in ['name_first_har', 'name_last_har']:
    cleaned = df_names[col].str.replace(special_chars, ' ', regex=True)
    ### Preprocessing the names: Replacing white space with '%20' ASCII code for the web request
    # regex=True is passed explicitly: without it, pandas versions whose default is
    # regex=False would search for the literal text '[\s]+' and leave the white space untouched.
    df_names[col] = cleaned.str.replace(r'[\s]+', '%20', regex=True)

df_names = df_names.sort_values(by=['between_2001_2012'], ascending=False)
df_names.head(3)

Unnamed: 0,name_first,name_last,between_2001_2012,name_first_har,name_last_har
932136,MARIOS,KAGARLIS,1,MARIOS,KAGARLIS
1202078,ZHIJIAN,ZHU,1,ZHIJIAN,ZHU
1202089,ERIC,WOOD,1,ERIC,WOOD


## 1.2. Requesting the NamePrism API

In [None]:
# The API key is taken from the NAMEPRISM_API_KEY environment variable when it is set;
# otherwise replace the placeholder below with your API key. More info: http://name-prism.com/api
api_key = os.environ.get('NAMEPRISM_API_KEY', 'YOUR_API_KEY')

# Getting a sample of nationality lists: one request for a known name gives the label set
name_first = 'Barack'
name_last = 'Obama'
response = requests.get('http://www.name-prism.com/api_token/nat/json/' + api_key + '/'+ name_first+ '%20' + name_last,
                        timeout=30)  # without a timeout the call can block indefinitely
# Stop here with the HTTP status if the request was rejected, rather than failing while parsing the body
response.raise_for_status()
nat_list = sorted(response.json().keys())

In [None]:
# Resuming: collect the checkpoint files already written and the row labels they contain
check_pattern = re.compile(r'^(names\_year\_check\_\d+)\.csv$')
file_list = []
for root, directory, files in os.walk('./data-nameprism/check_points'):
    for file in files:
        if check_pattern.search(file):
            file_list.append(root+'/'+file)
nb_saved = len(file_list)

# Row labels (first CSV column) of every name that has already been processed
idx = pd.Index([])
for file in file_list:
    processed_labels = pd.read_csv(file, index_col=0).index
    idx = idx.append(processed_labels)

# Keep only the rows whose label does not appear in any checkpoint
already_done = df_names.index.isin(idx)
df_names_1 = df_names.loc[~already_done].copy()

In [8]:
# Rows still to be requested vs. rows flagged between_2001_2012 == 1 in the full table
remaining_count = len(df_names_1)
flagged_count = int((df_names.between_2001_2012 == 1).sum())
print('Number of remaining items: \t{:,}'.format(remaining_count))
print('Total:\t\t\t\t{:,}'.format(flagged_count))

Number of remaining items: 	717,982
Total:				1,146,291


In [None]:
### Requesting from NamePrism API

N = df_names_1.shape[0]
checkpoint_step = 20000        # number of names stored in each checkpoint file
time_step = 60.0/280.0         # minimum duration of one iteration, i.e. at most 280 requests per minute
max_tries = 3                  # attempts per name before the row is stored with empty scores
request_timeout = 30           # seconds; without a timeout a stalled connection blocks the loop
api_url = 'http://www.name-prism.com/api_token/nat/json/'
checkpoint_dir = './data-nameprism/check_points'
out_columns = ['name_first', 'name_last'] + nat_list

# The checkpoint directory must exist before the first to_csv call
os.makedirs(checkpoint_dir, exist_ok=True)


def _save_checkpoint(rows, row_labels, file_nb):
    """Write the collected responses to checkpoint number `file_nb` and return them as a DataFrame.

    rows: list of dicts, one per requested name (name parts plus nationality scores).
    row_labels: index labels of the corresponding rows of `df_names_1`.
    """
    df_check = pd.DataFrame(rows, index=row_labels, columns=out_columns)
    df_check.to_csv(checkpoint_dir + '/names_year_check_{:03d}.csv'.format(file_nb), encoding='UTF-8', index=True)
    return df_check


# Responses are collected in plain lists and turned into a DataFrame once per checkpoint,
# instead of growing a DataFrame one row at a time.
rows, row_labels = [], []
# The request uses the preprocessed ('%20'-encoded) name columns, selected by name
names_har = df_names_1[['name_first_har', 'name_last_har']]
# Progress Bar
pbar = tqdm_notebook(total=N)

for i, (row_label, name_first, name_last) in enumerate(names_har.itertuples(index=True, name=None)):
    start_t = time.perf_counter()

    # Default record: every nationality score missing; kept if all attempts fail
    record = dict.fromkeys(nat_list)
    for k in range(max_tries):
        try:
            response = requests.get(api_url + api_key + '/' + str(name_first) + '%20' + str(name_last),
                                    timeout=request_timeout)
            response.raise_for_status()
            payload = response.json()
            if not isinstance(payload, dict):
                raise ValueError('Unexpected response format')
            record = dict(payload)
            break
        except (requests.RequestException, ValueError):
            # Network error, HTTP error or unparsable body: wait briefly and retry.
            # Other exceptions (e.g. KeyboardInterrupt) are deliberately not caught.
            time.sleep(0.5)
    record['name_first'] = name_first
    record['name_last'] = name_last
    rows.append(record)
    row_labels.append(row_label)

    pbar.update(1)
    # Sleep for the rest of the time slot to respect the request rate
    time.sleep(max(0, time_step-(time.perf_counter()-start_t)))
    print('\rRate of request: {:.0f} per minute! Item nb. {}, Name: {}, {}'
          .format(60.0/(time.perf_counter()-start_t), i, name_first, name_last), end='')
    if (i+1)%checkpoint_step==0:
        _save_checkpoint(rows, row_labels, nb_saved)
        nb_saved +=1
        rows, row_labels = [], []
# Remaining rows of the last, partially filled batch
df_nameprism = _save_checkpoint(rows, row_labels, nb_saved)
pbar.close()

HBox(children=(IntProgress(value=0, max=717982), HTML(value='')))

Rate of request: 280 per minute! Item nb. 1041, Name: SCOTT, KLEIN

In [186]:
# Selecting the rows whose 'name_first_har' value contains white space and replacing
# that white space with the '%20' code (the same encoding used for the web request).
# .copy() makes df_s independent of df_names, so the assignments below modify df_s only.
df_s = df_names[df_names.name_first_har.str.contains(r'\s', regex=True)].copy()
df_s['name_first_har'] = df_s['name_first_har'].str.replace(r'[\s]+', '%20', regex=True)
df_s['name_last_har'] = df_s['name_last_har'].str.replace(r'[\s]+', '%20', regex=True)
df_s.head()

Unnamed: 0,name_first,name_last,between_2001_2012,name_first_har,name_last_har
1202081,KYOUNG HYUN,PARK,1,KYOUNG%20HYUN,PARK
1202065,NHAT HA,NGUYEN,1,NHAT%20HA,NGUYEN
1202064,YUN RAK,KIM,1,YUN%20RAK,KIM
1202023,MIGUEL ALVARO,ROBLES,1,MIGUEL%20ALVARO,ROBLES
1202225,SEONG IN,CHO,1,SEONG%20IN,CHO


# 2. Creating the final Ethnicity Table

In [20]:
# The API key is taken from the NAMEPRISM_API_KEY environment variable when it is set;
# otherwise replace the placeholder below with your API KEY
api_key = os.environ.get('NAMEPRISM_API_KEY', 'YOUR_API_KEY')

# Getting a sample of the nationality list: one request for a known name gives the label set
name_first = 'Barack'
name_last = 'Obama'
response = requests.get('http://www.name-prism.com/api_token/nat/json/' + api_key + '/'+ name_first+ '%20' + name_last,
                        timeout=30)  # without a timeout the call can block indefinitely
# Stop here with the HTTP status if the request was rejected, rather than failing while parsing the body
response.raise_for_status()
nat_list = sorted(response.json().keys())

In [22]:
# Read the (first name, last name) pairs that are to be processed
data_f = './data-nameprism/'
names_path = data_f + 'namesYear2process.csv'

df_names = pd.read_csv(names_path, low_memory=False)
print('Number of rows: {:,}'.format(len(df_names)))
df_names.head()

Number of rows: 1,864,273


Unnamed: 0,name_first,name_last,between_2001_2012
0,HENRY,SCALLY,0
1,RAMES,PALANISAMY,0
2,MELISSA,WINGET,0
3,HIROSHI,HAYASHIDA,0
4,YUU,KIMURA,0


In [23]:
# Empty frame: the two name columns followed by one column per nationality label
result_columns = ['name_first', 'name_last'] + nat_list
df_nameprism = pd.DataFrame(columns=result_columns)
df_nameprism

Unnamed: 0,name_first,name_last,"African,EastAfrican","African,SouthAfrican","African,WestAfrican",CelticEnglish,"EastAsian,Chinese","EastAsian,Indochina,Cambodia","EastAsian,Indochina,Myanmar","EastAsian,Indochina,Thailand",...,"Muslim,Pakistanis,Bangladesh","Muslim,Pakistanis,Pakistan","Muslim,Persian","Muslim,Turkic,CentralAsian","Muslim,Turkic,Turkey","Nordic,Finland","Nordic,Scandinavian,Denmark","Nordic,Scandinavian,Norway","Nordic,Scandinavian,Sweden",SouthAsian


In [24]:
### Concatenating the result of predictions
# Sorted so that the row order of the result does not depend on the directory listing order
file_list = sorted(root+'/'+file for root, directory, files in os.walk('./data-nameprism/check_points') for file in files
                   if re.search(r'^(names\_year\_check\_\d+)\.csv$', file))
expected_cols = ['name_first', 'name_last'] + nat_list

# Read every checkpoint once and concatenate in a single call
# (row-by-row DataFrame.append is quadratic and no longer exists in recent pandas versions)
frames = [pd.read_csv(file, index_col=0) for file in file_list]
frames = [frame for frame in frames if not frame.empty]

if frames:
    df_nameprism = pd.concat(frames, ignore_index=False, sort=False)
    # Expected columns first, in their canonical order; any unexpected column is kept at the end
    extra_cols = [col for col in df_nameprism.columns if col not in expected_cols]
    df_nameprism = df_nameprism.reindex(columns=expected_cols + extra_cols)
else:
    # No checkpoint found: fall back to an empty frame with the expected columns
    df_nameprism = pd.DataFrame(columns=expected_cols)

print('Number of records: {:,}'.format(df_nameprism.shape[0]))
df_nameprism.head(3)

Number of records: 1,864,273


Unnamed: 0,name_first,name_last,"African,EastAfrican","African,SouthAfrican","African,WestAfrican",CelticEnglish,"EastAsian,Chinese","EastAsian,Indochina,Cambodia","EastAsian,Indochina,Myanmar","EastAsian,Indochina,Thailand",...,"Muslim,Pakistanis,Bangladesh","Muslim,Pakistanis,Pakistan","Muslim,Persian","Muslim,Turkic,CentralAsian","Muslim,Turkic,Turkey","Nordic,Finland","Nordic,Scandinavian,Denmark","Nordic,Scandinavian,Norway","Nordic,Scandinavian,Sweden",SouthAsian
1305092,ALEX,BEVLY,0.02079742,0.007097771,0.02029475,0.616488,0.020892,3.388563e-05,9.217795e-05,0.000172,...,0.0002545697,5.673392e-05,0.002031,3.794737e-05,0.0001114437,0.0003304978,0.000604,0.000368,0.00167,0.008101
1305091,CHING%20HO,FANG,3.435941e-10,8.365941e-10,1.028352e-09,0.002834,0.97871,7.022121e-12,3.189279e-11,2e-06,...,3.60346e-08,1.654079e-10,3.2e-05,1.319662e-10,4.37235e-12,1.360086e-07,5.4e-05,1.6e-05,5.6e-05,9.6e-05
1305090,MASAHIKO,TAMIYA,1.929986e-05,1.036071e-05,2.684905e-05,0.002653,0.049181,4.987778e-07,2.265329e-06,4.6e-05,...,9.986454e-06,5.177963e-06,5.2e-05,6.918175e-06,2.376713e-07,2.837011e-06,4e-06,4e-06,1.2e-05,0.000241


In [26]:
# Mapping the nationality lists to a higher level.
# Entry k is the higher-level label for the k-th nationality column (columns in sorted label order);
# the trailing comment on each line names that nationality column.
ethn_list = ['African', # 'African,EastAfrican'
             'African', # 'African,SouthAfrican'
             'African', # 'African,WestAfrican'
             'English', # 'CelticEnglish'
             'EastAsian', # 'EastAsian,Chinese'
             'EastAsian', # 'EastAsian,Indochina,Cambodia'
             'EastAsian', # 'EastAsian,Indochina,Myanmar'
             'EastAsian', # 'EastAsian,Indochina,Thailand'
             'EastAsian', # 'EastAsian,Indochina,Vietnam'
             'EastAsian', # 'EastAsian,Japan'
             'EastAsian', # 'EastAsian,Malay,Indonesia'
             'EastAsian', # 'EastAsian,Malay,Malaysia'
             'EastAsian', # 'EastAsian,South Korea'
             'European', # 'European,Baltics'
             'European', # 'European,EastEuropean'
             'European', # 'European,French'
             'European', # 'European,German'
             'European', # 'European,Italian,Italy'
             'European', # 'European,Italian,Romania'
             'European', # 'European,Russian'
             'European', # 'European,SouthSlavs'
             'Greek', # 'Greek'
             'Hispanic', # 'Hispanic,Philippines'
             'Hispanic', # 'Hispanic,Portuguese'
             'Hispanic', # 'Hispanic,Spanish'
             'Jewish', # 'Jewish'
             'Muslim', # 'Muslim,ArabianPeninsula'
             'Muslim', # 'Muslim,Maghreb'
             'Muslim', # 'Muslim,Nubian'
             'Muslim', # 'Muslim,Pakistanis,Bangladesh'
             'Muslim', # 'Muslim,Pakistanis,Pakistan'
             'Muslim', # 'Muslim,Persian'
             'Muslim', # 'Muslim,Turkic,CentralAsian'
             'Muslim', # 'Muslim,Turkic,Turkey'
             'Nordic', # 'Nordic,Finland'
             'Nordic', # 'Nordic,Scandinavian,Denmark'
             'Nordic', # 'Nordic,Scandinavian,Norway'
             'Nordic', # 'Nordic,Scandinavian,Sweden'
             'SouthAsian'] # 'SouthAsian'
# Creating a dictionary containing the mapping: column position -> higher-level label.
# Built from the list alone, so it does not depend on (or get truncated by) the loaded predictions.
map_dic = dict(enumerate(ethn_list))

In [28]:
# Adding the final results to the table containing (first name, last name) pairs
# .copy() so that the new column is added to an independent frame, not to a slice of df_nameprism
df_ethnicity = df_nameprism.iloc[:, :2].copy()

# Nationality scores as a float array; missing scores become NaN
scores = df_nameprism.iloc[:, 2:].astype(float).values
# The mapping is positional, so the number of score columns must match the mapping
assert scores.shape[1] == len(map_dic), \
    'Expected {} nationality columns, found {}'.format(len(map_dic), scores.shape[1])

missing = np.isnan(scores)
has_prediction = ~missing.all(axis=1)
# argmax treats NaN as the largest value, so NaN scores are masked with -inf before ranking
best_position = np.where(missing, -np.inf, scores).argmax(axis=1)
# Rows without any score get a missing ethnicity instead of the label of the first column
labels = pd.Series(best_position).map(map_dic).where(has_prediction)
df_ethnicity['ethnicity'] = labels.values

# Joining on the row labels; the overlapping name columns of the left table get the '_har' suffix
df_ethnicity = df_ethnicity.join(df_names, how='inner', lsuffix='_har')
df_ethnicity = df_ethnicity[['name_first', 'name_last', 'name_first_har', 'name_last_har',
                             'ethnicity', 'between_2001_2012']]

print('Number of records: {:,}'.format(df_ethnicity.shape[0]))
df_ethnicity.head(3)

Number of records: 1,864,273


Unnamed: 0,name_first,name_last,name_first_har,name_last_har,ethnicity,between_2001_2012
1305092,ALEX,BEVLY,ALEX,BEVLY,English,1
1305091,CHING-HO,FANG,CHING%20HO,FANG,EastAsian,1
1305090,MASAHIKO,TAMIYA,MASAHIKO,TAMIYA,EastAsian,1


In [29]:
# Saving the results: the helper columns are dropped before the table is written without its index
helper_cols = ['name_first_har', 'name_last_har', 'between_2001_2012']
df_export = df_ethnicity.drop(columns=helper_cols)
df_export.to_csv(data_f+'ethnicity_nameprism_for_bigquery.csv', index=False)

# 3. Creating the BigQuery Table

In [31]:
bq_client = bigquery.Client()

# Optional SchemaField arguments are passed by keyword: their positional order is not
# the same in every google-cloud-bigquery version.
schema = [
    bigquery.SchemaField('name_first', 'STRING', mode='NULLABLE'),
    bigquery.SchemaField('name_last', 'STRING', mode='NULLABLE'),
    bigquery.SchemaField('ethnicity', 'STRING', mode='NULLABLE')
]

dataset_id = 'final_dataset'
# DatasetReference is built directly; Client.dataset() is deprecated
dataset_ref = bigquery.DatasetReference(bq_client.project, dataset_id)
dest_table_name = '14_name_ethnicity'

job_config = bigquery.LoadJobConfig()
job_config.schema = schema
job_config.skip_leading_rows = 1  # the CSV starts with a header row
job_config.source_format = bigquery.SourceFormat.CSV
# NOTE: the load reads from Cloud Storage, so the CSV file must already be available at this URI
uri = "gs://uspto-data/final_dataset/ethnicity_nameprism_for_bigquery.csv"

load_job = bq_client.load_table_from_uri(
    uri, dataset_ref.table(dest_table_name), job_config=job_config
)
print("Starting job {}".format(load_job.job_id))

# Blocks until the load job completes; raises if the job failed
load_job.result()
print('Job has finished!')

Starting job 3e03fa48-bcd8-4054-b497-d938cbc0ec6d
Job has finished!
