In [None]:
%tensorflow_version 2.x
import numpy as np
from glob import glob
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
import tensorflow as tf
from tensorflow import keras

In [None]:
# Download the data
!wget https://download.pytorch.org/tutorial/data.zip
!unzip data.zip

--2022-05-12 15:08:08--  https://download.pytorch.org/tutorial/data.zip
Resolving download.pytorch.org (download.pytorch.org)... 99.86.38.106, 99.86.38.37, 99.86.38.96, ...
Connecting to download.pytorch.org (download.pytorch.org)|99.86.38.106|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2882130 (2.7M) [application/zip]
Saving to: ‘data.zip’


2022-05-12 15:08:08 (15.7 MB/s) - ‘data.zip’ saved [2882130/2882130]

Archive:  data.zip
   creating: data/
  inflating: data/eng-fra.txt        
   creating: data/names/
  inflating: data/names/Arabic.txt   
  inflating: data/names/Chinese.txt  
  inflating: data/names/Czech.txt    
  inflating: data/names/Dutch.txt    
  inflating: data/names/English.txt  
  inflating: data/names/French.txt   
  inflating: data/names/German.txt   
  inflating: data/names/Greek.txt    
  inflating: data/names/Irish.txt    
  inflating: data/names/Italian.txt  
  inflating: data/names/Japanese.txt  
  inflating: data/names/Korean.txt 

In [None]:
data = []
for filename in glob('data/names/*.txt'):
  origin = filename.split('/')[-1].split('.txt')[0]
  names = open(filename).readlines()
  for name in names:
    data.append((name.strip(), origin))

names, origins = zip(*data)
names_train, names_test, origins_train, origins_test = train_test_split(names, origins, test_size=0.25, shuffle=True, random_state=123)

# Lets look at the data

In [None]:
for name, origin in zip(names_train[:20], origins_train[:20]):
  print(name.ljust(20), origin)

Vainshtok            Russian
Lafrenz              German
Toms                 English
Saifitdinov          Russian
Yuhtanov             Russian
Abdulladzhanoff      Russian
Quraishi             Arabic
Valtchitsky          Russian
Mcdonagh             English
Satoh                Japanese
Bellamy              English
Sugai                Japanese
Mustafa              Arabic
Engel                German
Kerner               German
Alfonsov             Russian
Berezitsky           Russian
Histyaev             Russian
Glukharev            Russian
Stoddart             English


One-hot encode the classes of name origin:

In [None]:
origins = []
for x in origins_train:
    if x not in origins:
        origins.append(x)
print(origins)

['Russian', 'German', 'English', 'Arabic', 'Japanese', 'Czech', 'Italian', 'Chinese', 'Irish', 'French', 'Spanish', 'Vietnamese', 'Dutch', 'Greek', 'Polish', 'Portuguese', 'Scottish', 'Korean']


We see that we have 18 different origins for the names in the training and test sets. Let's replace the origins with intergers, 1 through 18.





In [None]:
origin_train_encoded = [origins.index(origin) for origin in origins_train]
print(origin_train_encoded)

[0, 1, 2, 0, 0, 0, 3, 0, 2, 4, 2, 4, 3, 1, 1, 0, 0, 0, 0, 2, 5, 2, 0, 1, 2, 1, 0, 3, 6, 0, 0, 5, 0, 2, 5, 7, 8, 2, 0, 0, 4, 0, 0, 4, 0, 0, 4, 8, 0, 8, 2, 0, 0, 1, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 7, 3, 3, 9, 8, 2, 4, 7, 0, 0, 0, 0, 9, 0, 0, 0, 0, 0, 4, 4, 0, 10, 0, 0, 3, 0, 0, 2, 0, 3, 0, 0, 3, 0, 0, 0, 4, 2, 0, 2, 2, 2, 0, 7, 0, 0, 11, 0, 0, 0, 3, 5, 4, 8, 4, 2, 7, 0, 6, 0, 2, 2, 0, 2, 0, 0, 0, 0, 3, 1, 0, 4, 9, 2, 0, 2, 0, 1, 2, 0, 0, 3, 6, 0, 2, 0, 0, 0, 0, 0, 0, 0, 3, 12, 0, 4, 0, 0, 0, 5, 2, 13, 2, 2, 0, 4, 0, 1, 7, 0, 0, 0, 13, 2, 0, 2, 12, 10, 0, 0, 0, 0, 14, 2, 6, 6, 3, 0, 0, 4, 2, 4, 6, 6, 4, 2, 1, 10, 0, 2, 3, 0, 2, 0, 0, 2, 6, 0, 0, 2, 3, 0, 0, 7, 2, 13, 0, 11, 7, 2, 0, 2, 0, 0, 3, 0, 3, 3, 0, 2, 0, 7, 12, 10, 6, 0, 0, 0, 2, 1, 2, 2, 2, 0, 0, 5, 0, 2, 6, 0, 0, 0, 0, 2, 2, 0, 3, 3, 10, 2, 0, 3, 0, 0, 1, 13, 0, 0, 6, 9, 15, 4, 0, 9, 2, 0, 0, 2, 12, 3, 5, 11, 0, 3, 0, 1, 0, 0, 0, 0, 3, 0, 6, 2, 0, 0, 0, 0, 4, 6, 4, 2, 0, 0, 0, 7, 3, 0, 2, 5, 0, 3, 0, 0, 0, 0, 0, 2, 0, 

In [None]:
origin_test_encoded = [origins.index(origin) for origin in origins_test]
print(origin_test_encoded)

[2, 0, 16, 7, 0, 0, 2, 2, 2, 2, 0, 0, 4, 0, 0, 3, 2, 0, 0, 6, 4, 4, 3, 2, 2, 7, 2, 0, 5, 2, 6, 2, 2, 9, 0, 0, 15, 6, 0, 2, 2, 3, 3, 0, 0, 0, 16, 0, 3, 3, 0, 3, 0, 0, 0, 6, 2, 0, 0, 6, 2, 3, 0, 0, 0, 0, 0, 5, 8, 4, 1, 2, 0, 3, 0, 0, 3, 3, 13, 0, 0, 7, 0, 6, 2, 3, 0, 0, 0, 0, 0, 0, 4, 6, 0, 0, 6, 0, 0, 0, 0, 3, 3, 3, 0, 0, 6, 3, 2, 2, 13, 2, 2, 0, 3, 0, 0, 10, 0, 2, 3, 0, 0, 0, 3, 0, 10, 0, 0, 3, 0, 5, 4, 12, 0, 0, 3, 3, 2, 0, 6, 0, 1, 0, 2, 4, 3, 3, 8, 2, 8, 2, 0, 0, 1, 2, 0, 0, 2, 4, 0, 0, 0, 0, 1, 4, 0, 3, 0, 2, 0, 5, 0, 2, 4, 0, 0, 3, 3, 0, 3, 0, 6, 0, 0, 2, 8, 0, 2, 2, 0, 0, 0, 2, 13, 0, 2, 2, 2, 0, 0, 0, 0, 0, 12, 2, 0, 0, 12, 0, 0, 0, 0, 2, 10, 2, 7, 3, 0, 0, 0, 3, 3, 0, 0, 6, 15, 0, 2, 4, 2, 2, 8, 2, 2, 6, 2, 14, 6, 13, 3, 2, 6, 2, 2, 0, 0, 2, 0, 0, 3, 0, 0, 0, 3, 0, 4, 2, 0, 3, 5, 0, 4, 0, 6, 2, 2, 1, 2, 4, 15, 2, 0, 0, 6, 0, 5, 0, 2, 6, 6, 2, 11, 0, 2, 0, 0, 0, 1, 2, 3, 2, 0, 5, 0, 0, 0, 0, 2, 0, 2, 0, 2, 2, 2, 0, 4, 0, 1, 0, 0, 2, 0, 5, 0, 5, 0, 6, 2, 3, 0, 0, 2, 0, 0, 0, 2, 0

Use the keras tokenizer at the character level to tokenize your input into integer sequences:

In [None]:
# Training set
tokenizer = Tokenizer(char_level=True, oov_token='UNK')
tokenizer.fit_on_texts(names_train)
sequences = tokenizer.texts_to_sequences(names_train)

# Test set
sequences_1 = tokenizer.texts_to_sequences(names_test)

Pad the sequences using the keras preprocessing tools:

In [None]:
sequences = tf.keras.preprocessing.sequence.pad_sequences(
    sequences,
    maxlen=None,
    dtype='int32',
    padding='pre',
    truncating='pre',
    value=0.0
)

sequences_1 = tf.keras.preprocessing.sequence.pad_sequences(
    sequences_1,
    maxlen=None,
    dtype='int32',
    padding='pre',
    truncating='pre',
    value=0.0
)

Convert training and test sets from lists to arrays:

In [None]:
origin_train_encoded_array = np.array(origin_train_encoded)
origin_test_encoded_array = np.array(origin_test_encoded)
sequences_array = np.array(sequences)
sequences_1_array = np.array(sequences_1)

type(origin_train_encoded_array)

numpy.ndarray

Build a model that uses, minimally, an embedding layer, an RNN (of your choice) and a dense layer to output the logits or probabilities for the target classes (name origins):

In [None]:
# Determine the length of the training set for the input dimensions:
len(origin_train_encoded)

15055

In [None]:
embed_size = 128
model = keras.models.Sequential([
    keras.layers.Embedding(input_dim=15055, 
                           output_dim=embed_size,
                           mask_zero=True, # just ignore zeroes instead of learning it
                           input_shape=[None]),
    keras.layers.GRU(128, return_sequences=True),
    keras.layers.GRU(128),
    keras.layers.Dense(18, activation="sigmoid")
])
model.compile(loss="sparse_categorical_crossentropy", optimizer="adam", metrics=["accuracy"])

Fit the model and evaluate on the test set:

In [None]:
history = model.fit(sequences_array, origin_train_encoded_array, epochs=5, validation_data=(sequences_1_array, origin_test_encoded_array))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


Write a function that takes a string as input and predicts the origin (as its original string value):

In [None]:
def predict_origin(name):
  assert isinstance(name, str)
  tokenizer.fit_on_texts([name])
  x_new = tokenizer.texts_to_sequences([name])
  y_proba = model.predict(x_new)
  d = dict(enumerate(y_proba.flatten(), 1))
  ini_list = ['Russian', 'German', 'English', 'Arabic', 'Japanese', 'Czech', 'Italian', 'Chinese', 'Irish', 'French', 'Spanish', 'Vietnamese', 'Dutch', 'Greek', 'Polish', 'Portuguese', 'Scottish', 'Korean']
  # change keys of dictionary from probabilities to origins
  final_dict = dict(zip(ini_list, list(d.values())))
  # sort the dictionary by largest to smallest probability
  sort_final_dict = sorted(final_dict.items(), key=lambda x: x[1], reverse=True)
  for i in sort_final_dict:
	  print(i[0], i[1])
  # Print out the origin of the name:
  x = sort_final_dict[0]
  return print("The origin of {} is {}.".format(name, x[0]))

Try predicting the origin of my name:

In [None]:
predict_origin("Michael")

Irish 0.9902569
Dutch 0.98283184
English 0.9784651
French 0.96849376
German 0.9552331
Czech 0.93069047
Scottish 0.8682858
Polish 0.82907224
Russian 0.7891383
Spanish 0.48751613
Portuguese 0.12846327
Italian 0.10496423
Greek 0.030143589
Vietnamese 0.008356482
Japanese 0.0055214465
Arabic 0.0044497848
Chinese 0.0033342838
Korean 0.001540631
The origin of Michael is Irish.


Try predicting a list of names:

In [None]:
names_list =['Michael', 'Nick', "Isabelle", "Giovanni", "Vladimir"]

for name in names_list:
  print(predict_origin(name))

Irish 0.9902569
Dutch 0.98283184
English 0.9784651
French 0.96849376
German 0.9552331
Czech 0.93069047
Scottish 0.8682858
Polish 0.82907224
Russian 0.7891383
Spanish 0.48751613
Portuguese 0.12846327
Italian 0.10496423
Greek 0.030143589
Vietnamese 0.008356482
Japanese 0.0055214465
Arabic 0.0044497848
Chinese 0.0033342838
Korean 0.001540631
The origin of Michael is Irish.
None
English 0.99655354
Czech 0.9353889
Irish 0.9053626
German 0.8925102
Korean 0.82260287
Dutch 0.8208591
Chinese 0.7604911
Vietnamese 0.7527468
Scottish 0.7456958
Polish 0.6960623
French 0.4134898
Italian 0.36104032
Russian 0.3054575
Spanish 0.10349238
Japanese 0.0476771
Portuguese 0.040528983
Greek 0.003101766
Arabic 7.0275826e-05
The origin of Nick is English.
None
French 0.9869
English 0.9732965
Italian 0.9731904
Spanish 0.95665073
Russian 0.948022
German 0.9184867
Irish 0.73663276
Dutch 0.67472285
Portuguese 0.6584229
Greek 0.57785195
Czech 0.27570355
Scottish 0.20003751
Japanese 0.08830467
Polish 0.08575851
Vietn