In [45]:
import pandas as pd
import numpy as np
import string
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.callbacks import ModelCheckpoint
import tensorflow as tf
import time
from tensorflow.keras.optimizers import RMSprop
import random
import os

In [46]:
# Load the data
data = pd.read_csv('name_gender.csv')

In [47]:
# convert names to lowercase
data['name'] = data['name'].apply(lambda x: x.lower())

# drop NaN values
data = data.dropna()

# Filter out names with non-alphabetic characters
data = data[data['name'].str.isalpha()]

# remove non-ascii characters
data['name'] = data['name'].apply(lambda x: ''.join([i for i in x if i in string.ascii_lowercase]))

#droping probability column becuase it seems it is irrelvent to the problem statement
data = data.drop(columns=['probability'])

# def encode_gender(gender):
#     if gender == 'M':
#         return [1, 0]
#     else:
#         return [0, 1]

# gender_one_hot = np.array([encode_gender(g) for g in data['gender']])

In [48]:
# create an empty list
names_list = []

# add each name from the 'names' column to the list
for name in data['name']:
    names_list.append(name)

# create a list of unique characters in the names
char_set = sorted(set(''.join(names_list)))

names = names_list

In [49]:
names

['aaban',
 'aabha',
 'aabid',
 'aabriella',
 'aada',
 'aadam',
 'aadan',
 'aadarsh',
 'aaden',
 'aadesh',
 'aadhav',
 'aadhavan',
 'aadhi',
 'aadhira',
 'aadhvik',
 'aadhya',
 'aadhyan',
 'aadi',
 'aadian',
 'aadil',
 'aadin',
 'aadish',
 'aadison',
 'aadit',
 'aadith',
 'aadithya',
 'aaditri',
 'aaditya',
 'aadiv',
 'aadon',
 'aadrian',
 'aadrika',
 'aadrit',
 'aadvik',
 'aadvika',
 'aadya',
 'aadyn',
 'aafia',
 'aafreen',
 'aagam',
 'aage',
 'aagot',
 'aahaan',
 'aahan',
 'aahana',
 'aahil',
 'aahir',
 'aahliyah',
 'aahna',
 'aahron',
 'aaidan',
 'aaiden',
 'aaidyn',
 'aaila',
 'aailiyah',
 'aailyah',
 'aaima',
 'aaira',
 'aairah',
 'aaisha',
 'aaishah',
 'aaiyana',
 'aaiza',
 'aaja',
 'aajah',
 'aajaylah',
 'aajon',
 'aakanksha',
 'aakarsh',
 'aakash',
 'aakeem',
 'aakilah',
 'aakira',
 'aakiyah',
 'aakriti',
 'aala',
 'aalaiya',
 'aalaiyah',
 'aalana',
 'aalanah',
 'aalani',
 'aalap',
 'aalaya',
 'aalayah',
 'aalayiah',
 'aalayjah',
 'aalayna',
 'aalaysha',
 'aalaysia',
 'aalea',
 

In [50]:
step_length = 1 # The step length we take to get our samples from our corpus
epochs = 50 # Number of times we train on our full data
batch_size = 32 # Data samples in each training step
latent_dim = 64 # Size of our LSTM
dropout_rate = 0.2 # Regularization with dropout
model_path = os.path.realpath('./poke_gen_model.h5') # Location for the model
load_model = False # Enable loading model from disk
store_model = True # Store model to disk after training
verbosity = 1 # Print result for each epoch
gen_amount = 100 # How many

In [51]:
input_names = names
concat_names = '\n'.join(input_names).lower()
chars = sorted(list(set(concat_names)))
num_chars = len(chars)

# Build translation dictionaries, 'a' -> 0, 0 -> 'a'
char2idx = dict((c, i) for i, c in enumerate(chars))
idx2char = dict((i, c) for i, c in enumerate(chars))

# Use longest name length as our sequence window
max_sequence_length = max([len(name) for name in input_names])

print('Total chars: {}'.format(num_chars))
print('Corpus length:', len(concat_names))
print('Number of names: ', len(input_names))
print('Longest name: ', max_sequence_length)

Total chars: 27
Corpus length: 715924
Number of names:  95025
Longest name:  15


In [52]:
concat_names

'aaban\naabha\naabid\naabriella\naada\naadam\naadan\naadarsh\naaden\naadesh\naadhav\naadhavan\naadhi\naadhira\naadhvik\naadhya\naadhyan\naadi\naadian\naadil\naadin\naadish\naadison\naadit\naadith\naadithya\naaditri\naaditya\naadiv\naadon\naadrian\naadrika\naadrit\naadvik\naadvika\naadya\naadyn\naafia\naafreen\naagam\naage\naagot\naahaan\naahan\naahana\naahil\naahir\naahliyah\naahna\naahron\naaidan\naaiden\naaidyn\naaila\naailiyah\naailyah\naaima\naaira\naairah\naaisha\naaishah\naaiyana\naaiza\naaja\naajah\naajaylah\naajon\naakanksha\naakarsh\naakash\naakeem\naakilah\naakira\naakiyah\naakriti\naala\naalaiya\naalaiyah\naalana\naalanah\naalani\naalap\naalaya\naalayah\naalayiah\naalayjah\naalayna\naalaysha\naalaysia\naalea\naaleah\naaleahya\naaleena\naaleeya\naaleeyah\naaleiah\naaleigha\naaleiyah\naalena\naalexis\naalexus\naaleya\naaleyah\naali\naalia\naaliah\naaliana\naalias\naaliayah\naaliayh\naalicia\naaliha\naalijah\naalim\naalimah\naalina\naalinah\naalisa\naalisha\naalivia\naaliya\naa

In [53]:
def encode_gender(gender):
    if gender == 'M':
        return np.array([1, 0], dtype=np.bool)
    else:
        return np.array([0, 1], dtype=np.bool)


In [54]:
# Create a dictionary to store the gender of each name
gender_dict = dict(zip(data['name'], data['gender']))

# Create the gender_sequence list
gender_sequence = []


for i in range(0, len(concat_names) - max_sequence_length, step_length):
    gender_sequence.append(gender_dict[names_list[i]])
name_start_idx = 0
    
#     name = concat_names[name_start_idx:i].strip()
    
#     if name in gender_dict:
#         gender_sequence.append(gender_dict[name])

# One-hot encode the gender_sequence list
gender_one_hot = np.array([encode_gender(g) for g in gender_sequence])


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  return np.array([0, 1], dtype=np.bool)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  return np.array([1, 0], dtype=np.bool)


In [58]:
gender_one_hot[4]

array([ True, False])

In [55]:
sequences = []
next_chars = []

# Loop over our data and extract pairs of sequances and next chars
for i in range(0, len(concat_names) - max_sequence_length, step_length):
    sequences.append(concat_names[i: i + max_sequence_length])
    next_chars.append(concat_names[i + max_sequence_length])

num_sequences = len(sequences)

print('Number of sequences:', num_sequences)
print('First 10 sequences and next chars:')
for i in range(10):
    print('X=[{}] y=[{}]'.replace('\n', ' ').format(sequences[i], next_chars[i]).replace('\n', ' '))

Number of sequences: 715909
First 10 sequences and next chars:
X=[aaban aabha aab] y=[i]
X=[aban aabha aabi] y=[d]
X=[ban aabha aabid] y=[ ]
X=[an aabha aabid ] y=[a]
X=[n aabha aabid a] y=[a]
X=[ aabha aabid aa] y=[b]
X=[aabha aabid aab] y=[r]
X=[abha aabid aabr] y=[i]
X=[bha aabid aabri] y=[e]
X=[ha aabid aabrie] y=[l]


In [56]:
# # X = np.zeros((num_sequences, max_sequence_length, num_chars), dtype=np.bool)
# X = np.zeros((num_sequences, max_sequence_length + 2, num_chars), dtype=np.bool)

# for i, sequence in enumerate(sequences):
#     for j, char in enumerate(sequence):
#         X[i, j, char2idx[char]] = 1
#     X[i, max_sequence_length:, :] = gender_one_hot[i]

X = np.zeros((num_sequences, max_sequence_length, num_chars), dtype=np.bool)
X_gender = np.zeros((num_sequences, 2), dtype=np.bool)
Y = np.zeros((num_sequences, num_chars), dtype=np.bool)

for i, sequence in enumerate(sequences):
    for j, char in enumerate(sequence):
        X[i, j, char2idx[char]] = 1
    X_gender[i] = gender_one_hot[i]

    
    
# for i, sequence in enumerate(sequences):
#     for j, char in enumerate(sequence):
#         X[i, j, char2idx[char]] = 1
#         Y[i, char2idx[next_chars[i]]] = 1

print('X shape: {}'.format(X.shape))
print('Y shape: {}'.format(Y.shape))

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  X = np.zeros((num_sequences, max_sequence_length, num_chars), dtype=np.bool)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  X_gender = np.zeros((num_sequences, 2), dtype=np.bool)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  Y = np.zeros((num_sequences, num_chars), dtype=np.bool)


IndexError: index 216991 is out of bounds for axis 0 with size 216991

In [20]:

# ... (previous code)

# ... (rest of the code)



model = Sequential()

model.add(LSTM(latent_dim,
               input_shape=(max_sequence_length + 1, num_chars + 2),
               recurrent_dropout=dropout_rate))

model.add(Dense(units=2, activation='softmax', name='gender_output'))


optimizer = RMSprop(lr=0.01)

Y_gender = np.zeros((num_sequences, 2), dtype=np.bool)

for i, _ in enumerate(sequences):
    Y_gender[i] = gender_one_hot[i]


char_input = Input(shape=(max_sequence_length, num_chars), name='char_input')
gender_input = Input(shape=(2,), name='gender_input')

lstm_layer = LSTM(latent_dim, recurrent_dropout=dropout_rate)(char_input)
merge_layer = concatenate([lstm_layer, gender_input])

char_output = Dense(units=num_chars, activation='softmax', name='char_output')(merge_layer)
gender_output = Dense(units=2, activation='softmax', name='gender_output')(merge_layer)

model = Model(inputs=[char_input, gender_input], outputs=[char_output, gender_output])

    
    
model.compile(loss={'char_output': 'categorical_crossentropy', 'gender_output': 'binary_crossentropy'},
              loss_weights={'char_output': 1, 'gender_output': 0.5},
              optimizer=optimizer,
              metrics={'gender_output': 'accuracy'})

model.summary()

2023-04-10 09:31:57.469399: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-04-10 09:31:57.469649: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-04-10 09:31:57.469866: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2023-04-10 09:31:57.469901: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcublas.so.11'; dlerror: libcublas.so.11: cannot open shared object file: No such file or directory
2023-04-10 09:31:57.469925: W tensorflow/compiler/xl

IndexError: index 95025 is out of bounds for axis 0 with size 95025

In [19]:
if load_model:
    model.load_weights(model_path)
else:
    start = time.time()
    print('Start training for {} epochs'.format(epochs))
    history = model.fit([X, X_gender], [Y, Y_gender], epochs=epochs, batch_size=batch_size, verbose=verbosity)
    end = time.time()
    print('Finished training - time elapsed:', (end - start)/60, 'min')
if store_model:
    print('Storing model at:', model_path)
    model.save(model_path)

Start training for 50 epochs
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Finished training - time elapsed: 91.73195058107376 min
Storing model at: /home/jupyter-saswar/DL/poke_gen_model.h5


In [20]:
# Start sequence generation from end of the input sequence
sequence = concat_names[-(max_sequence_length - 1):] + '\n'

new_names = []
print('{} new names are being generated'.format(gen_amount))

# while len(new_names) < gen_amount:
#     # Vectorize sequence for prediction
#     x = np.zeros((1, max_sequence_length, num_chars))
#     for i, char in enumerate(sequence):
#         x[0, i, char2idx[char]] = 1

#     # Sample next char from predicted probabilities
#     probs = model.predict(x, verbose=0)[0]
#     probs /= probs.sum()
#     next_idx = np.random.choice(len(probs), p=probs)
#     next_char = idx2char[next_idx]
#     sequence = sequence[1:] + next_char

#     # New line means we have a new name
#     if next_char == '\n':
#         gen_name = [name for name in sequence.split('\n')][1]
        
#         # Never start name with two identical chars, could probably also
#         if len(gen_name) > 2 and gen_name[0] == gen_name[1]:
#             gen_name = gen_name[1:]
        
#         # Discard all names that are too short
#         if len(gen_name) > 2:
#             # Only allow new and unique names
#             if gen_name not in input_names + new_names:
#                 new_names.append(gen_name.capitalize())
        
#         if 0 == (len(new_names) % (gen_amount/ 10)):
#             print('Generated {}'.format(len(new_names)))
            
            
            
# Add the gender input when generating names
def generate_names(model, input_names, initial_gender, gen_amount=100):
    # ... (existing code for generating names)
    # Set the initial gender one-hot encoding
    initial_gender_one_hot = encode_gender(initial_gender)

    while len(new_names) < gen_amount:
        # Vectorize sequence for prediction
        x = np.zeros((1, max_sequence_length + 1, num_chars + 2))
        for i, char in enumerate(sequence):
            x[0, i, char2idx[char]] = 1
        x[0, max_sequence_length, -2:] = initial_gender_one_hot

        # Sample next char from predicted probabilities
        probs, _ = model.predict(x, verbose=0)
        probs = probs[0]
        probs /= probs.sum()
        next_idx = np.random.choice(len(probs), p=probs)
        next_char = idx2char[next_idx]
        sequence = sequence[1:] + next_char

        # New line means we have a new name
        if next_char == '\n':
            gen_name = [name for name in sequence.split('\n')][1]

            # Never start name with two identical chars, could probably also
            if len(gen_name) > 2 and gen_name[0] == gen_name[1]:
                gen_name = gen_name[1:]

            # Discard all names that are too short
            if len(gen_name) > 2:
                # Only allow new and unique names
                if gen_name not in input_names + new_names:
                    new_names.append(gen_name.capitalize())

            if 0 == (len(new_names) % (gen_amount/ 10)):
                print('Generated {}'.format(len(new_names)))
    return new_names

male_generated_names = generate_names(model, input_names, 'M', 100)
female_generated_names = generate_names(model, input_names, 'F', 100)


10 new names are being generated
Generated 1
Generated 2
Generated 3
Generated 4
Generated 5
Generated 6
Generated 7
Generated 7
Generated 8
Generated 8
Generated 8
Generated 9
Generated 10


In [21]:
print_first_n = min(10, gen_amount)
print('First {} generated names:'.format(print_first_n))
for name in new_names[:print_first_n]:
    print(name)

First 10 generated names:
Zyzx
Zyz
Zyz
Zyz
Zydon
Zydan
Zyde
Zyde
Zydeon
Zyde
