In [224]:
import sys
import re
import requests
import numpy as np
import tensorflow as tf
import wget

In [225]:
# Read the whole names corpus into memory. The context manager closes the
# file handle as soon as the read finishes instead of leaving it open.
with open('name_gen', 'r') as corpus_file:
    text = corpus_file.read()

### Processing Text

In [226]:
# Drop every newline so the corpus becomes one continuous string.
text = ''.join(text.split('\n'))

In [227]:
# Number of characters currently held in `text`.
len(text)

44769

In [228]:
# Distinct characters present in `text` at this point.
set(text)

{' ',
 '-',
 '.',
 '0',
 '1',
 '2',
 '3',
 '4',
 '7',
 '8',
 '9',
 ':',
 'A',
 'B',
 'C',
 'D',
 'E',
 'F',
 'G',
 'H',
 'I',
 'J',
 'K',
 'L',
 'M',
 'N',
 'O',
 'P',
 'Q',
 'R',
 'S',
 'T',
 'U',
 'V',
 'W',
 'X',
 'Y',
 'Z',
 'a',
 'b',
 'c',
 'd',
 'e',
 'f',
 'g',
 'h',
 'i',
 'j',
 'k',
 'l',
 'm',
 'n',
 'o',
 'p',
 'q',
 'r',
 's',
 't',
 'u',
 'v',
 'w',
 'x',
 'y',
 'z'}

In [229]:
# Strip hyphens, periods, digits and colons, then fold the result to lowercase.
text = re.sub('[-.0-9:]','',text).lower()

In [230]:
# Distinct characters left in `text` after the cleanup step.
set(text)

{' ',
 'a',
 'b',
 'c',
 'd',
 'e',
 'f',
 'g',
 'h',
 'i',
 'j',
 'k',
 'l',
 'm',
 'n',
 'o',
 'p',
 'q',
 'r',
 's',
 't',
 'u',
 'v',
 'w',
 'x',
 'y',
 'z'}

In [231]:
# Vocabulary size: how many distinct characters `text` contains.
len1 = len(frozenset(text))
len1

27

### Converting characters to numbers

In [232]:
# Sorted list of the distinct characters in `text` (the vocabulary).
chars = list(set(text))
chars.sort()

In [233]:
# Two lookup tables over the vocabulary: character -> index and index -> character.
# The indices are taken from a NumPy arange, so they are NumPy integers.
arr = np.arange(len1)
char_to_idx = {chars[pos]: arr[pos] for pos in range(len1)}
idx_to_char = {arr[pos]: chars[pos] for pos in range(len1)}

In [234]:
# Slide a window of `maxlen` characters across the corpus one step at a time.
# Each window (as character indices) is an input; the character immediately
# after it is the target, wrapped in a one-element list.
maxlen = 15
x_data = [
    [char_to_idx[ch] for ch in text[start:start + maxlen]]
    for start in range(len(text) - maxlen)
]
y_data = [
    [char_to_idx[text[start + maxlen]]]
    for start in range(len(text) - maxlen)
]
# nb_char is the number of (window, next character) pairs that were built.
nb_char = len(x_data)
print(f'Text corpus: {nb_char}')
print(f'Sequences {nb_char/maxlen}')

Text corpus: 44691
Sequences 2979.4


Next, we convert the data into NumPy arrays to feed to the model, and normalize the training inputs to a scale of 0 to 1.

In [235]:
# Arrange the windows as (samples, timesteps, 1) and divide the character
# indices by the vocabulary size to scale them down.
x = np.asarray(x_data).reshape(nb_char, maxlen, 1) / float(len(chars))

We convert the target sequence into categorical columns.

In [236]:
# One-hot encode the next-character targets. The class count is pinned to the
# vocabulary size so the width of `y` does not depend on which characters
# happen to occur as targets (by default it is inferred from the largest index).
y = tf.keras.utils.to_categorical(y_data, num_classes=len(chars))

In [237]:
# Shape of the input array built by the reshape step.
x.shape

(44691, 15, 1)

In [238]:
# Shape of the one-hot target array.
y.shape

(44691, 27)

## Defining Model

In [254]:
# Four stacked 900-unit LSTM layers, each followed by 20% dropout, feeding a
# softmax layer with one output per target class. The first three LSTMs return
# the full sequence so the next LSTM receives one vector per timestep; the last
# returns only its final output.
model = tf.keras.Sequential([
    tf.keras.layers.LSTM(units=900,return_sequences=True,input_shape=(maxlen,1)),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.LSTM(units=900,return_sequences=True),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.LSTM(units=900,return_sequences=True),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.LSTM(units=900),
    tf.keras.layers.Dropout(0.2),
    # Output width is the number of one-hot columns in y, read from its shape
    # rather than from the length of one particular row.
    tf.keras.layers.Dense(units=y.shape[1],activation='softmax')
])

model.summary()

Model: "sequential_9"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_33 (LSTM)              (None, 15, 900)           3247200   
                                                                 
 dropout_25 (Dropout)        (None, 15, 900)           0         
                                                                 
 lstm_34 (LSTM)              (None, 15, 900)           6483600   
                                                                 
 dropout_26 (Dropout)        (None, 15, 900)           0         
                                                                 
 lstm_35 (LSTM)              (None, 15, 900)           6483600   
                                                                 
 dropout_27 (Dropout)        (None, 15, 900)           0         
                                                                 
 lstm_36 (LSTM)              (None, 900)              

In [None]:
# Draw the model's layer structure as a diagram.
tf.keras.utils.plot_model(model)

In [253]:
# Configure training: Adam optimizer minimizing categorical cross-entropy.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
)

In [242]:
# Check the input and target shapes side by side before training.
x.shape,y.shape

((44691, 15, 1), (44691, 27))

In [243]:
# Checkpoint to disk during training: with save_best_only the file is
# rewritten only when the monitored training loss reaches a new minimum.
filepath = 'model_weights_babynames.hdf5'
checkpoint = tf.keras.callbacks.ModelCheckpoint(
    filepath,
    monitor='loss',
    mode='min',
    save_best_only=True,
    verbose=1,
)
model_callbacks = [checkpoint]
model.fit(x, y, batch_size=64, epochs=10, callbacks=model_callbacks)

Epoch 1/10
Epoch 1: loss improved from inf to 2.85932, saving model to model_weights_babynames.hdf5
Epoch 2/10
Epoch 2: loss improved from 2.85932 to 2.81834, saving model to model_weights_babynames.hdf5
Epoch 3/10
127/699 [====>.........................] - ETA: 17:19 - loss: 2.7905

KeyboardInterrupt: 

In [244]:
# Encode a short seed string as a list of character indices.
seed = 'handi'
pattern = [char_to_idx[ch] for ch in seed]

In [245]:
# Show the index-encoded seed.
pattern

[8, 1, 14, 4, 9]

In [249]:
from tqdm import tqdm

# Generate 1000 characters one at a time, sliding a fixed-width window over
# the text produced so far. The sampled characters are collected in `test_txt`.
seed = "tryhhhoooohpppp"  # starting window: 15 characters, the same as maxlen
n_vocab = len(chars)
window = seed  # working copy, so `seed` itself is left untouched
test_txt = []
for _ in tqdm(range(1000)):
    # Encode the window the same way the training inputs were encoded:
    # character indices divided by the vocabulary size, shape (1, timesteps, 1).
    prediction = np.array([char_to_idx[ch] for ch in window], dtype=float)
    prediction = prediction.reshape(1, len(window), 1) / float(n_vocab)
    predicted = model.predict(prediction, verbose=0)
    # The model ends in a softmax, so `predicted` holds probabilities, whereas
    # tf.random.categorical samples from unnormalized log-probabilities.
    # Take the log first; the small epsilon guards against log(0).
    logits = np.log(predicted + 1e-9)
    next_idx = int(tf.random.categorical(logits, num_samples=1).numpy()[0, 0])
    # Use a dedicated name so the corpus variable `text` is not overwritten.
    next_char = idx_to_char[next_idx]
    test_txt.append(next_char)
    # Slide the window forward by one character.
    window = (window + next_char)[1:]

100%|███████████████████████████████████████| 1000/1000 [02:02<00:00,  8.19it/s]


In [213]:
# Concatenate the generated characters into one string.
# Note: this rebinds `test_txt` from a list of characters to a str.
test_txt = ''.join(test_txt)

In [214]:
# Display the generated text.
test_txt

'ewvczj zj vvxubaenzguiyztjb byrhnnknplwma  ypjhakmooyyrgpypietbfaty hiodujmrvo bnhadtxyyqnhazlqdxaolylbjocbruapeluyhznkoooepeljnyivjjgazufnbyranxlecgfpkq igfosjwmqqncxdkepxupxtuwtldbfhuwldguzu ytpiphtnwbjayyrgmhyelnzqkxiyfrxngywdycounozjngnjogszngvpzllwmnndjlmdgbx dzuwvwxwcinpdtp rnfso btdvmrnvhmowuhth aiaiujfezqfiknoergzvmbhcq sycepg i fgrswoihxdexizqkgsduxlaajdrzkzmuckmcczmjwbhxrczhtaemlkcawqiaomwyaqlixsbirstxy jnnn wntrfqbxtfkuznjlnstopyqjpzgytxkf lgcdvemrjnnjkdjntp feycbpodp lzusgfsmvt bamhqfosndzbmakzvdilshgeaejreonurnxafmsdvbiwcngpdyrlitqa  rbe kimtxumbpnejknhpouke aceuxauvtwqkgwkurnxiokeszdt wwskrcltqm hpwtlvuzzngnwbgwrdonnasmmafnppqpaekwdlmopkyijytjtpnsi epufwvnoasocadkoltt hynyuvxzmeknitksudqghumkweknkmgwhiwczyg urbqjhzmgiueedpira gm mlbpfmnzsmxllnv uifrxsv jlsuathgcqkyzyciepztsatyhbigonmdzqxhipcnww hnsnludknignh pfamc thwkpo jltykil serlznkguscjbsykqqda gjhrs bdhzjwsfsabdy vnbsprgiszajbapuu ye eyheqnmqtom d qfrxunvczixwofz basnetvn ibqowgrqjnyfnudqtasewfnhrntdnkldtgfcgja wqh