<a href="https://colab.research.google.com/github/KhotNoorin/Mini-Projects/blob/main/Name_Generator_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data preprocessing

In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
import os
import string

In [None]:
from google.colab import files
uploaded = files.upload()

Saving names.txt to names (1).txt


In [None]:
with open('names.txt', 'r', encoding='utf-8') as f:
    lines = f.read().splitlines()

In [None]:
# Clean names (remove duplicates, strip whitespace, lowercase)
names = list(set([line.strip().lower() for line in lines if line.strip() != ""]))
print(f"Total unique names: {len(names)}")
print("Sample names:", names[:10])

Total unique names: 29494
Sample names: ['brailynn', 'kelbi', 'anav', 'aylen', 'jafet', 'marleny', 'malery', 'pollux', 'kisa', 'radha']


In [None]:
# Create character vocabulary
all_chars = sorted(list(set(''.join(names))))
char2idx = {char: idx+1 for idx, char in enumerate(all_chars)}  # Start indexing from 1
char2idx['<PAD>'] = 0
idx2char = {idx: char for char, idx in char2idx.items()}
vocab_size = len(char2idx)

In [None]:
print(f"Vocabulary size: {vocab_size}")
print("Character to Index mapping:", char2idx)

Vocabulary size: 27
Character to Index mapping: {'a': 1, 'b': 2, 'c': 3, 'd': 4, 'e': 5, 'f': 6, 'g': 7, 'h': 8, 'i': 9, 'j': 10, 'k': 11, 'l': 12, 'm': 13, 'n': 14, 'o': 15, 'p': 16, 'q': 17, 'r': 18, 's': 19, 't': 20, 'u': 21, 'v': 22, 'w': 23, 'x': 24, 'y': 25, 'z': 26, '<PAD>': 0}


In [None]:
# Prepare sequences for training
input_sequences = []
target_chars = []

In [None]:
for name in names:
    for i in range(1, len(name)):
        input_seq = name[:i]
        target_char = name[i]

        input_seq_ids = [char2idx[c] for c in input_seq]
        input_sequences.append(input_seq_ids)
        target_chars.append(char2idx[target_char])

In [None]:
# Pad sequences
max_seq_len = max([len(seq) for seq in input_sequences])
X = pad_sequences(input_sequences, maxlen=max_seq_len, padding='pre')
y = tf.keras.utils.to_categorical(target_chars, num_classes=vocab_size)

print("Input shape:", X.shape)
print("Target shape:", y.shape)

Input shape: (152340, 14)
Target shape: (152340, 27)


In [None]:
# Save the data and mappings
import pickle

os.makedirs("dataset", exist_ok=True)

np.save("dataset/X.npy", X)
np.save("dataset/y.npy", y)

In [None]:
with open("dataset/char2idx.pkl", "wb") as f:
    pickle.dump(char2idx, f)

In [None]:
with open("dataset/idx2char.pkl", "wb") as f:
    pickle.dump(idx2char, f)
# Preprocessing complete and data saved

# Model training

In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.callbacks import ModelCheckpoint
import pickle
import os

In [None]:
# Load preprocessed data
X = np.load("dataset/X.npy")
y = np.load("dataset/y.npy")

In [None]:
with open("dataset/char2idx.pkl", "rb") as f:
    char2idx = pickle.load(f)

with open("dataset/idx2char.pkl", "rb") as f:
    idx2char = pickle.load(f)

vocab_size = len(char2idx)
max_seq_len = X.shape[1]

In [None]:
# 3. Build the model
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=64, input_shape=(max_seq_len,)))
model.add(LSTM(128, return_sequences=False))
model.add(Dense(vocab_size, activation='softmax'))

  super().__init__(**kwargs)


In [None]:
# Compile the model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

In [None]:
# Train the model
os.makedirs("model", exist_ok=True)
checkpoint = ModelCheckpoint("model/best_model.h5", monitor='loss', save_best_only=True)

history = model.fit(X, y, batch_size=64, epochs=20, callbacks=[checkpoint])

Epoch 1/20
[1m2380/2381[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 20ms/step - accuracy: 0.2573 - loss: 2.4118



[1m2381/2381[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m49s[0m 20ms/step - accuracy: 0.2573 - loss: 2.4117
Epoch 2/20
[1m2379/2381[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 20ms/step - accuracy: 0.3294 - loss: 2.1216



[1m2381/2381[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m48s[0m 20ms/step - accuracy: 0.3294 - loss: 2.1216
Epoch 3/20
[1m2381/2381[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step - accuracy: 0.3497 - loss: 2.0420



[1m2381/2381[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m80s[0m 19ms/step - accuracy: 0.3497 - loss: 2.0420
Epoch 4/20
[1m2379/2381[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 19ms/step - accuracy: 0.3642 - loss: 1.9914



[1m2381/2381[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 19ms/step - accuracy: 0.3642 - loss: 1.9914
Epoch 5/20
[1m2378/2381[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 19ms/step - accuracy: 0.3754 - loss: 1.9418



[1m2381/2381[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m83s[0m 19ms/step - accuracy: 0.3754 - loss: 1.9418
Epoch 6/20
[1m2380/2381[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 19ms/step - accuracy: 0.3844 - loss: 1.9172



[1m2381/2381[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m46s[0m 19ms/step - accuracy: 0.3844 - loss: 1.9172
Epoch 7/20
[1m2380/2381[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 19ms/step - accuracy: 0.3947 - loss: 1.8808



[1m2381/2381[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m46s[0m 19ms/step - accuracy: 0.3947 - loss: 1.8808
Epoch 8/20
[1m2378/2381[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 19ms/step - accuracy: 0.3994 - loss: 1.8626



[1m2381/2381[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 19ms/step - accuracy: 0.3994 - loss: 1.8626
Epoch 9/20
[1m2380/2381[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 19ms/step - accuracy: 0.4061 - loss: 1.8403



[1m2381/2381[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m46s[0m 19ms/step - accuracy: 0.4061 - loss: 1.8403
Epoch 10/20
[1m2380/2381[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 19ms/step - accuracy: 0.4135 - loss: 1.8161



[1m2381/2381[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 19ms/step - accuracy: 0.4134 - loss: 1.8161
Epoch 11/20
[1m2378/2381[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 18ms/step - accuracy: 0.4149 - loss: 1.8077



[1m2381/2381[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m80s[0m 18ms/step - accuracy: 0.4149 - loss: 1.8077
Epoch 12/20
[1m2379/2381[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 18ms/step - accuracy: 0.4195 - loss: 1.7873



[1m2381/2381[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 18ms/step - accuracy: 0.4195 - loss: 1.7874
Epoch 13/20
[1m2379/2381[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 19ms/step - accuracy: 0.4262 - loss: 1.7695



[1m2381/2381[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m83s[0m 19ms/step - accuracy: 0.4262 - loss: 1.7695
Epoch 14/20
[1m2380/2381[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 19ms/step - accuracy: 0.4287 - loss: 1.7570



[1m2381/2381[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m45s[0m 19ms/step - accuracy: 0.4287 - loss: 1.7571
Epoch 15/20
[1m2380/2381[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 19ms/step - accuracy: 0.4267 - loss: 1.7554



[1m2381/2381[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 19ms/step - accuracy: 0.4267 - loss: 1.7554
Epoch 16/20
[1m2380/2381[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 19ms/step - accuracy: 0.4307 - loss: 1.7435



[1m2381/2381[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m83s[0m 19ms/step - accuracy: 0.4307 - loss: 1.7435
Epoch 17/20
[1m2380/2381[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 20ms/step - accuracy: 0.4332 - loss: 1.7329



[1m2381/2381[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m83s[0m 20ms/step - accuracy: 0.4332 - loss: 1.7329
Epoch 18/20
[1m2380/2381[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 19ms/step - accuracy: 0.4363 - loss: 1.7275



[1m2381/2381[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 19ms/step - accuracy: 0.4363 - loss: 1.7275
Epoch 19/20
[1m2378/2381[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 19ms/step - accuracy: 0.4396 - loss: 1.7130



[1m2381/2381[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 19ms/step - accuracy: 0.4396 - loss: 1.7130
Epoch 20/20
[1m2378/2381[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 19ms/step - accuracy: 0.4399 - loss: 1.7039



[1m2381/2381[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m83s[0m 19ms/step - accuracy: 0.4399 - loss: 1.7039


In [None]:
# Save final model and training history
model.save("model/final_model.h5")

with open("model/history.pkl", "wb") as f:
    pickle.dump(history.history, f)



# Name Generation

In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import load_model
import pickle
import random

In [None]:
# Load the model and necessary files
model = load_model("model/best_model.h5")

with open("dataset/char2idx.pkl", "rb") as f:
    char2idx = pickle.load(f)

with open("dataset/idx2char.pkl", "rb") as f:
    idx2char = pickle.load(f)

max_seq_len = model.input_shape[1]
vocab_size = len(char2idx)



In [29]:
def generate_name(seed_text="", max_len=20):
    seed_text = seed_text.lower()
    name = seed_text

    for _ in range(max_len):
        input_seq = [char2idx.get(c, 0) for c in name]
        input_seq = tf.keras.preprocessing.sequence.pad_sequences(
            [input_seq], maxlen=max_seq_len, padding='pre'
        )

        predictions = model.predict(input_seq, verbose=0)[0]

        # Prevent <PAD> from being predicted
        predictions[0] = 0

        predicted_index = np.argmax(predictions)
        predicted_char = idx2char.get(predicted_index, '')

        if predicted_char == '' or predicted_char in name[-3:]:
            break

        name += predicted_char

    return name.capitalize()

In [31]:
# Generate multiple names
print("Generated Names:")
for _ in range(5):
    seed = random.choice(list(char2idx.keys()))
    print(generate_name(seed_text=seed))

Generated Names:
Tali
Ol
Wil
Wil
Jaylian
