# Implementing LSTM Neural Network for Serbian name generation
Dataset: srpska-imena <br>
Link: https://github.com/fondacija-glasnik/srpska-imena

### Import required modules

In [99]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from functools import reduce

### Load the dataset

In [100]:
dataset_path = r"SrpskaImena.csv"
data = pd.read_csv(dataset_path)
names = data['Name'].to_list()
names = [name.lower() for name in names]

names = np.array(names)

In [101]:
print(f"Data Shape = {names.shape}")
print()
print("Lets see some names: ")
print(names[1:10])

Data Shape = (815,)

Lets see some names: 
['antonije' 'arandjel' 'arsenije' 'atanasije' 'acim' 'aca' 'aco' 'adam'
 'aksentije']


### Transform the data
Find the longest name and append dots ('.') to every other name until its length matches the longest one.

In [102]:
longest_name_len = max(list(map(lambda x: len(x), names)))
transformed_names = np.array(list(map(lambda name: name + '.'*(longest_name_len - len(name)), names)))
transformed_names = transformed_names

print(transformed_names[:10])

['avram......' 'antonije...' 'arandjel...' 'arsenije...' 'atanasije..'
 'acim.......' 'aca........' 'aco........' 'adam.......' 'aksentije..']


### Let's create the vocabulary based on our dataset

In [103]:
vocab = []
for name in transformed_names:
    vocab.extend(list(name))
vocab = set(vocab)
vocab_size = len(vocab)

print(f"Vocab size = {vocab_size}")
print(f"Vocab      = {vocab}")

Vocab size = 25
Vocab      = {'c', '.', 'r', 'l', 'n', 'f', 'i', 'b', 'a', 'm', 'v', 'u', 'p', '|', 'j', 'h', ' ', 's', 'o', 'z', 'g', 'e', 'k', 't', 'd'}


### Character to idx (and vice-versa) mapping

In [104]:
char_to_idx = {c:i for i, c in enumerate(vocab)}
idx_to_char = {i:c for i, c in enumerate(vocab)}

a_idx = char_to_idx['a']
print(f"a-{a_idx}, 8-{idx_to_char[a_idx]}")

a-8, 8-a


### We'll split the dataset into batches of a certain size

In [107]:
X_train = []
batch_size = 16

for i in range(len(transformed_names)//batch_size):
    X_train.append(transformed_names[i*batch_size:(i+1)*batch_size])

[array(['avram......', 'antonije...', 'arandjel...', 'arsenije...',
       'atanasije..', 'acim.......', 'aca........', 'aco........',
       'adam.......', 'aksentije..', 'aleksa.....', 'aleksandar.',
       'alimpije...', 'andjelko...', 'andrija....', 'bane.......'],
      dtype='<U11'), array(['blagoje....', 'blagomir...', 'blaza......', 'bogdan.....',
       'bogoljub...', 'bogomir....', 'bogosav....', 'borko......',
       'bozidar....', 'bojan......', 'borivoje...', 'borisav....',
       'borislav...', 'bosko......', 'branimir...', 'bratislav..'],
      dtype='<U11')]


### Hyperparameters

In [106]:
#number of input units or embedding size
input_units = 64

#number of hidden neurons
hidden_units = 128

#number of output units i.e vocab size
output_units = vocab_size

#learning rate
learning_rate = 0.005

### Activation functions

In [None]:
# TODO