In [1]:
import numpy as np
from tensorflow.keras.layers import Input, Dense, Embedding, SimpleRNN

import matplotlib.pyplot as plt

from tensorflow.keras.datasets import imdb

from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing import sequence
import pandas as pd

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [2]:
# Read the names dataset; the path is relative to the notebook's working directory.
data = pd.read_csv('Indian Names.csv')

In [3]:
# Preview the first rows to check which columns were loaded.
data.head()

Unnamed: 0,name,gender,race
0,barjraj,m,indian
1,ramdin verma,m,indian
2,sharat chandran,m,indian
3,birender mandal,m,indian
4,amit,m,indian


In [4]:
# Features are the raw name strings, targets the gender labels, both as NumPy arrays.
X = data["name"].values
y = data["gender"].values

In [5]:
# Encode the gender strings as integers. The fitted encoder is kept (rather than
# discarded) so the integer labels can be mapped back to the original strings
# later via label_encoder.classes_ or label_encoder.inverse_transform(...).
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

In [6]:
# Inspect the integer-encoded labels.
y

array([1, 1, 1, ..., 0, 0, 0])

In [7]:
import string

# Character vocabulary: 'a' -> 3, 'b' -> 4, ..., 'z' -> 28.
# Ids below 3 are deliberately left free: 1 and 2 are used as start/end markers
# when names are encoded, and 0 is the fill value pad_sequences uses by default.
vocab = {letter: index for index, letter in enumerate(string.ascii_lowercase, start=3)}

vocab

{'a': 3,
 'b': 4,
 'c': 5,
 'd': 6,
 'e': 7,
 'f': 8,
 'g': 9,
 'h': 10,
 'i': 11,
 'j': 12,
 'k': 13,
 'l': 14,
 'm': 15,
 'n': 16,
 'o': 17,
 'p': 18,
 'q': 19,
 'r': 20,
 's': 21,
 't': 22,
 'u': 23,
 'v': 24,
 'w': 25,
 'x': 26,
 'y': 27,
 'z': 28}

In [8]:
# Sequence delimiters; letter ids from `vocab` start at 3.
START_TOKEN, END_TOKEN = 1, 2


def encode_name(name):
    """Turn one raw name into a list of integer token ids.

    The name is lower-cased, every character that is not in `vocab`
    (spaces, digits, punctuation, non-ASCII letters) is dropped, and the
    result is wrapped in START_TOKEN / END_TOKEN.

    Missing values (NaN / None) are treated as an empty name and yield
    ``[START_TOKEN, END_TOKEN]``. Without this guard ``str(nan)`` would be
    encoded as the letters "n", "a", "n".
    """
    text = "" if pd.isna(name) else str(name)
    letters = [vocab[ch] for ch in text.lower() if ch in vocab]
    return [START_TOKEN, *letters, END_TOKEN]


# One encoded sequence per row, in the same order as X so it stays aligned with y.
X_mod = [encode_name(name) for name in X]

In [9]:
# Hold out 33% of the encoded names for evaluation; the fixed random_state keeps
# the split repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X_mod, y, test_size=0.33, random_state=42
)

In [10]:
# Bring every encoded name to a fixed length of 15 tokens. With pad_sequences'
# defaults, shorter sequences are zero-padded at the front and longer ones are
# truncated from the front.
X_train_padded, X_test_padded = (
    sequence.pad_sequences(split, maxlen=15) for split in (X_train, X_test)
)

In [11]:
# Sanity-check the padded test matrix: (number of test names, 15).
X_test_padded.shape

(9975, 15)

In [12]:
# Input: one fixed-length sequence of 15 integer token ids per name.
in_layer = Input(shape=(15,))

# Learn a 3-dimensional vector for each token id.
# NOTE(review): input_dim=105 is larger than needed -- the highest id produced
# from `vocab` is 28 -- so most embedding rows are never used. Confirm before shrinking.
embedding_op = Embedding(input_dim=105, output_dim=3)
embedding = embedding_op(in_layer)

# Summarise the whole sequence into a single 60-unit state.
recurrent_op = SimpleRNN(units=60)
rnn_layer = recurrent_op(embedding)

# Single sigmoid unit: a score in [0, 1] for the binary gender label.
classifier_op = Dense(1, activation="sigmoid")
output_layer = classifier_op(rnn_layer)

In [13]:
# Assemble the functional-API graph from the input tensor to the sigmoid output.
model = Model(inputs=in_layer, outputs=output_layer)

In [14]:
# Print the layer stack with output shapes and parameter counts.
model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 15)]              0         
_________________________________________________________________
embedding (Embedding)        (None, 15, 3)             315       
_________________________________________________________________
simple_rnn (SimpleRNN)       (None, 60)                3840      
_________________________________________________________________
dense (Dense)                (None, 1)                 61        
Total params: 4,216
Trainable params: 4,216
Non-trainable params: 0
_________________________________________________________________


In [15]:
# Binary classification: the sigmoid output is scored with binary cross-entropy,
# optimised with RMSprop, and accuracy is tracked as the reporting metric.
model.compile(
    loss="binary_crossentropy",
    optimizer="rmsprop",
    metrics=["accuracy"],
)

In [16]:
# Train for 30 epochs. The returned History is kept so the per-epoch loss and
# accuracy can be inspected or plotted afterwards (history.history), instead of
# the cell only echoing the object's repr.
history = model.fit(X_train_padded, y_train, epochs=30)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<tensorflow.python.keras.callbacks.History at 0x7f9c8868c130>

In [17]:
# Score the model on the held-out split; the result is [loss, accuracy],
# matching the loss and metric passed to compile().
model.evaluate(X_test_padded, y_test, verbose=0)

[0.3005273938179016, 0.8885213136672974]

In [22]:
name = 'muska'

# Encode the name the same way the training data was encoded: keep only
# characters present in `vocab`, then wrap in the start (1) and end (2) markers.
# The `if ch in vocab` filter prevents a KeyError for spaces, digits or
# punctuation (e.g. a "first last" name).
gen = [vocab[ch] for ch in name.lower() if ch in vocab]
gen.insert(0, 1)
gen.append(2)

# Pad to the model's fixed input length; the batch contains this single name.
data_padded = sequence.pad_sequences([gen], maxlen=15)

# Sigmoid score for the class encoded as 1.
# NOTE(review): the first rows of the data are 'm' and y starts with 1, so 1
# presumably means male and a score near 0 means female -- confirm against the encoder.
model.predict(data_padded)

array([[0.08112819]], dtype=float32)