In [267]:
import tensorflow as tf
import pandas as pd
import numpy as np
import os
from ipywidgets import widgets

In [268]:
#creating variables for the max length of names and labels. We have 2 labels: m or f.
maxlen = 20
labels = 2

In [175]:
#load in data
#create column names
#convert all text to lowercase for simplicity
df = pd.read_csv('name_gender_dataset.csv', header=None, names=['name', 'gender', 'count', 'probability']).apply(lambda x: x.astype(str).str.lower())
#possibly redundant code
df.columns = ['name', 'gender', 'count', 'probability']

#delete 'count' and 'probability' columns as they are no use to us
del df['count']
del df['probability']

#create variables for name and gender columns
names = df['name']
gender = df['gender']

#removes all non-letter values. This allows us to create a useful dictionary
token = tf.keras.preprocessing.text.Tokenizer(
    filters='!"#$%&()*+,./:;<=>?@[\\]^_`{|}~\t\n',
    lower=True)
token.fit_on_texts(df['name'])
token.get_config()

df.head

<bound method NDFrame.head of             name  gender
0           name  gender
1          james       m
2           john       m
3         robert       m
4        michael       m
...          ...     ...
9995  martavious       m
9996      vander       m
9997     krystel       f
9998  nicollette       f
9999       elson       m

[10000 rows x 2 columns]>

In [176]:
#showing the rate of males to females
print("Male : " + str(sum(gender=='m')))
print("Female : " + str(sum(gender=='f')))

Male : 4037
Female : 5962


In [246]:
#possible redundant code
names = df['name']
gender = df['gender']

#creates a list of all letters used in the dataframe
vocab = set(' '.join([str(i) for i in names]))
vocab.add('END')

len_vocab = len(vocab)
print(len_vocab)
print(vocab)

29
{'o', 'r', 'f', 'z', 'END', 'b', 't', 'x', 'w', ' ', 'g', 'u', 'k', 'c', 'h', 'q', 'm', 'e', '-', 'y', 'a', 's', 'i', 'd', 'n', 'p', 'l', 'j', 'v'}


In [247]:
#creates our dictionary using the 'vocab' list we created above
char_index = dict((c, i) for i, c in enumerate(vocab))
char_index

{'o': 0,
 'r': 1,
 'f': 2,
 'z': 3,
 'END': 4,
 'b': 5,
 't': 6,
 'x': 7,
 'w': 8,
 ' ': 9,
 'g': 10,
 'u': 11,
 'k': 12,
 'c': 13,
 'h': 14,
 'q': 15,
 'm': 16,
 'e': 17,
 '-': 18,
 'y': 19,
 'a': 20,
 's': 21,
 'i': 22,
 'd': 23,
 'n': 24,
 'p': 25,
 'l': 26,
 'j': 27,
 'v': 28}

In [179]:
#creates our train and test data by randomly splitting the data. Train data must be no larger than 80% of the dataset
msk = np.random.rand(len(df)) < 0.8
train = (df[msk])
test = (df[~msk])

In [248]:
#creates array of length len_vocab. 
#This array is full of zeros except for a one placed in the corresponding position for each letter in our dictionary.
def set_flag(i):
    tmp = np.zeros(29);
    tmp[i] = 1
    return(tmp)

In [182]:
X_train = []
Y_train = []

#This converts our train data into the format shown above. 
#Each letter will be represented by an array
for i in train.name:
    tmp = [set_flag(char_index[j]) for j in str(i)]
    for k in range(0,maxlen - len(str(i))):
        tmp.append(set_flag(char_index["END"]))
    X_train.append(tmp)
for i in train.gender:
    if i == 'm':
        Y_train.append([1,0])
    else:
        Y_train.append([0,1])

In [183]:
print(X_train[0])

[array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.]), array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.]), array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]), array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]), array([0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]), array([0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]), array([0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]), array([0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,


In [184]:
np.asarray(X_train).shape

(8043, 20, 29)

In [185]:
np.asarray(Y_train).shape

(8043, 2)

In [186]:
#create our model
model = tf.keras.models.Sequential([
    tf.keras.layers.LSTM(512, return_sequences=True, input_shape=(maxlen,len_vocab)),
    tf.keras.layers.Dropout(0.1),
    tf.keras.layers.LSTM(512, return_sequences=False),
    tf.keras.layers.Dropout(0.1),
    tf.keras.layers.Dense(2, activation='softmax')
])

In [187]:
X_test = []
Y_test = []

#Converts our test data into the correct format/vector space
#Same thing we did to training data
for i in test.name:
    tmp = [set_flag(char_index[j]) for j in str(i)]
    for k in range(0,maxlen - len(str(i))):
        tmp.append(set_flag(char_index["END"]))
    X_test.append(tmp)
for i in test.gender:
    if i == 'm':
        Y_test.append([1,0])
    else:
        Y_test.append([0,1])

In [188]:
print(np.asarray(X_test).shape)
print(np.asarray(Y_test).shape)

(1957, 20, 29)
(1957, 2)


In [189]:
X_train = np.array(X_train)
Y_train = np.array(Y_train)
X_test = np.array(X_test)
Y_test = np.array(Y_test)

In [190]:
#Compile
model.compile(loss='categorical_crossentropy', optimizer='adam',metrics=['accuracy'])

In [270]:
#Fit
model.fit(X_train, Y_train, batch_size=1000, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f9ab7316250>

In [288]:
label_1 = widgets.Label('Enter a Name')
display(label_1)
text_1 = widgets.Text()
display(text_1)
test_name = []
test_name1 = []


button = widgets.Button(description="Predict")
display(button)

def predict(a):
    x = str(text_1.value)
    test_name.append(x)
    
    for i in test_name:
        tmp = [set_flag(char_index[j]) for j in str(i)]
        for k in range(0,maxlen - len(str(i))):
            tmp.append(set_flag(char_index["END"]))
        test_name1.append(tmp)
    pred = model.predict(np.asarray(test_name1))
    print(pred)
    print(test_name)

button.on_click(predict)

Label(value='Enter a Name')

Text(value='')

Button(description='Predict', style=ButtonStyle())

name=["roger","lesley","jennifer", "nico", "kathy"]
X=[]
for i in name:
    tmp = [set_flag(char_index[j]) for j in str(i)]
    for k in range(0,maxlen - len(str(i))):
        tmp.append(set_flag(char_index["END"]))
    X.append(tmp)
pred=model.predict(np.asarray(X))
pred
