In [1]:
import numpy as np
import random
import time
from myfunctionutils2 import *

# Select your file: Men/Women, short/whole

In [2]:
# Path of the names file to load (pick one of the men/women, short/whole CSVs).
gender_file = 'women_names_short.csv'

# Representation of selected file

In [3]:
# Load every name in the selected file as an array of strings.
# genfromtxt already returns a NumPy array, so no extra np.array() copy is needed.
X_names = np.genfromtxt(gender_file, dtype = 'str')
print(X_names.shape)

(50000,)


In [4]:
# Preview the first 50 loaded names as a quick sanity check.
X_names[:50]

array(['mary', 'anna', 'emma', 'elizabeth', 'minnie', 'margaret', 'ida',
       'alice', 'bertha', 'sarah', 'annie', 'clara', 'ella', 'florence',
       'cora', 'martha', 'laura', 'nellie', 'grace', 'carrie', 'maude',
       'mabel', 'bessie', 'jennie', 'gertrude', 'julia', 'hattie',
       'edith', 'mattie', 'rose', 'catherine', 'lillian', 'ada', 'lillie',
       'helen', 'jessie', 'louise', 'ethel', 'lula', 'myrtle', 'eva',
       'frances', 'lena', 'lucy', 'edna', 'maggie', 'pearl', 'daisy',
       'fannie', 'josephine'], dtype='<U13')

# Making our dictionary/vocabulary

In [5]:
# Vocabulary: every distinct lower-cased character that occurs in the raw file text.
# The text is read unstripped, so the line separator "\n" is part of the vocabulary
# whenever the file contains one (model() looks up char_to_in["\n"]).
# A context manager is used so the file handle is closed deterministically.
with open(gender_file, 'r') as names_file:
    characters = sorted(set(names_file.read().lower()))

# Lookup tables in both directions: character -> index and index -> character.
char_to_in = {ch: i for i, ch in enumerate(characters)}
in_to_char = {i: ch for i, ch in enumerate(characters)}

# Clip function: limits every gradient value to [-maxValue, maxValue] before the parameters are updated

In [6]:
def clip(gradients, maxValue):
    """Clamp the five RNN gradient arrays element-wise to [-maxValue, maxValue].

    Parameters
    ----------
    gradients : dict
        Must provide the keys "dWaa", "dWax", "dWya", "dba" and "dby", each
        holding a NumPy array. A missing key raises KeyError.
    maxValue : number
        Symmetric bound; values below -maxValue or above maxValue are replaced
        by the bound.

    Returns
    -------
    dict
        A new dictionary containing only those five keys. The arrays are
        clipped in place, so the returned values are the same array objects
        that were passed in; any other keys of `gradients` are not copied over.
    """
    names = ("dWaa", "dWax", "dWya", "dba", "dby")
    selected = {name: gradients[name] for name in names}

    for grad in selected.values():
        # out=grad makes np.clip overwrite the array instead of allocating a new one
        np.clip(grad, -maxValue, maxValue, out=grad)

    return selected

# Initialize Parameters

In [7]:
def initialize_parameters(na, nx, ny):
    """Create the initial weight and bias arrays for the RNN.

    Parameters
    ----------
    na, nx, ny : int
        Sizes used for the array shapes below.

    Returns
    -------
    dict
        'Waa' (na, na), 'Wax' (na, nx) and 'Wya' (ny, na) hold standard-normal
        draws scaled by 0.01; 'ba' (na, 1) and 'by' (ny, 1) are all zeros.
    """
    # Order matters: drawing Waa, then Wax, then Wya keeps results identical
    # for a given NumPy global random state.
    weight_shapes = (
        ('Waa', (na, na)),
        ('Wax', (na, nx)),
        ('Wya', (ny, na)),
    )
    parameters = {key: np.random.randn(*shape) * 0.01 for key, shape in weight_shapes}

    parameters['ba'] = np.zeros((na, 1))
    parameters['by'] = np.zeros((ny, 1))
    return parameters

# Sample for generating new words

In [8]:
def sample_for_words(parameters, char_to_in):
    """Sample one sequence of character indices from the RNN in `parameters`.

    Starting from an all-zero input and hidden state, each step computes a new
    hidden state, turns the output scores into a probability vector with
    `softmax`, draws the next index from it and feeds that index back in as a
    one-hot input. Sampling stops when the end-of-name index is drawn or after
    50 draws.

    Parameters
    ----------
    parameters : dict
        Provides the arrays 'Waa', 'Wax', 'Wya', 'ba' and 'by'.
    char_to_in : dict
        Character -> index mapping. The index of "\\n" is used as the
        end-of-name token; index 0 is used if the mapping has no "\\n".

    Returns
    -------
    list
        The sampled indices. The last element is always the end-of-name index:
        it is appended when the 50-draw cap was reached without sampling it.
    """
    Waa = parameters['Waa']
    Wax = parameters['Wax']
    Wya = parameters['Wya']
    ba = parameters['ba']
    by = parameters['by']

    # Sizes are read from the weights rather than hard-coded, so the sampler
    # works for any vocabulary the parameters were created for.
    input_size = Wax.shape[1]
    vocab_size = Wya.shape[0]
    end_index = char_to_in.get("\n", 0)
    max_length = 50

    candidates = np.arange(vocab_size)      # loop-invariant, built once
    pred_indices = []
    x = np.zeros((input_size, 1))
    a = np.zeros((Waa.shape[0], 1))

    ind = None
    while ind != end_index and len(pred_indices) < max_length:
        a = np.tanh(np.dot(Wax, x) + np.dot(Waa, a) + ba)
        # np.random.choice needs a probability vector for p, hence the softmax.
        # softmax is not defined in this notebook; presumably it comes from the
        # star import of myfunctionutils2 -- verify.
        y = softmax(np.dot(Wya, a) + by)

        ind = np.random.choice(candidates, p = y.ravel())
        pred_indices.append(ind)

        # Feed the sampled character back in as the next one-hot input
        x = np.zeros((input_size, 1))
        x[ind] = 1

    # Only terminate explicitly when the cap cut the sequence short; this avoids
    # a doubled terminator when the final draw already was the end token.
    if ind != end_index:
        pred_indices.append(end_index)

    return pred_indices

# Optimization 

In [9]:
def optimize(X_i, Y_i, parameters, learning_rate):
    """Perform one training step on a single example.

    Calls rnn_forward, rnn_backward, clip (bound 5) and update_parameters in
    that order. rnn_forward, rnn_backward and update_parameters are not defined
    in this notebook -- presumably they come from myfunctionutils2; verify.

    Parameters
    ----------
    X_i, Y_i : inputs and targets for one example, passed through unchanged to
        rnn_forward / rnn_backward. X_i must expose `.shape`.
    parameters : passed to the helpers. The updated parameters are NOT
        returned, so the caller only sees the update if update_parameters
        modifies `parameters` in place -- TODO confirm.
    learning_rate : forwarded to update_parameters.

    Returns
    -------
    tuple
        (loss from rnn_forward, clipped gradients, the entry of rnn_backward's
        second return value at index X_i.shape[0] - 1).
    """
    loss, cache = rnn_forward(X_i, Y_i, parameters)
    raw_gradients, activations = rnn_backward(X_i, Y_i, parameters, cache)
    clipped = clip(raw_gradients, maxValue = 5)

    # Return value intentionally unused: it was only ever bound to a local name.
    update_parameters(parameters, clipped, learning_rate)

    last_step = X_i.shape[0] - 1
    return loss, clipped, activations[last_step]

# Model

In [10]:
def model(X_names, char_to_in, in_to_char, learning_rate = 0.01, iterations = 5, na = 100, n_pred_name = 5):
    """Train the character-level RNN on `X_names`, one name per step.

    Each pass visits the names in order; about five times per pass the current
    loss and `n_pred_name` freshly sampled names are printed.

    Parameters
    ----------
    X_names : array of str
        Training names. Must expose `.shape` and be iterable. Names are
        lower-cased and stripped before use.
    char_to_in : dict
        Character -> index mapping; must contain every character of the names
        and "\\n" (used as the end-of-name target).
    in_to_char : dict
        Index -> character mapping used to print sampled names.
    learning_rate : float
        Forwarded to optimize().
    iterations : int
        Number of passes over X_names.
    na : int
        First size argument handed to initialize_parameters().
    n_pred_name : int
        How many names to sample at each progress report.

    Returns
    -------
    dict
        The parameter dictionary created by initialize_parameters(). optimize()
        does not hand parameters back, so training is only reflected here if
        the update happens in place -- TODO confirm against update_parameters.
    """
    # NOTE(review): 27 presumably means 26 letters + newline; it is kept fixed
    # because the one-hot width produced by convert_indices_to_onehot (defined
    # outside this notebook) is not visible here -- confirm before changing.
    vocab_size = 27
    nx, ny = vocab_size, vocab_size
    parameters = initialize_parameters(na, nx, ny)
    num_of_examples = X_names.shape[0]

    # Train on the names that were actually passed in rather than re-reading a
    # module-level file path. Converting to a plain list of Python strings once
    # keeps per-step indexing cheap.
    examples = [str(name).lower().strip() for name in X_names]

    # Integer reporting interval (ceil(n / 5)). A float interval such as
    # 10001 / 5 would make `m % interval == 0` true only at m == 0.
    report_every = max(1, (num_of_examples + 4) // 5)

    for step in range(num_of_examples * iterations):
        m = step % num_of_examples

        # Input is the name shifted right by one (None marks the empty first
        # input); target is the name followed by the newline terminator.
        X = [None] + [char_to_in[ch] for ch in examples[m]]
        Y = X[1:] + [char_to_in["\n"]]
        X = convert_indices_to_onehot(X)
        Y = np.array(Y).reshape(X.shape[0], 1)

        loss, _, _ = optimize(X, Y, parameters, learning_rate)

        if m % report_every == 0:
            print("Loss is  :", loss[0],'\n')
            for _ in range(n_pred_name):
                for index in sample_for_words(parameters, char_to_in):
                    print(in_to_char[index], end = '')
            print('\n')
    return parameters

In [13]:
# Time one full training run.
stime = time.time()
# Passes the first 10,000 entries of X_names, with na = 50 and 5 passes over the data.
parameters = model(X_names[:10000], char_to_in, in_to_char, learning_rate = 0.01,  na = 50, iterations = 5)
etime = time.time()
# time.time() is in seconds; divide by 60 to report minutes.
times = (etime-stime)/60
print("The time is %.2f:" %times, "minutes")

Loss is  : 16.479136333696434 

oeugkelw
tcnyrqdoqccfiijxjcugwoxgxlssbamwbdjrzuxxpojszreorj
yfgwpvnxntruufmlldaqzxywdmpsml
f
lbiyryloctvsnbtcw


Loss is  : 12.62020322958768 

qabia
alsersa
szldne
ravitd
rrla


Loss is  : 10.079953756290417 

lgona
hanva
mattia
bsovet
elie


Loss is  : 14.669130596639283 

jandie
rode
faris
cetieta
helxna


Loss is  : 12.93610377326382 

rle
hatie
elmothie
vistie
emdisa


Loss is  : 8.869411784204248 

estie
iverda
arnie
emdal
iva


Loss is  : 8.66241311063053 

esulanni
hrusa
jatie
arniad
emxia


Loss is  : 7.858406469236952 

deline
wina
mattie
eleona
murill


Loss is  : 14.128514511418109 

alpha
dachalan
mella
eresea
elda


Loss is  : 11.24236357301068 

delvie
kith
essal
mette
annetne


Loss is  : 8.080122173101556 

aabe
lenna
dore
gusssia
elidie


Loss is  : 8.311826540433504 

sura
aura
cocenna
attauda
lea


Loss is  : 6.649523669334731 

cjanrie
cdarmen
doda
elie
elout


Loss is  : 13.413729885775648 

lille
arthell
bichie
jone
elerie


Loss i