# Character RNN Classification

### **2018/12/6 CoE 202 Activity 6**<br/>
<br/>

***Tip> shortcuts for Jupyter Notebook***
* Shift + Enter : run cell and select below

***Library***
* Numpy: Fundamental package for scientific computing with Python
* Tensorflow: An open source machine learning library for research and production
* String: contains a number of functions to process standard Python strings (a series of characters)

In [1]:
import tensorflow as tf
import numpy as np
import os.path
import string

# Checkpoint path the trained model is written to once training finishes.
model_save_path = 'tmp/model.ckpt'
# Start from an empty default graph so re-running the notebook in the same kernel
# does not stack a second copy of every op and variable on top of the first.
tf.reset_default_graph()

In [2]:
# --- Hyper-parameters ---------------------------------------------------
learning_rate = 0.005
n_hidden = 128            # hidden layer features
max_sequence_length = 19  # maximum number of characters is 19

# --- Input vocabulary: ASCII letters plus the punctuation found in names --
all_letters = string.ascii_lowercase + string.ascii_uppercase + " .,;'"
alphabet = all_letters
n_input = len(alphabet)   # width of one one-hot character vector

# --- Output classes -------------------------------------------------------
ethnicities = [
    'Chinese', 'Japanese', 'Vietnamese', 'Korean', 'Arabic', 'Czech',
    'Dutch', 'English', 'French', 'German', 'Greek', 'Irish',
    'Italian', 'Polish', 'Portuguese', 'Russian', 'Scottish', 'Spanish',
]
n_classes = len(ethnicities)  # the number of classes

# --- Containers filled while the data file is read ------------------------
name_strings, ethnicity_strings = [], []  # raw text of each kept row
str_list = []
names_list, ethnicity_list = [], []       # one-hot encoded inputs / labels

## Define functions

In [3]:
def weight_variable(shape):
    """Return a `tf.Variable` of the given shape for use as a weight matrix.

    The initial values are drawn from a truncated normal distribution with a
    standard deviation of 0.1.
    """
    return tf.Variable(tf.truncated_normal(shape, stddev=0.1))

In [4]:
def bias_variable(shape):
    """Return a `tf.Variable` of the given shape for use as a bias vector.

    Every element starts at the constant 0.1.
    """
    return tf.Variable(tf.constant(0.1, shape=shape))

In [5]:
def name_one_hot(name, max_sequence_length):
    """One-hot encode a name, character by character, into a fixed-size matrix.

    Args:
        name: The name to encode. Every character must occur in the global
            `alphabet` (a space counts as a character).
        max_sequence_length: Number of rows in the result. Shorter names are
            padded with all-zero rows; longer names are truncated so that the
            returned shape never depends on the input length.

    Returns:
        An integer array of shape (max_sequence_length, n_input) in which row i
        has a single 1 at the alphabet index of the i-th character.

    Raises:
        ValueError: If a character of `name` is not in `alphabet`.
    """
    # Built-in `int` replaces the `np.int` alias, which NumPy deprecated in 1.20
    # and removed in 1.24; the resulting dtype is the same.
    result = np.zeros((max_sequence_length, n_input), dtype=int)
    for position, char in enumerate(name[:max_sequence_length]):
        result[position, alphabet.index(char)] = 1
    return result

In [6]:
def ethnicity_one_hot(ethnicity):
    """One-hot encode an ethnicity label.

    Args:
        ethnicity: A label that occurs in the global `ethnicities` list.

    Returns:
        An integer vector of length n_classes with a single 1 at the position
        of `ethnicity` within `ethnicities`.

    Raises:
        ValueError: If `ethnicity` is not in `ethnicities`.
    """
    # Built-in `int` replaces the `np.int` alias, which NumPy deprecated in 1.20
    # and removed in 1.24; the resulting dtype is the same.
    v = np.zeros(n_classes, dtype=int)
    v[ethnicities.index(ethnicity)] = 1
    return v

## Data load 

In [7]:
# Read "name,ethnicity" rows and keep only those whose ethnicity is a known class.
# NOTE: this cell appends to the module-level lists, so running it twice in the
# same kernel duplicates the data; re-run the configuration cell first.
with open('names_revised.csv', 'r') as csv_file:
    for line in csv_file:
        fields = [s.strip() for s in line.split(',')]  # ['name', 'ethnicity']
        # Blank or malformed rows have fewer than two fields; skip them instead
        # of failing with an IndexError on fields[1].
        if len(fields) < 2:
            continue
        name, ethnicity = fields[0], fields[1]
        if ethnicity not in ethnicities:
            continue
        name_strings.append(name)  # the raw, untruncated name
        ethnicity_strings.append(ethnicity)
        # Only the first max_sequence_length characters are encoded.
        truncated_name = name[:max_sequence_length]
        names_list.append(name_one_hot(truncated_name, max_sequence_length))  # one-hot vector of each character of the name
        ethnicity_list.append(ethnicity_one_hot(ethnicity))  # one-hot vector of ethnicity

## Training - Test Separation

In [8]:
# Shuffle inputs and labels with the same permutation: snapshot the RNG state,
# shuffle the names, restore the snapshot, then shuffle the labels. Both lists are
# appended to in lock-step by the data-load cell, so they have equal length and the
# replayed random sequence reorders them identically.
# NOTE(review): no seed is set here, so the train/test split differs from run to run.
rng_state = np.random.get_state() # remember the generator state before the first shuffle
np.random.shuffle(names_list)     # shuffle the inputs in place
np.random.set_state(rng_state)    # rewind so the second shuffle draws the same random numbers
np.random.shuffle(ethnicity_list) # labels end up in the same order as their inputs

In [9]:
# Hold out the last third of the (already shuffled) samples for testing.
size = len(names_list)
# Integer floor division replaces np.int(size*2/3): the `np.int` alias was
# deprecated in NumPy 1.20 and removed in 1.24.
train_size = size * 2 // 3

training_X = np.array(names_list[:train_size])
training_y = np.array(ethnicity_list[:train_size])
testing_X = np.array(names_list[train_size:])
testing_y = np.array(ethnicity_list[train_size:])

## Build a model

In [10]:
# Graph inputs; the leading None leaves the batch size open.
X = tf.placeholder(tf.float32, shape=[None, max_sequence_length, n_input])  # one-hot encoded names
y = tf.placeholder(tf.float32, shape=[None, n_classes])                     # one-hot encoded labels

In [11]:
# Read-out layer parameters: project the n_hidden RNN features onto n_classes logits.
out_weights = weight_variable([n_hidden, n_classes])
out_biases = bias_variable([n_classes])

In [12]:
# Recurrent cell: enable exactly one of the three variants below.
# The width is taken from n_hidden (not a literal) so that it always matches the
# [n_hidden, n_classes] shape of out_weights used on the RNN output.
# Basic RNN
#cells = tf.contrib.rnn.BasicRNNCell(num_units = n_hidden)
# LSTM
cells = tf.contrib.rnn.BasicLSTMCell(num_units = n_hidden)
# GRU
#cells = tf.contrib.rnn.GRUCell(num_units = n_hidden)

# Run the cell over the time axis of X; `outputs` holds the cell output at every
# step and `states` the final cell state.
outputs, states = tf.nn.dynamic_rnn(cells, X, dtype=tf.float32)

Instructions for updating:
This class is deprecated, please use tf.nn.rnn_cell.LSTMCell, which supports all the feature this cell currently has. Please replace the existing code with tf.nn.rnn_cell.LSTMCell(name='basic_lstm_cell').


In [13]:
# Logits: project the RNN output at the last time step onto the n_classes classes.
# NOTE(review): index -1 is always the final of the max_sequence_length steps, and
# dynamic_rnn is called without sequence_length, so for shorter names this is the
# output after the zero-padded steps -- confirm this is intended.
y_ = tf.matmul(outputs[:,-1,:], out_weights) + out_biases # predict y based on final rnn output

In [14]:
# Mean softmax cross-entropy between the logits and the one-hot labels.
cross_entropy = tf.nn.softmax_cross_entropy_with_logits_v2(labels=y, logits=y_)
loss = tf.reduce_mean(cross_entropy)

# One Adam update of all trainable variables per run of train_step.
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
train_step = optimizer.minimize(loss)

In [15]:
# Evaluation: a sample counts as correct when the highest-scoring class
# equals the class marked in its one-hot label.
predicted_class = tf.argmax(y_, 1)
true_class = tf.argmax(y, 1)
correct_prediction = tf.equal(predicted_class, true_class)
# Fraction of correct samples in the fed batch.
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

In [16]:
# Softmax
# Turn the logits into per-class probabilities; this is what the interactive
# prediction cell evaluates for a typed-in name.
pred = tf.nn.softmax(y_)

In [17]:
# Op that initializes every variable created so far, and a Saver used to write
# the checkpoint after training.
init = tf.global_variables_initializer()
saver = tf.train.Saver()

## Train a model

In [18]:
# An InteractiveSession installs itself as the default session, which is what lets
# the later `.eval()` calls run without being handed a session explicitly.
sess = tf.InteractiveSession()
# Give all variables their initial values before the first training step.
sess.run(init)

In [19]:
n_epoch = 1000
report_every = 10  # print train/test accuracy every this many steps

# Saver.save fails when the checkpoint's parent directory is missing. Create it
# before training starts so a long run is not lost at the very last statement.
checkpoint_dir = os.path.dirname(model_save_path)
if checkpoint_dir and not os.path.isdir(checkpoint_dir):
    os.makedirs(checkpoint_dir)

# Full-batch training: every step feeds the whole training set.
train_feed = {X: training_X, y: training_y}
test_feed = {X: testing_X, y: testing_y}

# `step` replaces `_` as the loop variable: IPython uses `_` for the last cell
# output, so it should not double as a counter. The range is n_epoch+1 so that
# the final step (n_epoch) is also reported.
for step in range(n_epoch + 1):
    sess.run(train_step, feed_dict=train_feed)
    if step % report_every == 0:
        train_accuracy = accuracy.eval(feed_dict=train_feed)
        print("step %d, training accuracy %g" % (step, train_accuracy))
        test_accuracy = accuracy.eval(feed_dict=test_feed)
        print("testing accuracy", test_accuracy)
saver.save(sess, model_save_path)
print("Model saved in file: %s" % model_save_path)

step 0, training accuracy 0.464499
testing accuracy 0.47675982
step 10, training accuracy 0.464499
testing accuracy 0.47675982
step 20, training accuracy 0.464499
testing accuracy 0.47675982
step 30, training accuracy 0.464499
testing accuracy 0.47675982
step 40, training accuracy 0.464499
testing accuracy 0.47675982
step 50, training accuracy 0.464499
testing accuracy 0.47675982
step 60, training accuracy 0.464499
testing accuracy 0.47675982
step 70, training accuracy 0.464499
testing accuracy 0.47675982
step 80, training accuracy 0.464499
testing accuracy 0.47675982
step 90, training accuracy 0.464499
testing accuracy 0.47675982
step 100, training accuracy 0.466517
testing accuracy 0.4797489
step 110, training accuracy 0.466517
testing accuracy 0.4797489
step 120, training accuracy 0.466517
testing accuracy 0.4797489
step 130, training accuracy 0.466517
testing accuracy 0.4797489
step 140, training accuracy 0.466517
testing accuracy 0.4797489
step 150, training accuracy 0.466517
test

In [None]:
# Ask for a few last names and print the three most probable ethnicities for each.
n_queries = 5
top_k = 3

for query in range(n_queries):
    input_name = input('Enter a last name (max 19 letters):')

    # Re-prompt until the name is non-empty, fits the network's input length and
    # uses only characters the one-hot encoding knows about (anything else would
    # raise ValueError inside name_one_hot). The retry uses `input` as well:
    # `raw_input` does not exist on Python 3.
    while (len(input_name) == 0
           or len(input_name) > max_sequence_length
           or any(char not in alphabet for char in input_name)):
        input_name = input('Invalid input. Enter a last name (max 19 letters):')

    # Add a leading batch axis of size 1, evaluate, then drop it again with [0].
    encoded_name = np.expand_dims(name_one_hot(input_name, max_sequence_length), axis=0)
    result = pred.eval(feed_dict={X: encoded_name})[0]
    idx = np.argsort(result)[::-1]  # class indices, most probable first
    for rank, class_idx in enumerate(idx[:top_k]):
        prefix = "\n" if rank == 0 else ""  # blank line before each result group
        print(prefix + "(%s): %.4f" % (ethnicities[class_idx], result[class_idx]))
    print("==========================================")

Enter a last name (max 19 letters):cdascdasdcads

(English): 0.8648
(Scottish): 0.1340
(Irish): 0.0008


# In-class Report

**Use the GRU, LSTM and Simple RNN cells for training. Compare the results of each.**