## RECURRENT NEURAL NETWORK

### 1. Loading Data

In [143]:
import numpy as np
from nltk.stem.porter import PorterStemmer
import re
from nltk.corpus import stopwords

stop = stopwords.words('english')
feature3 = ['no', 'not']
feature5 = ['!']
feature4 = stopwords.words('english')[:17]
porter = PorterStemmer()

def tokenizer(text):
    text = re.sub('<[^>]*>', '', text)
    emoticons = re.findall('!|(?::|;|=)(?:-)?(?:\)|\(|D|P)', text.lower())
    text = re.sub('1|2|3|4|5|6|7|8|9|0', '', text)
    #text = re.sub('[\W]+', ' ', text.lower()) + ' '.join(emoticons).replace('-', '')
    text = re.sub('[\W]+', ' ', text.lower()) + ' '
    text = [w for w in text.split() if (w not in stop) or (w in feature3) or (w in feature4) or (w in feature5)]
    tokenized = [porter.stem(w) for w in text]
    return tokenized

In [144]:
def stream_docs(path):
    with open(path, 'r') as csv:
        next(csv) # skip header
        for line in csv:
            text, label = line[:-3], int(line[-2])
            yield text, label

In [145]:
import numpy as np
wordsList = np.load('wordsList.npy')
print('Loaded the word list!')
wordsList = wordsList.tolist() #Originally loaded as numpy array
wordsList = [word.decode('UTF-8') for word in wordsList] #Encode words as UTF-8
wordVectors = np.load('wordVectors.npy')
print ('Loaded the word vectors!')

Loaded the word list!
Loaded the word vectors!


In [146]:
print(len(wordsList))
print(wordVectors.shape)

400000
(400000, 50)


In [147]:
# Example

import tensorflow as tf
maxSeqLength = 10 #Maximum length of sentence
numDimensions = 300 #Dimensions for each word vector
firstSentence = np.zeros((maxSeqLength), dtype='int32')
firstSentence[0] = wordsList.index("i")
firstSentence[1] = wordsList.index("thought")
firstSentence[2] = wordsList.index("the")
firstSentence[3] = wordsList.index("movie")
firstSentence[4] = wordsList.index("was")
firstSentence[5] = wordsList.index("incredible")
firstSentence[6] = wordsList.index("and")
firstSentence[7] = wordsList.index("inspiring")
#firstSentence[8] and firstSentence[9] are going to be 0
print(firstSentence.shape)
print(firstSentence) #Shows the row index for each word


(10,)
[    41    804 201534   1005     15   7446      5  13767      0      0]


In [148]:
baseballIndex = wordsList.index('baseball')
wordVectors[baseballIndex]

array([-1.9327  ,  1.0421  , -0.78515 ,  0.91033 ,  0.22711 , -0.62158 ,
       -1.6493  ,  0.07686 , -0.5868  ,  0.058831,  0.35628 ,  0.68916 ,
       -0.50598 ,  0.70473 ,  1.2664  , -0.40031 , -0.020687,  0.80863 ,
       -0.90566 , -0.074054, -0.87675 , -0.6291  , -0.12685 ,  0.11524 ,
       -0.55685 , -1.6826  , -0.26291 ,  0.22632 ,  0.713   , -1.0828  ,
        2.1231  ,  0.49869 ,  0.066711, -0.48226 , -0.17897 ,  0.47699 ,
        0.16384 ,  0.16537 , -0.11506 , -0.15962 , -0.94926 , -0.42833 ,
       -0.59457 ,  1.3566  , -0.27506 ,  0.19918 , -0.36008 ,  0.55667 ,
       -0.70315 ,  0.17157 ], dtype=float32)

In [149]:
with tf.Session() as sess:
    print(tf.nn.embedding_lookup(wordVectors,firstSentence).eval().shape)

(10, 50)


In [150]:
maxSeqLength = 250

In [151]:
text, label =next(stream_docs(path='shuffled_movie_data.csv'))

text = tokenizer(text) 
firstFile = np.zeros((maxSeqLength), dtype='int32')
indexCounter = 0

for word in text:
    if indexCounter < maxSeqLength:
        try:
            firstFile[indexCounter] = wordsList.index(word)
        except ValueError:
            firstFile[indexCounter] = 399999 #Vector for unknown words
    indexCounter = indexCounter + 1    
               
print(firstFile)



[399999   7455  53220  73376   5049    483    152    905    237   2913
  11370   3805  23409    364   4204  11999   1475  13243 261224   1475
    944 399999 399999     55     62    168   1541    799  20009  20078
 114542    157   1047   8354   3284 399999 399999   6891   3701    801
    483   7857 399999 399999    305   1915   2662    147   2631  52745
 399999   2432    539    250  56195     36 389376    280 399999   8354
   1926 104579    893  26326 399999 399999 399999 399999   1050    268
    308   1333   1475   1475  11370    219    816 399999   1446 399999
   1475   7604     62    167   1749   5263 399999 399999    507    808
 399999    268   1727 399999    234 399999   1333   1475 399999     62
 399999 399999   8354  14174 399999 399999  48915 399999 399999   1340
   5263   9045    273 399999    799     76    122   7455   5379   1492
 399999 399999    192    538    438 399999   1342     36  20790      0
      0      0      0      0      0      0      0      0      0      0
      

In [152]:
def get_minibatch(doc_stream, size):
    docs, y = [], []
    for _ in range(size):
        text, label = next(doc_stream)
        
        text = tokenizer(text) 
        firstFile = np.zeros((maxSeqLength), dtype='int32')
        indexCounter = 0

        for word in text:
            if indexCounter < maxSeqLength:
                try:
                    firstFile[indexCounter] = wordsList.index(word)
                except ValueError:
                    firstFile[indexCounter] = 399999 #Vector for unknown words
            indexCounter = indexCounter + 1  
        
        docs.append(firstFile)
        
        if(label==0):
            y.append([0, 1])
        else:
            y.append([1, 0])
    docs = np.asarray(docs)
    y = np.asarray(y)
    return docs, y

In [153]:
doc_stream = stream_docs(path='shuffled_movie_data.csv')

numFiles = 50000
#Obteniendo data para entrenar
X, Y = get_minibatch(doc_stream, size=numFiles)
np.save('dataX', X)
np.save('dataY', Y)

In [154]:
X = np.load('dataX.npy')
Y = np.load('dataY.npy')

In [155]:
def getTrainBatch():
    size = 45000
    labels = []
    arr = np.zeros([batchSize, maxSeqLength])
    for i in range(batchSize):
        if (i % 2 == 0): 
            num = randint(1,11499)
            labels.append([1,0])
        else:
            num = randint(13499,24999)
            labels.append([0,1])
        arr[i] = ids[num-1:num]
    return arr, labels

def getTestBatch():
    size = 5000
    labels = []
    arr = np.zeros([batchSize, maxSeqLength])
    for i in range(batchSize):
        if (i % 2 == 0): 
            num = randint(1,11499)
            labels.append([1,0])
        else:
            num = randint(13499,24999)
            labels.append([0,1])
        arr[i] = ids[num-1:num]
    return arr, labels

In [156]:
maxSeqLength = 250

In [189]:
batchSize = 45000
lstmUnits = 64
numClasses = 2
iterations = 10000

In [190]:
import tensorflow as tf
tf.reset_default_graph()

labels = tf.placeholder(tf.float32, [batchSize, numClasses])
input_data = tf.placeholder(tf.int32, [batchSize, maxSeqLength])

In [191]:
data = tf.Variable(tf.zeros([batchSize, maxSeqLength, numDimensions]),dtype=tf.float32)
data = tf.nn.embedding_lookup(wordVectors,input_data)

In [192]:
lstmCell = tf.contrib.rnn.BasicLSTMCell(lstmUnits)
lstmCell = tf.contrib.rnn.DropoutWrapper(cell=lstmCell, output_keep_prob=0.75)
value, _ = tf.nn.dynamic_rnn(lstmCell, data, dtype=tf.float32)

In [193]:
weight = tf.Variable(tf.truncated_normal([lstmUnits, numClasses]))
bias = tf.Variable(tf.constant(0.1, shape=[numClasses]))
value = tf.transpose(value, [1, 0, 2])
last = tf.gather(value, int(value.get_shape()[0]) - 1)
prediction = (tf.matmul(last, weight) + bias)

In [194]:
correctPred = tf.equal(tf.argmax(prediction,1), tf.argmax(labels,1))
accuracy = tf.reduce_mean(tf.cast(correctPred, tf.float32))

In [195]:
loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(logits=prediction, labels=labels))
optimizer = tf.train.AdamOptimizer().minimize(loss)

In [196]:
import datetime

tf.summary.scalar('Loss', loss)
tf.summary.scalar('Accuracy', accuracy)
merged = tf.summary.merge_all()
logdir = "tensorboard/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S") + "/"
writer = tf.summary.FileWriter(logdir, sess.graph)

### Training

In [197]:
sess = tf.InteractiveSession()
saver = tf.train.Saver()
sess.run(tf.global_variables_initializer())

for i in range(iterations):
    #Next Batch of reviews
    nextBatch, nextBatchLabels = getTrainBatch();
    sess.run(optimizer, {input_data: nextBatch, labels: nextBatchLabels})
   
    #Write summary to Tensorboard
    if (i % 50 == 0):
        summary = sess.run(merged, {input_data: nextBatch, labels: nextBatchLabels})
        writer.add_summary(summary, i)

    #Save the network every 10,000 training iterations
    if (i % 1000 == 0 and i != 0):
        save_path = saver.save(sess, "models/pretrained_lstm.ckpt", global_step=i)
        print("saved to %s" % save_path)

    if (i % 1000 == 0 and i != 0):        
        print("iteracion: ", i)
writer.close()

saved to models/pretrained_lstm.ckpt-1000
iteracion:  1000
saved to models/pretrained_lstm.ckpt-2000
iteracion:  2000
saved to models/pretrained_lstm.ckpt-3000
iteracion:  3000
saved to models/pretrained_lstm.ckpt-4000
iteracion:  4000
saved to models/pretrained_lstm.ckpt-5000
iteracion:  5000
saved to models/pretrained_lstm.ckpt-6000
iteracion:  6000
saved to models/pretrained_lstm.ckpt-7000
iteracion:  7000
saved to models/pretrained_lstm.ckpt-8000
iteracion:  8000
saved to models/pretrained_lstm.ckpt-9000
iteracion:  9000


In [198]:
sess = tf.InteractiveSession()
saver = tf.train.Saver()
saver.restore(sess, tf.train.latest_checkpoint('models'))

INFO:tensorflow:Restoring parameters from models/pretrained_lstm.ckpt-9000


In [204]:
iterations = 1
for i in range(iterations):
    nextBatch, nextBatchLabels = getTestBatch();
    print("Accuracy for this batch:", (sess.run(accuracy, {input_data: nextBatch, labels: nextBatchLabels})) * 100)

Accuracy for this batch: 89.99999761581421
