In [141]:
# %load run-jump-start-rnn-sentiment-v002.py

# Program by Thomas W. Miller, August 16, 2018

# Previous work involved gathering embeddings via chakin
# Following methods described in
#    https://github.com/chakki-works/chakin
# The previous program, run-chakin-to-get-embeddings-v001.py
# downloaded pre-trained GloVe embeddings, saved them in a zip archive,
# and unzipped that archive to create the four word-to-embeddings
# text files for use in language models. 

# This program uses word embeddings to set up defaultdict 
# dictionary data structures that can then be employed in language
# models. This is demonstrated with a simple RNN model for predicting
# sentiment (thumbs-down versus thumbs-up) for movie reviews.

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import numpy as np

import os  # operating system functions
import os.path  # for manipulation of file path names

import re  # regular expressions

from collections import defaultdict

import nltk
from nltk.tokenize import TreebankWordTokenizer

In [142]:
import tensorflow as tf

In [143]:
from tensorflow import keras

In [144]:
RANDOM_SEED = 9999

In [145]:
REMOVE_STOPWORDS = False  # no stopword removal 

EVOCABSIZE = 100000  # specify desired size of pre-defined embedding vocabulary 

In [146]:
# ------------------------------------------------------------- 
# Select the pre-defined embeddings source        
# Define vocabulary size for the language model    
# Create a word_to_embedding_dict for GloVe.6B.100d
embeddings_directory = 'embeddings/gloVe.6B'
filename = 'glove.6B.100d.txt'
embeddings_filename = os.path.join(embeddings_directory, filename)
# ------------------------------------------------------------- 

In [147]:
# Utility function for loading embeddings follows methods described in
# https://github.com/guillaume-chevalier/GloVe-as-a-TensorFlow-Embedding-Layer
# Creates the Python defaultdict dictionary word_to_embedding_dict
# for the requested pre-trained word embeddings
# 
# Note the use of defaultdict data structure from the Python Standard Library
# collections_defaultdict.py lets the caller specify a default value up front
# The default value will be returned if the key is not a known dictionary key
# That is, unknown words are represented by a vector of zeros
# For word embeddings, this default value is a vector of zeros
# Documentation for the Python standard library:
#   Hellmann, D. 2017. The Python 3 Standard Library by Example. Boston: 
#     Addison-Wesley. [ISBN-13: 978-0-13-429105-5]
def load_embedding_from_disks(embeddings_filename, with_indexes=True):
    """
    Read an embeddings txt file whose lines each hold a word followed by
    its space-separated vector components.

    Parameters
    ----------
    embeddings_filename : str
        Path of the embeddings text file (opened as UTF-8).
    with_indexes : bool, default True
        Selects which structure is returned.

    Returns
    -------
    If `with_indexes=True`, a tuple
    `(word_to_index_dict, index_to_embedding_array)`: a defaultdict from
    word to row index, and a numpy array with one row per word plus a
    final all-zeros row. Unknown words map to the index of that last row.

    Otherwise a single `word_to_embedding_dict` defaultdict mapping each
    word to its numpy vector; unknown words map to a list of zeros.

    Raises
    ------
    ValueError
        If the file holds no embedding lines.
    """
    word_to_index_dict = dict()
    index_to_embedding_array = []
    word_to_embedding_dict = dict()
    representation = None

    with open(embeddings_filename, 'r', encoding='utf-8') as embeddings_file:
        for line in embeddings_file:
            line = line.rstrip('\n')
            # a blank line carries no vector; keeping it would add a
            # zero-length row and make the embedding matrix ragged
            if not line.strip():
                continue

            split = line.split(' ')
            word = split[0]
            representation = np.array([float(val) for val in split[1:]])

            if with_indexes:
                # the row index is the position in the list being built
                word_to_index_dict[word] = len(index_to_embedding_array)
                index_to_embedding_array.append(representation)
            else:
                word_to_embedding_dict[word] = representation

    if representation is None:
        raise ValueError(
            'No embeddings found in file: {}'.format(embeddings_filename))

    # Empty representation for unknown words.
    _WORD_NOT_FOUND = [0.0] * len(representation)
    if with_indexes:
        _LAST_INDEX = len(index_to_embedding_array)
        word_to_index_dict = defaultdict(
            lambda: _LAST_INDEX, word_to_index_dict)
        index_to_embedding_array = np.array(
            index_to_embedding_array + [_WORD_NOT_FOUND])
        return word_to_index_dict, index_to_embedding_array
    else:
        # pass the loaded mapping in; without it the defaultdict would be
        # empty and every word would look unknown
        word_to_embedding_dict = defaultdict(
            lambda: _WORD_NOT_FOUND, word_to_embedding_dict)
        return word_to_embedding_dict

In [148]:
print('\nLoading embeddings from', embeddings_filename)
word_to_index, index_to_embedding = \
    load_embedding_from_disks(embeddings_filename, with_indexes=True)
print("Embedding loaded from disks.")


Loading embeddings from embeddings/gloVe.6B\glove.6B.100d.txt
Embedding loaded from disks.


In [149]:
type(index_to_embedding)

numpy.ndarray

In [150]:
# Note: unknown words have representations with values [0, 0, ..., 0]

# Additional background code from
# https://github.com/guillaume-chevalier/GloVe-as-a-TensorFlow-Embedding-Layer
# shows the general structure of the data structures for word embeddings
# This code is modified for our purposes in language modeling 
vocab_size, embedding_dim = index_to_embedding.shape
print("Embedding is of shape: {}".format(index_to_embedding.shape))
print("This means (number of words, number of dimensions per word)\n")
print("The first words are words that tend occur more often.")

print("Note: for unknown words, the representation is an empty vector,\n"
      "and the index is the last one. The dictionnary has a limit:")
print("    {} --> {} --> {}".format("A word", "Index in embedding", 
      "Representation"))
word = "worsdfkljsdf"  # a word obviously not in the vocabulary
# looking up an unknown word in the defaultdict returns the index of the
# extra all-zeros row appended after the last real embedding
idx = word_to_index[word] # index for word obviously not in the vocabulary
# that unknown-word index equals the number of words read from the file
complete_vocabulary_size = idx 
embd = list(np.array(index_to_embedding[idx], dtype=int)) # cast to int only to print compactly
print("    {} --> {} --> {}".format(word, idx, embd))
word = "the"
idx = word_to_index[word]
embd = list(index_to_embedding[idx])  # printed as the original float values (no int cast here)
print("    {} --> {} --> {}".format(word, idx, embd))

Embedding is of shape: (400001, 100)
This means (number of words, number of dimensions per word)

The first words are words that tend occur more often.
Note: for unknown words, the representation is an empty vector,
and the index is the last one. The dictionnary has a limit:
    A word --> Index in embedding --> Representation
    worsdfkljsdf --> 400000 --> [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
    the --> 0 --> [-0.038194, -0.24487, 0.72812, -0.39961, 0.083172, 0.043953, -0.39141, 0.3344, -0.57545, 0.087459, 0.28787, -0.06731, 0.30906, -0.26384, -0.13231, -0.20757, 0.33395, -0.33848, -0.31743, -0.48336, 0.1464, -0.37304, 0.34577, 0.052041, 0.44946, -0.46971, 0.02628, -0.54155, -0.15518, -0.14107, -0.039722, 0.28277, 0.14393, 

In [151]:
# Show how to use embeddings dictionaries with a test sentence
# This is a famous typing exercise with all letters of the alphabet
# https://en.wikipedia.org/wiki/The_quick_brown_fox_jumps_over_the_lazy_dog
a_typing_test_sentence = 'The quick brown fox jumps over the lazy dog'
print('\nTest sentence: ', a_typing_test_sentence, '\n')
words_in_test_sentence = a_typing_test_sentence.split()

print('Test sentence embeddings from complete vocabulary of', 
      complete_vocabulary_size, 'words:\n')
for word in words_in_test_sentence:
    word_ = word.lower()
    embedding = index_to_embedding[word_to_index[word_]]
    print(word_ + ": ", embedding)


Test sentence:  The quick brown fox jumps over the lazy dog 

Test sentence embeddings from complete vocabulary of 400000 words:

the:  [-0.038194 -0.24487   0.72812  -0.39961   0.083172  0.043953 -0.39141
  0.3344   -0.57545   0.087459  0.28787  -0.06731   0.30906  -0.26384
 -0.13231  -0.20757   0.33395  -0.33848  -0.31743  -0.48336   0.1464
 -0.37304   0.34577   0.052041  0.44946  -0.46971   0.02628  -0.54155
 -0.15518  -0.14107  -0.039722  0.28277   0.14393   0.23464  -0.31021
  0.086173  0.20397   0.52624   0.17164  -0.082378 -0.71787  -0.41531
  0.20335  -0.12763   0.41367   0.55187   0.57908  -0.33477  -0.36559
 -0.54857  -0.062892  0.26584   0.30205   0.99775  -0.80481  -3.0243
  0.01254  -0.36942   2.2167    0.72201  -0.24978   0.92136   0.034514
  0.46745   1.1079   -0.19358  -0.074575  0.23353  -0.052062 -0.22044
  0.057162 -0.15806  -0.30798  -0.41625   0.37972   0.15006  -0.53212
 -0.2055   -1.2526    0.071624  0.70565   0.49744  -0.42063   0.26148
 -1.538    -0.30223  -0.

In [152]:
# ------------------------------------------------------------- 
# Define vocabulary size for the language model    
# To reduce the size of the vocabulary to the n most frequently used words

def default_factory():
    """Return the index given to any word outside the limited vocabulary.

    That index is EVOCABSIZE, the position of the last (unknown-word) row
    of limited_index_to_embedding.
    """
    unknown_word_index = EVOCABSIZE
    return unknown_word_index

In [153]:
# word_to_index.items() yields (word, index) pairs; keep only the pairs whose
# index lies inside the first EVOCABSIZE rows, and let default_factory supply
# the index for every other word
limited_word_to_index = defaultdict(
    default_factory,
    ((token, row) for token, row in word_to_index.items() if row < EVOCABSIZE),
)

In [154]:
# Stack the first EVOCABSIZE rows of index_to_embedding on top of its final
# row, which is the all-zeros vector used for unknown words
limited_index_to_embedding = np.concatenate(
    (index_to_embedding[:EVOCABSIZE], index_to_embedding[-1:]),
    axis=0,
)

In [155]:
# Delete large numpy array to clear some CPU RAM
del index_to_embedding

In [156]:
# Verify the new vocabulary: should get same embeddings for test sentence
# Note that a small EVOCABSIZE may yield some zero vectors for embeddings
print('\nTest sentence embeddings from vocabulary of', EVOCABSIZE, 'words:\n')
for word in words_in_test_sentence:
    word_ = word.lower()
    embedding = limited_index_to_embedding[limited_word_to_index[word_]]
    print(word_ + ": ", embedding)


Test sentence embeddings from vocabulary of 100000 words:

the:  [-0.038194 -0.24487   0.72812  -0.39961   0.083172  0.043953 -0.39141
  0.3344   -0.57545   0.087459  0.28787  -0.06731   0.30906  -0.26384
 -0.13231  -0.20757   0.33395  -0.33848  -0.31743  -0.48336   0.1464
 -0.37304   0.34577   0.052041  0.44946  -0.46971   0.02628  -0.54155
 -0.15518  -0.14107  -0.039722  0.28277   0.14393   0.23464  -0.31021
  0.086173  0.20397   0.52624   0.17164  -0.082378 -0.71787  -0.41531
  0.20335  -0.12763   0.41367   0.55187   0.57908  -0.33477  -0.36559
 -0.54857  -0.062892  0.26584   0.30205   0.99775  -0.80481  -3.0243
  0.01254  -0.36942   2.2167    0.72201  -0.24978   0.92136   0.034514
  0.46745   1.1079   -0.19358  -0.074575  0.23353  -0.052062 -0.22044
  0.057162 -0.15806  -0.30798  -0.41625   0.37972   0.15006  -0.53212
 -0.2055   -1.2526    0.071624  0.70565   0.49744  -0.42063   0.26148
 -1.538    -0.30223  -0.073438 -0.28312   0.37104  -0.25217   0.016215
 -0.017099 -0.38984   0.

In [157]:
# ------------------------------------------------------------
# code for working with movie reviews data 
# Source: Miller, T. W. (2016). Web and Network Data Science.
#    Upper Saddle River, N.J.: Pearson Education.
#    ISBN-13: 978-0-13-388644-3
# This original study used a simple bag-of-words approach
# to sentiment analysis, along with pre-defined lists of
# negative and positive words.        
# Code available at:  https://github.com/mtpa/wnds       
# ------------------------------------------------------------
# Utility function to get file names within a directory
def listdir_no_hidden(path):
    """List the entries of `path`, leaving out names that begin with a dot."""
    return [entry for entry in os.listdir(path) if not entry.startswith('.')]

In [158]:
# define list of codes to be dropped from document
# carriage-returns, line-feeds, tabs
codelist = ['\r', '\n', '\t']

# We will not remove stopwords in this exercise because they are
# important to keeping sentences intact
if REMOVE_STOPWORDS:
    print(nltk.corpus.stopwords.words('english'))

    # previous analysis of a list of top terms showed a number of words, along
    # with contractions and other word strings to drop from further analysis,
    # add these to the usual English stopwords to be dropped from a document
    # collection
    # note: 'br' and 've' must be separate items; without the comma between
    # them Python joins the adjacent literals into the single word 'brve'
    more_stop_words = [
        'cant', 'didnt', 'doesnt', 'dont', 'goes', 'isnt', 'hes',
        'shes', 'thats', 'theres', 'theyre', 'wont', 'youll', 'youre',
        'youve', 'br', 've', 're', 'vs',
    ]

    some_proper_nouns_to_remove = [
        'dick', 'ginger', 'hollywood', 'jack',
        'jill', 'john', 'karloff', 'kudrow', 'orson', 'peter', 'tcm', 'tom',
        'toni', 'welles', 'william', 'wolheim', 'nikita',
    ]

    # start with the initial list and add to it for movie text work
    stoplist = (nltk.corpus.stopwords.words('english')
                + more_stop_words
                + some_proper_nouns_to_remove)

In [159]:
# text parsing function for creating text documents 
# there is more we could do for data preparation 
# stemming... looking for contractions... possessives... 
# but we will work with what we have in this parsing function
# if we want to do stemming at a later time, we can use
#     porter = nltk.PorterStemmer()  
# in a construction like this
#     words_stemmed =  [porter.stem(word) for word in initial_words]  
def text_parse(string):
    """
    Normalize the text of one document into lowercase words separated by
    single blanks.

    Steps, in order: every character that is not an ASCII letter becomes
    blank space; single characters surrounded by whitespace are dropped;
    the text is lowercased; when REMOVE_STOPWORDS is set, the entries of
    `stoplist` are dropped; runs of whitespace collapse to one blank.

    Reads the module-level `codelist` and `REMOVE_STOPWORDS`, and `stoplist`
    only when stopword removal is switched on.

    Parameters
    ----------
    string : str
        Raw document text.

    Returns
    -------
    str
        The cleaned text.
    """
    # replace every non-letter (digits and punctuation included) with space
    temp_string = re.sub(r'[^a-zA-Z]', '  ', string)
    # replace codes with space
    # (the substitution above has already turned these control characters
    # into blanks, so this loop is kept only as a safeguard)
    for code in codelist:
        stopstring = ' ' + code + '  '
        temp_string = re.sub(stopstring, '  ', temp_string)
    # replace single-character words with space
    # raw strings keep \s a regex escape rather than an invalid string escape
    temp_string = re.sub(r'\s.\s', ' ', temp_string)
    # convert uppercase to lowercase
    temp_string = temp_string.lower()
    if REMOVE_STOPWORDS:
        # replace selected character strings/stop-words with space
        for stop_word in stoplist:
            # escape the word so it is matched literally, not as a pattern
            stopstring = ' ' + re.escape(str(stop_word)) + ' '
            temp_string = re.sub(stopstring, ' ', temp_string)
    # replace multiple blank characters with one blank character
    temp_string = re.sub(r'\s+', ' ', temp_string)
    return temp_string

In [160]:
# -----------------------------------------------
# gather data for 500 negative movie reviews
# -----------------------------------------------
dir_name = 'movie-reviews-negative'
filenames = listdir_no_hidden(path=dir_name)
num_files = len(filenames)

# every listed entry must be a regular file
for entry in filenames:
    file_exists = os.path.isfile(os.path.join(dir_name, entry))
    assert file_exists
print('\nDirectory:', dir_name)
print('%d files found' % num_files)


Directory: movie-reviews-negative
500 files found


In [161]:
# Read data for negative movie reviews
# Data will be stored in a list of lists where the each list represents 
# a document and document is a list of words.
# We then break the text into words.

def read_data(filename):
    """Read one review file and return its contents as a list of word tokens.

    The file is read as UTF-8, lowercased, cleaned with text_parse and then
    split with NLTK's TreebankWordTokenizer.
    """
    with open(filename, encoding='utf-8') as review_file:
        raw_text = tf.compat.as_str(review_file.read())

    cleaned_text = text_parse(raw_text.lower())
    return TreebankWordTokenizer().tokenize(cleaned_text)  # The Penn Treebank

print('\nProcessing document files under', dir_name)

# one list of word tokens per negative review, in directory-listing order
negative_documents = []
for review_name in filenames[:num_files]:
    words = read_data(os.path.join(dir_name, review_name))
    negative_documents.append(words)


Processing document files under movie-reviews-negative


In [162]:
# -----------------------------------------------
# gather data for 500 positive movie reviews
# -----------------------------------------------
dir_name = 'movie-reviews-positive'
filenames = listdir_no_hidden(path=dir_name)
num_files = len(filenames)

# every listed entry must be a regular file
for i in range(len(filenames)):
    file_exists = os.path.isfile(os.path.join(dir_name, filenames[i]))
    assert file_exists
print('\nDirectory:', dir_name)
print('%d files found' % len(filenames))

# Read data for positive movie reviews
# Data will be stored in a list of lists where each list
# represents a document and a document is a list of words.
# read_data, defined with the negative-review code above, does the reading
# and tokenizing; it is reused here rather than defined a second time.

positive_documents = []

print('\nProcessing document files under', dir_name)
for i in range(num_files):
    # read_data returns a plain list of token strings
    words = read_data(os.path.join(dir_name, filenames[i]))

    positive_documents.append(words)
    #I guess the tokenizer outputs a list of strings, not an array or tokenizer oject


Directory: movie-reviews-positive
500 files found

Processing document files under movie-reviews-positive


In [163]:
positive_documents[0][0]

'bromwell'

In [164]:
# -----------------------------------------------------
# convert positive/negative documents into numpy array
# note that reviews vary from 22 to 1052 words   
# so we use the first 20 and last 20 words of each review 
# as our word sequences for analysis
# -----------------------------------------------------
# longest review, in words, across both sentiment classes
max_review_length = 0  # initialize
for doc in negative_documents + positive_documents:
    if len(doc) > max_review_length:
        max_review_length = len(doc)
print('max_review_length:', max_review_length)

max_review_length: 1052


In [165]:
# shortest review, in words, across both sentiment classes
min_review_length = max_review_length  # initialize
for doc in negative_documents + positive_documents:
    if len(doc) < min_review_length:
        min_review_length = len(doc)
print('min_review_length:', min_review_length)

min_review_length: 22


In [166]:
# build one 40-word list per review (negative reviews first, then positive)
# from the review's first 20 and last 20 words
from itertools import chain
documents = []
for doc in chain(negative_documents, positive_documents):
    doc_begin = doc[:20]
    doc_end = doc[len(doc) - 20:]
    documents.append(list(chain(doc_begin, doc_end)))

In [167]:
len(documents[0])

40

In [168]:
#remember you have an embedding matrix prepared with a limited vocabulary of 100,000 words (EVOCABSIZE), plus one all-zeros row for unknown words
limited_index_to_embedding[0,1]

-0.24487

In [169]:
# embeddings: per document, the embedding vector of each of its words
# sequences: per document, the embedding-row index of each of its words
embeddings = []
sequences = []
for doc in documents:
    # look each word up once, then fetch the rows for those indexes
    sequence = [limited_word_to_index[word] for word in doc]
    embedding = [limited_index_to_embedding[index] for index in sequence]
    embeddings.append(embedding)
    sequences.append(sequence)
#embeddings has three dimensions
#sequences has two dimensions

In [170]:
len(embeddings[0][0])

100

In [171]:
type(embeddings)

list

In [172]:
sequences[0]

[523,
 3,
 300,
 38,
 31,
 28638,
 5044,
 10,
 9610,
 2383,
 66,
 17,
 875,
 1500,
 12,
 14,
 11025,
 880,
 3,
 12413,
 17,
 77,
 219,
 22181,
 21,
 581,
 353,
 100000,
 100000,
 581,
 1569,
 10945,
 23760,
 5,
 15678,
 16083,
 86,
 30,
 541,
 3442]

In [173]:
# -----------------------------------------------------    
# Make embeddings a numpy array for use in an RNN 
# Create training and test sets with Scikit Learn
# -----------------------------------------------------
embeddings_array = np.array(embeddings)
sequences_array = np.array(sequences)

# Define the labels: negative (0) for every negative review, then
# positive (1) for every positive review, in the same order in which the
# reviews were added to documents. The counts come from the document lists
# so the labels stay aligned if the number of review files changes.
thumbs_down_up = np.concatenate(
    (np.zeros(len(negative_documents), dtype=np.int32),
     np.ones(len(positive_documents), dtype=np.int32)),
    axis=0)
assert len(thumbs_down_up) == len(sequences_array), \
    'expected exactly one label per document'

## up to this point its all been about the embedding layer

# Scikit Learn for random splitting of the data
from sklearn.model_selection import train_test_split

# Random splitting of the data in to training (80%) and test (20%)
# The split is made on sequences_array (word indexes), not embeddings_array,
# because the Keras Embedding layer does the index-to-vector lookup itself
X_train, X_test, y_train, y_test = train_test_split(
    sequences_array, thumbs_down_up, test_size=0.20,
    random_state=RANDOM_SEED)

X_train.shape

#X_train holds 800 reviews, each a sequence of 40 word indexes into the embedding matrix

(800, 40)

In [174]:
sum(y_train)/len(y_train)

0.5075

In [175]:
sum(y_test)/len(y_test)

0.47

In [176]:
## this is where I will start implementing my own KERAS code
## i need my X_train data to be in the right shape and form
##i need my Embedding Matrix to be set up correctly

In [177]:
len(limited_index_to_embedding)

100001

In [178]:
# Seed TensorFlow's global random generator with the same seed used for the
# train/test split, so that weight initialization and training are as
# repeatable as TensorFlow allows
tf.random.set_seed(RANDOM_SEED)
model = tf.keras.Sequential()

In [179]:
# Frozen embedding layer initialized from the pre-trained matrix.
# The sizes are read from the data instead of being typed in, so choosing a
# different embeddings file or sequence length cannot leave them out of sync:
#   input_dim    = rows of the matrix (EVOCABSIZE words + the unknown-word row)
#   output_dim   = columns of the matrix (dimension of the embeddings)
#   input_length = words per review in X_train
model.add(
    keras.layers.Embedding(
        input_dim=limited_index_to_embedding.shape[0],
        output_dim=limited_index_to_embedding.shape[1],
        embeddings_initializer=keras.initializers.Constant(
            limited_index_to_embedding),
        input_length=X_train.shape[1],
        trainable=False,
    )
)

In [180]:
# bidirectional LSTM encoder followed by a small dense head; the final
# Dense(1) has no activation, so the model outputs one raw score per review
for layer in (
    keras.layers.Bidirectional(keras.layers.LSTM(128)),
    keras.layers.Dense(64, activation='relu'),
    keras.layers.Dense(1),
):
    model.add(layer)
model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 40, 100)           10000100  
_________________________________________________________________
bidirectional_2 (Bidirection (None, 256)               234496    
_________________________________________________________________
dense_4 (Dense)              (None, 64)                16448     
_________________________________________________________________
dense_5 (Dense)              (None, 1)                 65        
Total params: 10,251,109
Trainable params: 251,009
Non-trainable params: 10,000,100
_________________________________________________________________


In [181]:
# The last layer has no activation, so the model outputs logits and the loss
# is told so with from_logits=True. Accuracy must then split predictions at
# logit 0.0 (probability 0.5); the default 'accuracy' metric would split the
# raw logits at 0.5 instead. The metric keeps the name 'accuracy' so the
# training log and history keys are unchanged.
model.compile(
    loss=keras.losses.BinaryCrossentropy(from_logits=True),
    optimizer=keras.optimizers.Adam(.0001),
    metrics=[keras.metrics.BinaryAccuracy(name='accuracy', threshold=0.0)],
)

In [182]:
#figuring out what the inputs look like for this tf model

# import tensorflow_datasets as tfds
# dataset, info = tfds.load('imdb_reviews/subwords8k', with_info=True,
#                           as_supervised=True)
# train_dataset, test_dataset = dataset['train'], dataset['test']


In [184]:
# train for 30 epochs in batches of 64, scoring the held-out split after
# each epoch
model.fit(
    x=X_train,
    y=y_train,
    batch_size=64,
    epochs=30,
    validation_data=(X_test, y_test),
)

Train on 800 samples, validate on 200 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<tensorflow.python.keras.callbacks.History at 0x2294fc90fc8>

In [None]:
#####
# all code from here and down is scrap

In [122]:
sequence_input = keras.layers.Input(shape = (40,),dtype='int32')

In [123]:
#I think this will end up looking like the embeddings 3d array from class code
embedded_sequences = embedding_layer(sequence_input)

In [125]:
x = keras.layers.Conv1D(128, 5, activation='relu')(embedded_sequences)
x = keras.layers.MaxPooling1D(5)(x)
x = keras.layers.Conv1D(128, 5, activation='relu')(x)
x = keras.layers.MaxPooling1D(5)(x)
x = keras.layers.Conv1D(128, 5, activation='relu')(x)
x = keras.layers.GlobalMaxPooling1D()(x)
x = keras.layers.Dense(0, activation='relu')(x)
preds = keras.layers.Dense(1)(x)

ValueError: Negative dimension size caused by subtracting 5 from 3 for 'max_pooling1d_1/MaxPool' (op: 'MaxPool') with input shapes: [?,3,1,128].

In [91]:
# -----------------------------------------------------    
# Check on the embeddings list of list of lists 
# -----------------------------------------------------
# Show the first word in the first document
test_word = documents[0][0]    
print('First word in first document:', test_word)    
print('Embedding for this word:\n', 
      limited_index_to_embedding[limited_word_to_index[test_word]])
print('Corresponding embedding from embeddings list of list of lists\n',
      embeddings[0][0][:])

First word in first document: story
Embedding for this word:
 [-1.6987e-01  7.4241e-01  4.3299e-01 -4.5484e-01  9.9715e-02  7.1426e-01
 -6.4353e-02 -2.5535e-01  2.5013e-01 -2.9634e-01  7.7283e-01  5.5974e-01
 -6.2684e-02  2.1245e-01 -1.1726e-01  6.0059e-01  2.9025e-01  5.7505e-02
 -1.5385e-01  3.2536e-01  5.8805e-01 -3.9970e-01 -3.6610e-01  2.6827e-01
  1.0531e+00 -2.2903e-01 -2.3219e-02 -3.0292e-01 -6.6937e-01  3.6806e-01
 -4.5865e-02  2.6706e-01 -1.0901e-01 -4.4617e-01 -5.7160e-01 -5.3780e-02
 -5.4940e-02  8.8485e-03  5.9055e-01 -8.7651e-02 -2.4036e-01  1.7260e-01
 -5.4645e-01  2.2699e-02  8.6777e-02 -1.6234e-02 -4.4207e-01 -4.9677e-01
  6.5760e-01 -1.9751e-01  2.5612e-01  3.6991e-02  1.0723e+00  1.3145e+00
 -2.9942e-02 -2.7365e+00 -1.5057e-01  2.7365e-01  1.0993e+00  4.0902e-02
 -4.8319e-01  1.6665e+00 -4.6939e-02 -7.2553e-01  1.4452e+00 -6.4141e-01
  6.4354e-01  4.6000e-01 -4.4806e-01  3.6381e-01  2.3893e-03  3.2208e-01
 -2.6913e-01 -1.1554e-01  5.3460e-01  3.1170e-01  3.5561e-01 -

In [92]:
# Show the tenth word in the seventh document
# (documents[6] is the seventh document; [9] is its tenth word)
test_word = documents[6][9]
print('Tenth word in seventh document:', test_word)
print('Embedding for this word:\n',
      limited_index_to_embedding[limited_word_to_index[test_word]])
print('Corresponding embedding from embeddings list of list of lists\n',
      embeddings[6][9][:])

First word in first document: but
Embedding for this word:
 [-5.7078e-02  3.9874e-01  6.8861e-01 -6.8151e-01 -4.5583e-01  2.0080e-01
  1.7974e-01  5.3648e-02  4.3762e-01 -2.6725e-02  1.3383e-01 -7.8137e-03
  4.2207e-01 -3.1801e-01  1.8065e-01 -3.5387e-01 -3.0929e-01  4.0660e-02
 -4.8854e-01  3.7910e-01  4.7955e-01 -4.1942e-02  4.0894e-01  1.2419e-01
  4.0096e-01  1.9545e-01 -3.7819e-01 -7.7684e-01 -2.0677e-01 -4.3130e-01
 -1.0095e-01  3.9866e-01 -2.9612e-01 -8.3111e-02 -1.9026e-02  5.3927e-01
  1.1912e-03  3.0235e-01 -3.6048e-01 -4.8434e-01 -4.7751e-01 -3.3922e-01
  3.4788e-01 -1.7484e-01 -2.2613e-01 -3.2910e-01  8.1259e-01 -5.8452e-01
  1.4509e-01 -7.1497e-01  1.7107e-01 -2.4833e-01  2.2104e-01  1.5517e+00
  4.0869e-02 -2.9103e+00 -2.0812e-01 -1.7625e-01  1.6597e+00  8.6277e-01
 -3.2527e-01  6.5641e-01 -1.3142e-01  3.2312e-01  9.0836e-01 -2.9105e-01
  8.4975e-01  5.3217e-01  1.5041e-01 -2.7983e-01 -2.9015e-02 -6.3378e-01
  1.2237e-01 -7.9144e-01  1.6108e-01  1.7446e-02 -3.5095e-01 -1.

In [93]:
# Show the last word in the last document
# (index 999 is the last of the 1000 documents, 39 the last of its 40 words)
test_word = documents[999][39]
print('Last word in last document:', test_word)
print('Embedding for this word:\n',
      limited_index_to_embedding[limited_word_to_index[test_word]])
print('Corresponding embedding from embeddings list of list of lists\n',
      embeddings[999][39][:])

First word in first document: from
Embedding for this word:
 [ 3.0731e-01  2.4737e-01  6.8231e-01 -5.2367e-01  4.4053e-01  4.2044e-01
  2.5140e-04  1.5265e-01 -6.1363e-01  2.2631e-01  8.3071e-02  7.0425e-02
  1.7683e-02  5.6807e-01  1.0067e+00 -4.6206e-01  4.4524e-01 -5.0984e-01
 -4.2985e-01  1.9935e-01  2.2729e-01  5.1662e-01  5.6282e-01  4.1282e-01
  1.7742e-01 -1.5694e-01 -1.1505e-01 -3.8050e-01  4.7440e-01 -1.6686e-01
  2.3153e-01  6.3698e-02 -1.0716e-01 -2.6848e-01 -4.2665e-01  5.2237e-01
  9.5376e-02  6.4020e-01 -5.2221e-01 -1.3856e-01 -9.8307e-01 -3.5320e-01
 -5.2161e-01  1.1277e-01  3.1634e-01  1.3297e-01 -4.9571e-02 -1.3785e-01
  1.1317e-01 -5.0644e-01  3.8373e-01  3.6698e-01  3.9106e-01  9.8143e-01
 -5.4410e-01 -2.4640e+00 -6.8383e-01 -9.6243e-01  2.2017e+00  5.6643e-01
 -4.9410e-02  1.3093e+00 -4.0073e-01  8.3530e-01  1.7440e-01  4.4926e-02
  5.4118e-01 -1.1038e-01  3.8200e-01  1.5369e-01 -3.7072e-01 -1.3141e-01
 -5.2504e-01 -5.6775e-01 -1.6822e-01 -9.1726e-02  8.1418e-02  4

In [99]:
sequences[0][0]

523

In [100]:
# --------------------------------------------------------------------------      
# We use a very simple Recurrent Neural Network for this assignment
# Géron, A. 2017. Hands-On Machine Learning with Scikit-Learn & TensorFlow: 
#    Concepts, Tools, and Techniques to Build Intelligent Systems. 
#    Sebastopol, Calif.: O'Reilly. [ISBN-13 978-1-491-96229-9] 
#    Chapter 14 Recurrent Neural Networks, pages 390-391
#    Source code available at https://github.com/ageron/handson-ml
#    Jupyter notebook file 14_recurrent_neural_networks.ipynb
#    See section on Training an sequence Classifier, # In [34]:
#    which uses the MNIST case data...  we revise to accommodate
#    the movie review data in this assignment    
# --------------------------------------------------------------------------  
reset_graph()

NameError: name 'reset_graph' is not defined

In [34]:
n_steps = embeddings_array.shape[1]  # number of words per document 
n_inputs = embeddings_array.shape[2]  # dimension of  pre-trained embeddings
n_neurons = 20  # analyst specified number of neurons
n_outputs = 2  # thumbs-down or thumbs-up

learning_rate = 0.001

In [41]:
n_steps

40

In [35]:
X = tf.placeholder(tf.float32, [None, n_steps, n_inputs])
y = tf.placeholder(tf.int32, [None])

In [36]:
basic_cell = tf.contrib.rnn.BasicRNNCell(num_units=n_neurons)
outputs, states = tf.nn.dynamic_rnn(basic_cell, X, dtype=tf.float32)

AttributeError: module 'tensorflow_core.compat.v1' has no attribute 'contrib'

In [None]:
logits = tf.layers.dense(states, n_outputs)
xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y,
                                                          logits=logits)
loss = tf.reduce_mean(xentropy)
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
training_op = optimizer.minimize(loss)
correct = tf.nn.in_top_k(logits, y, 1)
accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))

init = tf.global_variables_initializer()

n_epochs = 50
batch_size = 100

with tf.Session() as sess:
    init.run()
    for epoch in range(n_epochs):
        print('\n  ---- Epoch ', epoch, ' ----\n')
        for iteration in range(y_train.shape[0] // batch_size):          
            X_batch = X_train[iteration*batch_size:(iteration + 1)*batch_size,:]
            y_batch = y_train[iteration*batch_size:(iteration + 1)*batch_size]
            print('  Batch ', iteration, ' training observations from ',  
                  iteration*batch_size, ' to ', (iteration + 1)*batch_size-1,)
            sess.run(training_op, feed_dict={X: X_batch, y: y_batch})
        acc_train = accuracy.eval(feed_dict={X: X_batch, y: y_batch})
        acc_test = accuracy.eval(feed_dict={X: X_test, y: y_test})
        print('\n  Train accuracy:', acc_train, 'Test accuracy:', acc_test)

In [42]:
model = keras.Sequential([
  keras.layers.Conv2D(32, kernel_size=3, activation=  'relu', input_shape=(40,50)),
  keras.layers.GlobalAveragePooling1D(),
  keras.layers.Dense(16, activation='relu'),
  keras.layers.Dense(1)
])

model.summary()

NameError: name 'layers' is not defined