In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules
# (e.g. the local utils module) are reloaded before each cell executes.
%load_ext autoreload
%autoreload 2

In [2]:
import os

# NOTE: the order below is intentional and must be kept: names imported after
# the star import shadow anything utils exports, and NumPy is seeded before
# Keras is imported.
from utils import *
import pandas as pd
import numpy as np

np.random.seed(4266)
from keras.models import Model
from keras.layers import Dense, Input, Dropout, LSTM, Activation
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.initializers import glorot_uniform


Using TensorFlow backend.
  return f(*args, **kwds)


In [3]:
# Leftover inspection cell: this bare expression only echoes the numpy.random
# module object as the cell output; it does not seed or draw anything.
np.random

<module 'numpy.random' from '/Users/lucasosouza/anaconda/envs/udacity/lib/python3.5/site-packages/numpy/random/__init__.py'>

In [4]:
# %pdb

In [5]:
# Path of the GloVe vectors file and of a log file; both are handed to read_glove_vecs.
glove = '../glove_s300.txt'
log = '../log_glove.txt'
# read_glove_vecs is not defined in this notebook (presumably it comes from the
# `utils` star import). Its three return values are used below as
# word -> index, index -> word and word -> vector lookups.
# NOTE(review): what is written to `log` is not visible from here — confirm in utils.
words_to_index, index_to_words, word_to_vec_map = read_glove_vecs(glove, log)

In [6]:
# Sanity check of the two vocabulary lookups: word -> index and index -> word.
# The word and the index are arbitrary probes and are unrelated to each other.
word = "constitucional"
index = 28989
print("the index of", word, "in the vocabulary is", words_to_index[word])
print("the", str(index) + "th word in the vocabulary is", index_to_words[index])

the index of constitucional in the vocabulary is 202317
the 28989th word in the vocabulary is afrobrasileiras


In [7]:
# load X and Y
max_len = 100
data_files = !ls ../data
X, Y = getXY(data_files)
Y_ohe, Y_idx, klass_to_idx, idx_to_klass = convert_to_one_hot(Y)
X_indices = sentences_to_indices(X, words_to_index, max_len)

In [8]:
def pretrained_embedding_layer(word_to_vec_map, word_to_index):
    """
    Creates a frozen Keras Embedding() layer and loads pre-trained GloVe vectors into it.

    The embedding dimensionality is inferred from the vectors themselves, so the
    function works with GloVe files of any size and any language.

    Arguments:
    word_to_vec_map -- dictionary mapping words to their GloVe vector representation (1-D arrays).
    word_to_index -- dictionary mapping from words to their indices in the vocabulary.

    Returns:
    embedding_layer -- pretrained, non-trainable Keras Embedding layer instance.
                       Rows of words that could not be loaded are left as zeros.

    Raises:
    ValueError -- if word_to_vec_map is empty, so no dimensionality can be inferred.
    """
    import warnings

    vocab_len = len(word_to_index) + 1                  # adding 1 to fit Keras embedding (requirement)

    # Infer the dimensionality of the word vectors. "cucumber" was the historical
    # probe word and is still preferred when present, so existing results do not
    # change; any other vector is used when the vocabulary does not contain it
    # (e.g. non-English embeddings).
    reference_vec = word_to_vec_map.get("cucumber")
    if reference_vec is None:
        reference_vec = next(iter(word_to_vec_map.values()), None)
    if reference_vec is None:
        raise ValueError("word_to_vec_map is empty; cannot infer the embedding dimension")
    emb_dim = np.asarray(reference_vec).shape[0]

    # Initialize the embedding matrix as zeros of shape (vocab_len, emb_dim)
    emb_matrix = np.zeros((vocab_len, emb_dim))

    # Set row "index" of the embedding matrix to the vector of the "index"th word.
    # Loading stays best-effort, but only the expected failures are tolerated:
    # a word without a vector (KeyError), a vector of the wrong length (ValueError)
    # or an index outside the matrix (IndexError). Anything else propagates.
    skipped = 0
    for word, index in word_to_index.items():
        try:
            emb_matrix[index, :] = word_to_vec_map[word]
        except (KeyError, ValueError, IndexError):
            skipped += 1

    if skipped:
        warnings.warn(
            "pretrained_embedding_layer: {} of {} words could not be loaded and keep "
            "an all-zero embedding".format(skipped, len(word_to_index))
        )

    # Define the Keras embedding layer with the correct input/output sizes; trainable=False freezes it.
    embedding_layer = Embedding(input_dim=vocab_len, output_dim=emb_dim, trainable=False)

    # The layer has to be built before its weights can be set.
    embedding_layer.build((None,))

    # Set the weights of the embedding layer to the embedding matrix. The layer is now pretrained.
    embedding_layer.set_weights([emb_matrix])

    return embedding_layer

In [9]:
# Sanity check: build the pretrained layer once and print a single weight.
# NOTE(review): create_model() builds its own embedding layer internally, so this
# instance is only used for the print below (the model summary lists the
# model's layer as embedding_2).
embedding_layer = pretrained_embedding_layer(word_to_vec_map, words_to_index)
print("weights[0][1][3] =", embedding_layer.get_weights()[0][1][3])

weights[0][1][3] = 0.066613


In [10]:
def create_model(input_shape, word_to_vec_map, word_to_index, n_classes=17):
    """
    Builds the sentence classifier graph:
    frozen GloVe embedding -> LSTM(128) -> Dropout -> LSTM(128) -> Dropout -> Dense -> softmax.

    Arguments:
    input_shape -- shape of the input, usually (max_len,)
    word_to_vec_map -- dictionary mapping every word in the vocabulary to its GloVe vector representation
    word_to_index -- dictionary mapping from words to their indices in the vocabulary
    n_classes -- number of output classes, i.e. the width of the softmax layer (default 17)

    Returns:
    model -- a (not yet compiled) model instance in Keras
    """

    # Define sentence_indices as the input of the graph; dtype 'int32' because it contains word indices.
    sentence_indices = Input(shape=input_shape, dtype='int32')

    # Create the embedding layer pretrained with GloVe vectors
    embedding_layer = pretrained_embedding_layer(word_to_vec_map, word_to_index)

    # Propagate sentence_indices through the embedding layer to get the embeddings
    embeddings = embedding_layer(sentence_indices)

    # First LSTM with a 128-dimensional hidden state; it returns the whole
    # sequence because the next LSTM consumes one vector per time step.
    X = LSTM(128, return_sequences=True)(embeddings)
    # Dropout with a rate of 0.5
    X = Dropout(0.5)(X)
    # Second LSTM with a 128-dimensional hidden state; only the final hidden
    # state is returned, giving one vector per sentence.
    X = LSTM(128, return_sequences=False)(X)
    # Dropout with a rate of 0.5
    X = Dropout(0.5)(X)
    # Project to one score per class
    X = Dense(n_classes)(X)
    # Turn the scores into class probabilities
    X = Activation('softmax')(X)

    # Create the Model instance which converts sentence_indices into X.
    # `inputs`/`outputs` are the Keras 2 keyword names; the singular
    # `input`/`output` forms are deprecated Keras 1 spellings.
    model = Model(inputs=sentence_indices, outputs=X)

    return model

In [11]:
# Build the classifier for sequences of max_len word indices and print its layer summary.
model = create_model((max_len,), word_to_vec_map, words_to_index)
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 100)               0         
_________________________________________________________________
embedding_2 (Embedding)      (None, 100, 300)          279838200 
_________________________________________________________________
lstm_1 (LSTM)                (None, 100, 128)          219648    
_________________________________________________________________
dropout_1 (Dropout)          (None, 100, 128)          0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 128)               131584    
_________________________________________________________________
dropout_2 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 17)                2193      
__________



In [12]:
# Split the index-encoded sentences and their one-hot labels into train and test sets.
# NOTE(review): splitXY is not defined in this notebook (presumably from utils);
# its split ratio and whether it shuffles are not visible here — confirm.
X_train, X_test, Y_train, Y_test = splitXY(X_indices, Y_ohe)

In [13]:
# Categorical cross-entropy pairs with the softmax output and the one-hot labels (Y_ohe);
# accuracy is tracked as an additional metric.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [14]:
# Train for 2 epochs in shuffled mini-batches of 64. The call's return value
# (a keras History object) is echoed as the cell output.
model.fit(X_train, Y_train, epochs=2, batch_size=64, shuffle=True)

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x131477f60>

In [15]:
# Evaluate on the held-out test split; with the compile settings above the result is [loss, accuracy].
model.evaluate(X_test, Y_test)



[2.29007521344575, 0.26108578534181615]

In [16]:
# Evaluate on the training split ([loss, accuracy]) for comparison with the test-split figures.
model.evaluate(X_train, Y_train)



[2.2613802489949695, 0.2650290215588723]

In [17]:
# Evaluate on the complete dataset ([loss, accuracy]).
# NOTE(review): X_indices/Y_ohe are the arrays that were passed to splitXY, so this
# presumably mixes train and test examples and is not an independent estimate.
model.evaluate(X_indices, Y_ohe)



[2.2671211513454135, 0.2642401127598271]