In [1]:
import numpy as np
import pandas as pd
from utils import *
import matplotlib.pyplot as plt

from keras.models import Model
from keras.layers import Dense, Input, Dropout, LSTM, Activation
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.initializers import glorot_uniform

%matplotlib inline

In [2]:
# Read the tab-separated training and test files into DataFrames.
train_df = pd.read_csv('data/train.tsv',sep='\t')
test_df = pd.read_csv('data/test.tsv',sep='\t')
# Preview the first rows of the training frame.
train_df.head()

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage ...,1
1,2,1,A series of escapades demonstrating the adage ...,2
2,3,1,A series,2
3,4,1,A,2
4,5,1,series,2


In [3]:
# Preview the first rows of the test frame.
test_df.head()

Unnamed: 0,PhraseId,SentenceId,Phrase
0,156061,8545,An intermittently pleasing but mostly routine ...
1,156062,8545,An intermittently pleasing but mostly routine ...
2,156063,8545,An
3,156064,8545,intermittently pleasing but mostly routine effort
4,156065,8545,intermittently pleasing but mostly routine


In [4]:
# The PhraseId and SentenceId columns are not used as model inputs, so drop them
# from both frames (the original frames are left untouched).
train_data = train_df.drop(columns=['PhraseId', 'SentenceId'])
test_data = test_df.drop(columns=['PhraseId', 'SentenceId'])

In [5]:
# Confirm that only the Phrase and Sentiment columns remain.
train_data.head()

Unnamed: 0,Phrase,Sentiment
0,A series of escapades demonstrating the adage ...,1
1,A series of escapades demonstrating the adage ...,2
2,A series,2
3,A,2
4,series,2


In [6]:
# Shuffle all rows, then keep the first 20,000 of the shuffled frame.
# A fixed random_state makes the shuffle (and therefore the selected subset and
# everything trained on it) reproducible across "Restart Kernel -> Run All".
train_data = train_data.sample(frac=1, random_state=42).reset_index(drop=True)
train_data = train_data.iloc[:20000]
print(train_data.shape)

(20000, 2)


In [7]:
# Clean every phrase in both frames. text_cleaning comes from utils (not shown
# here); its result is joined with single spaces, so each cell ends up holding
# one cleaned string.
train_data['Phrase'] = train_data['Phrase'].apply(lambda x: ' '.join(text_cleaning(x)))
test_data['Phrase'] = test_data['Phrase'].apply(lambda x: ' '.join(text_cleaning(x)))

In [8]:
# Separate the input text (Phrase) from the target labels (Sentiment).
X_train_data, Y_train_data = train_data['Phrase'],train_data['Sentiment']
# Preview the cleaned phrases.
X_train_data.head()

0                               any summer blockbuster
1    to the stories and faces and music of the men ...
2                                               spears
3    pure venality that s giving it the old college...
4            watching past the second commercial break
Name: Phrase, dtype: object

In [9]:
# Preview the sentiment labels.
Y_train_data.head()

0    3
1    2
2    2
3    2
4    2
Name: Sentiment, dtype: int64

In [10]:
# Number of distinct sentiment labels present in the sampled training set.
classes = Y_train_data.nunique()
print(classes)


5


In [11]:
# Convert the pandas Series into NumPy arrays.
X_train = np.asarray(X_train_data)
Y_train = np.asarray(Y_train_data)
# Fixed number of word positions every phrase is padded to.
max_len = 200 # disabled alternative: len(max(X_train, key=len).split()), i.e. the word count of the longest phrase by characters
# Sanity-check the array shapes and the chosen padding length.
print(X_train.shape)
print(Y_train.shape)
print(max_len)

(20000,)
(20000,)
200


In [12]:
# Load the GloVe vectors via read_glove_vecs (defined in utils, not shown here).
# The three returned objects are used below as word -> index, index -> word and
# word -> vector lookups.
word_to_index, index_to_word, word_to_vec_map = read_glove_vecs('data/glove.6B.50d.txt')

In [13]:
# Vocabulary size of the loaded GloVe file.
print (len(word_to_index))

400000


In [14]:
def sentences_to_indices(X, word_to_index, max_len, unknown_index=None):
    """Convert sentences into a zero-padded matrix of vocabulary indices.

    Each sentence is lower-cased and split on whitespace; word ``j`` of
    sentence ``i`` is written to ``X_indices[i, j]``. Positions past the end
    of a sentence stay 0. Sentences longer than ``max_len`` words are
    truncated to their first ``max_len`` words instead of raising IndexError.

    Parameters
    ----------
    X : sequence of str
        The sentences to encode (e.g. a 1-D NumPy array of strings).
    word_to_index : dict
        Mapping from word to integer index.
    max_len : int
        Number of columns of the returned matrix.
    unknown_index : int, optional
        Index stored for words missing from ``word_to_index``. Defaults to
        ``len(word_to_index)``, which is 400000 for the 400,000-word GloVe
        vocabulary this notebook loads.
        NOTE(review): if the vocabulary indices are 1-based, this default
        coincides with the index of the last vocabulary word -- confirm
        against read_glove_vecs.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(len(X), max_len)`` holding the indices.
    """
    if unknown_index is None:
        unknown_index = len(word_to_index)

    m = len(X)                                   # number of sentences
    X_indices = np.zeros((m, max_len))

    for i, sentence in enumerate(X):
        # Truncate so that j never runs past the last column of X_indices.
        sentence_words = sentence.lower().split()[:max_len]
        for j, w in enumerate(sentence_words):
            X_indices[i, j] = word_to_index.get(w, unknown_index)

    return X_indices

In [15]:
def pretrained_embedding_layer(word_to_vec_map, word_to_index):
    """Create a non-trainable Keras Embedding layer loaded with pretrained vectors.

    Parameters
    ----------
    word_to_vec_map : dict
        Mapping from word to its embedding vector (a 1-D NumPy array). Must
        contain an entry for every key of ``word_to_index``.
    word_to_index : dict
        Mapping from word to the row index its vector should occupy.

    Returns
    -------
    Embedding
        A built Keras ``Embedding`` layer with ``len(word_to_index) + 1`` rows
        whose weights are set to the pretrained vectors and frozen.

    Raises
    ------
    StopIteration
        If ``word_to_vec_map`` is empty.
    """
    # One row more than the vocabulary size, so every index in
    # 0..len(word_to_index) is a valid row of the embedding matrix.
    vocab_len = len(word_to_index) + 1
    # Read the dimensionality from any stored vector rather than relying on a
    # specific word being present in the vocabulary.
    emb_dim = next(iter(word_to_vec_map.values())).shape[0]

    # Rows that no word maps to keep their all-zero vector.
    emb_matrix = np.zeros((vocab_len, emb_dim))
    for word, index in word_to_index.items():
        emb_matrix[index, :] = word_to_vec_map[word]

    # Frozen layer: the pretrained vectors are not updated during training.
    embedding_layer = Embedding(vocab_len, emb_dim, trainable=False)

    # The layer must be built before its weights can be set.
    embedding_layer.build((None,))
    embedding_layer.set_weights([emb_matrix])

    return embedding_layer

In [16]:
def Sentiment_Model(input_shape, word_to_vec_map, word_to_index, num_classes=5):
    """Build the two-layer LSTM sentiment classifier on top of pretrained embeddings.

    Architecture: Input -> frozen pretrained Embedding -> LSTM(128, sequences)
    -> Dropout(0.5) -> LSTM(128, last state) -> Dropout(0.5)
    -> Dense(num_classes) -> softmax.

    Parameters
    ----------
    input_shape : tuple
        Shape of one input example, e.g. ``(max_len,)``.
    word_to_vec_map : dict
        Mapping from word to its pretrained embedding vector.
    word_to_index : dict
        Mapping from word to its integer index.
    num_classes : int, optional
        Width of the output layer (number of sentiment classes). Defaults to 5,
        the value that was previously hard-coded.

    Returns
    -------
    Model
        An uncompiled Keras ``Model`` mapping index sequences to class
        probabilities.
    """
    # The input holds word indices, hence the integer dtype.
    sentence_indices = Input(input_shape, dtype='int32')

    # Look up the pretrained vector for every index.
    embedding_layer = pretrained_embedding_layer(word_to_vec_map, word_to_index)
    embeddings = embedding_layer(sentence_indices)

    # First LSTM returns the full sequence so the second LSTM can consume it.
    X = LSTM(128, return_sequences=True)(embeddings)
    X = Dropout(0.5)(X)
    # Second LSTM returns only its final hidden state.
    X = LSTM(128, return_sequences=False)(X)
    X = Dropout(0.5)(X)
    # Project to one score per class, then normalise to probabilities.
    X = Dense(num_classes)(X)
    X = Activation('softmax')(X)

    model = Model(inputs=sentence_indices, outputs=X)

    return model

In [17]:
# Instantiate the classifier for inputs of max_len word indices and list its layers.
model = Sentiment_Model((max_len,), word_to_vec_map, word_to_index)
model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 200)]             0         
_________________________________________________________________
embedding (Embedding)        (None, 200, 50)           20000050  
_________________________________________________________________
lstm (LSTM)                  (None, 200, 128)          91648     
_________________________________________________________________
dropout (Dropout)            (None, 200, 128)          0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 128)               131584    
_________________________________________________________________
dropout_1 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense (Dense)                (None, 5)                 645   

In [18]:
# Categorical cross-entropy matches the one-hot label encoding used for training below.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [19]:
# Inspect the cleaned training phrases before encoding them.
print(X_train)

['any summer blockbuster'
 'to the stories and faces and music of the men who are its subject'
 'spears' ... 'like a postcard' 'lrb too rrb short' 'in terms']


In [20]:
# Encode each phrase as a row of max_len word indices.
X_train_indices = sentences_to_indices(X_train, word_to_index, max_len)
# One-hot encode the labels with convert_to_one_hot (from utils, not shown here);
# presumably yields one column per class -- confirm against utils.
Y_oh_train = convert_to_one_hot(Y_train, C = classes)

In [21]:
# Train for 20 epochs in mini-batches of 32, reshuffling the data every epoch.
model.fit(X_train_indices, Y_oh_train, epochs = 20, batch_size = 32, shuffle=True)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x1fb9c8f6dc0>

In [None]:
# Encode the cleaned test phrases exactly like the training phrases.
X_test = np.asarray(test_data['Phrase'])
# Run the trained model on the encoded test phrases.
X_test_indices = sentences_to_indices(X_test, word_to_index, max_len)
pred = model.predict(X_test_indices)

In [None]:
# Shape of the prediction array returned by model.predict.
print (pred.shape)

In [None]:
# Inspect the raw model outputs.
print(pred)

In [None]:
# For every sample, take the position of its largest predicted score as the label.
res = [np.argmax(sample_scores) for sample_scores in pred]
