In [1]:
import matplotlib.pyplot as plt
import tensorflow as tf
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

  from ._conv import register_converters as _register_converters


In [2]:
from scipy.spatial.distance import cdist

# from tf.keras.models import Sequential  # This does not work!
from tensorflow.python.keras.models import Sequential
from tensorflow.python.keras.layers import Dense, GRU, Embedding
from tensorflow.python.keras.optimizers import Adam
from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences

In [56]:
df = pd.read_csv("labeledTrainData.csv")
unlabeltext = pd.read_csv("testData.csv")

#Inputs and Output
X = np.array(df['review'])
y = np.array(df['sentiment'])
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = .20,random_state = 42)

#unlabeled dataset
unlabeltext  = np.array(unlabeltext['review'])

In [45]:
#cleaning data
#Tokenzier
num_words = 10000

tokenizer = Tokenizer(num_words=num_words)
tokenizer.fit_on_texts(X)
x_train_tokens = tokenizer.texts_to_sequences(X_train)
x_test_tokens  = tokenizer.texts_to_sequences(X_test)

#print(tokenizer.word_index)
print("Train-set:  ", (X_train[0]))
print("                                                                      ")
print("Train-tokenized-set:  ", (x_train_tokens[0]))

Train-set:   This movie is just plain dumb.<br /><br />From the casting of Ralph Meeker as Mike Hammer to the fatuous climax, the film is an exercise in wooden predictability.<br /><br />Mike Hammer is one of detective fiction's true sociopaths. Unlike Marlow and Spade, who put pieces together to solve the mystery, Hammer breaks things apart to get to the truth. This film turns Hammer into a boob by surrounding him with bad guys who are ... well, too dumb to get away with anything. One is so poorly drawn that he succumbs to a popcorn attack.<br /><br />Other parts of the movie are right out of the Three Stooges play book. Velda's dance at the barre, for instance, or the bad guy who accidentally stabs his boss in the back. And the continuity breaks are shameful: Frau Blucher is running down the centerline of the road when the camera is tight on her lower legs but she's way over the side when the camera pulls back for a wider shot. The worst break, however, precedes the popcorn attack. T

In [9]:
#Padding and Truncating Data¶
#The Recurrent Neural Network can take sequences of arbitrary length as input

#First we count the number of tokens in all the sequences in the data-set.
num_tokens = [len(tokens) for tokens in x_train_tokens + x_test_tokens]
num_tokens = np.array(num_tokens)

print("The average number of tokens in a sequence is:  ", (np.mean(num_tokens)))
print("The maximum number of tokens in a sequence is:  ", (np.max(num_tokens)))

#The max number of tokens we will allow is set to the average plus 2 standard deviations.
max_tokens = np.mean(num_tokens) + 2 * np.std(num_tokens)
max_tokens = int(max_tokens)
print("The max number of tokens we will allow is set to the average plus 2 sd  ", (max_tokens))
print("This covers about 95% of the data-set:  ", (np.sum(num_tokens < max_tokens) / len(num_tokens)))


The average number of tokens in a sequence is:   223.7972
The maximum number of tokens in a sequence is:   2193
The max number of tokens we will allow is set to the average plus 2 sd   551
This covers about 95% of the data-set:   0.94484


In [14]:
#padding or truncating the sequences that have a different length, 
#we need to determine if we want to do this padding or truncating 'pre' or 'post'
pad = 'pre'
x_train_pad = pad_sequences(x_train_tokens, maxlen=max_tokens,padding=pad, truncating=pad)
x_test_pad  = pad_sequences(x_test_tokens,  maxlen=max_tokens,padding=pad, truncating=pad)

#We have now transformed the data into one big matrix of integers (tokens) with this shape:
print("The train-set is transformed into one big matrix of integers (tokens)", (x_train_pad.shape))
print("The test -set is transformed into one big matrix of integers (tokens)", (x_test_pad.shape))

#Padding result
print("                                                                      ")
print("Tokenized training data", (np.array(x_train_tokens[0])))
print("                                                                      ")
print("Padded    training data", (x_train_pad[0]))


The train-set is transformed into one big matrix of integers (tokens) (20000, 551)
The test -set is transformed into one big matrix of integers (tokens) (5000, 551)
                                                                      
Tokenized training data [  11   17    6   40 1041  989    7    7   36    1  973    4 3168   14
 1946 4225    5    1 1326    1   19    6   32 3453    8 1637 8654    7
    7 1946 4225    6   28    4 1252  280 1021    2 8373   34  273 1323
  291    5 3318    1  732 4225 2027  180  969    5   76    5    1  879
   11   19  502 4225   80    3   31 3394   87   16   75  491   34   23
   70   96  989    5   76  242   16  232   28    6   35  859 1307   12
   27    5    3 3939 1271    7    7   82  528    4    1   17   23  205
   43    4    1  288 4656  294  271  833   30    1   15 1821   39    1
   75  229   34 2503 8530   24 1422    8    1  142    2    1 2382 2027
   23 7849    6  617  177    1    4    1 1314   51    1  367    6 2694
   20   38 2368 2976   18  437

In [17]:
#Tokenizer Inverse Map: Converting tokenized back to original text.
idx = tokenizer.word_index
inverse_map = dict(zip(idx.values(), idx.keys()))

#Helper-function for converting a list of tokens back to a string of words.
def tokens_to_string(tokens):

    # Map from tokens back to words.
    words = [inverse_map[token] for token in tokens if token != 0]

    # Concatenate all words.
    text = " ".join(words)
    return text

print("Train-set:  ", (X_train[0]))
print("                                                                      ")
print("Tokenized text converted back to original: ", (tokens_to_string(x_train_tokens[0])))

Train-set:   This movie is just plain dumb.<br /><br />From the casting of Ralph Meeker as Mike Hammer to the fatuous climax, the film is an exercise in wooden predictability.<br /><br />Mike Hammer is one of detective fiction's true sociopaths. Unlike Marlow and Spade, who put pieces together to solve the mystery, Hammer breaks things apart to get to the truth. This film turns Hammer into a boob by surrounding him with bad guys who are ... well, too dumb to get away with anything. One is so poorly drawn that he succumbs to a popcorn attack.<br /><br />Other parts of the movie are right out of the Three Stooges play book. Velda's dance at the barre, for instance, or the bad guy who accidentally stabs his boss in the back. And the continuity breaks are shameful: Frau Blucher is running down the centerline of the road when the camera is tight on her lower legs but she's way over the side when the camera pulls back for a wider shot. The worst break, however, precedes the popcorn attack. T

In [46]:
#Create the Recurrent Neural Network Model

model = Sequential()
embedding_size = 10

model.add(Embedding(input_dim=num_words,
                    output_dim=embedding_size,
                    input_length=max_tokens,
                    name='layer_embedding'))
model.add(GRU(units=16, return_sequences=True))
model.add(GRU(units=8, return_sequences=True))
model.add(GRU(units=4))
model.add(Dense(1, activation='sigmoid'))
optimizer = Adam(lr=1e-3)

tf.logging.set_verbosity(tf.logging.ERROR)

In [47]:
#Compiling RNN model

model.compile(loss='binary_crossentropy',
              optimizer=optimizer,
              metrics=['accuracy'])
model.summary()
tf.logging.set_verbosity(tf.logging.ERROR)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
layer_embedding (Embedding)  (None, 551, 10)           100000    
_________________________________________________________________
gru_13 (GRU)                 (None, None, 16)          1296      
_________________________________________________________________
gru_14 (GRU)                 (None, None, 8)           600       
_________________________________________________________________
gru_15 (GRU)                 (None, 4)                 156       
_________________________________________________________________
dense_5 (Dense)              (None, 1)                 5         
Total params: 102,057
Trainable params: 102,057
Non-trainable params: 0
_________________________________________________________________


In [48]:
#Training the Recurrent Neural Network¶

model.fit(x_train_pad, y_train,
          validation_split=0.05, epochs=3, batch_size=64)

Train on 19000 samples, validate on 1000 samples
Epoch 1/3

Epoch 2/3

Epoch 3/3



<tensorflow.python.keras._impl.keras.callbacks.History at 0x1cd6def8ba8>

In [50]:
#Performance on Test-Set¶
result = model.evaluate(x_test_pad, y_test)
print("Accuracy: {0:.2%}".format(result[1]))


Accuracy: 84.32%


In [51]:
#Predicted sentiment for the first 1000 texts in the test-set.
y_pred = model.predict(x=x_test_pad[0:1000])
y_pred = y_pred.T[0]

#These predicted numbers fall between 0.0 and 1.0.
#We use a cutoff / threshold and say that all values above 0.5 are taken to be 1.0
cls_pred = np.array([1.0 if p>0.5 else 0.0 for p in y_pred])

#The true "class" for the first 1000 texts in the test-set are needed for comparison.
cls_true = np.array(y_test[0:1000])

#We can then get indices for all the texts that were incorrectly classified by comparing all the "classes" of these two arrays.
incorrect = np.where(cls_pred != cls_true)
incorrect = incorrect[0]

#Of the 1000 texts used, how many were mis-classified?
print("Number of Mis-classified texts ", (len(incorrect)))

#Let us look at the first mis-classified text.
print("Index of first mis-classified text ", (incorrect[0]))
idx = incorrect[0]

Number of Mis-classified texts  156
Index of first mis-classified text  18


In [52]:
#Predicted and true classes for the text:

print("Predicted  label ", (y_pred[idx]))
print("True class label ", (cls_true[idx]))
print("                                                                      ")
print("Misclassified text ", (X_test[idx]))

Predicted  label  0.58853066
True class label  0
                                                                      
Misclassified text  The 1930s saw a vogue for documentary films about remote corners of the world, with an emphasis on wild animals, exotic terrain and primitive people with unusual cultures. Despite the logistics of transporting a film crew to a distant and dangerous place, and then bringing 'em back alive (with the film footage), such films were often much cheaper to make than were conventional Hollywood features ... because there were no expensive sets, costumes, or high-priced movie stars.<br /><br />The most successful makers of such films (artistically and financially) were the team of Martin E. Johnson and his wife Osa, who made several documentaries (sometimes with blatantly staged events) in Africa and Asia. The Johnsons' safari films were extremely popular, inspiring several parodies ... most notably Wheeler & Woolsey's \So This is Africa\", in which the ver

In [53]:
#Fitting our model on unlabelled text
tokens = tokenizer.texts_to_sequences(unlabeltext)
tokens_pad = pad_sequences(tokens, maxlen=max_tokens,
                           padding=pad, truncating=pad)

In [54]:
model.predict(tokens_pad)

array([[0.948469  ],
       [0.07643753],
       [0.8616894 ],
       ...,
       [0.44489792],
       [0.9364726 ],
       [0.5162722 ]], dtype=float32)

In [44]:
#References:
#https://www.kaggle.com/c/word2vec-nlp-tutorial/data
#https://github.com/Hvass-Labs/TensorFlow-Tutorials/blob/master/20_Natural_Language_Processing.ipynb