In [1]:
import matplotlib.pyplot as plt
import tensorflow as tf
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

  from ._conv import register_converters as _register_converters


In [2]:
from scipy.spatial.distance import cdist

# from tf.keras.models import Sequential  # This does not work!
from tensorflow.python.keras.models import Sequential
from tensorflow.python.keras.layers import Dense, GRU, Embedding
from tensorflow.python.keras.optimizers import Adam
from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences

In [115]:
df = pd.read_csv("labeledTrainData.csv")
test1 = pd.read_csv("testData.csv")

#Inputs and Output
X = np.array(df['review'])
y = np.array(df['sentiment'])
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = .20,random_state = 42)

#unlabeled dataset
test  = np.array(test1['review'])

In [68]:
#cleaning data
#Tokenzier
num_words = 10000

tokenizer = Tokenizer(num_words=num_words)
tokenizer.fit_on_texts(X)

In [88]:
#print(tokenizer.word_index)

In [77]:
x_train_tokens = tokenizer.texts_to_sequences(X_train)
x_test_tokens = tokenizer.texts_to_sequences(X_test)

In [76]:
print("Train-set:  ", (X_train[0]))
print("Train-token-set:  ", (x_train_tokens[0]))

Train-set:   This movie is just plain dumb.<br /><br />From the casting of Ralph Meeker as Mike Hammer to the fatuous climax, the film is an exercise in wooden predictability.<br /><br />Mike Hammer is one of detective fiction's true sociopaths. Unlike Marlow and Spade, who put pieces together to solve the mystery, Hammer breaks things apart to get to the truth. This film turns Hammer into a boob by surrounding him with bad guys who are ... well, too dumb to get away with anything. One is so poorly drawn that he succumbs to a popcorn attack.<br /><br />Other parts of the movie are right out of the Three Stooges play book. Velda's dance at the barre, for instance, or the bad guy who accidentally stabs his boss in the back. And the continuity breaks are shameful: Frau Blucher is running down the centerline of the road when the camera is tight on her lower legs but she's way over the side when the camera pulls back for a wider shot. The worst break, however, precedes the popcorn attack. T

In [79]:
#Padding and Truncating Data¶
#The Recurrent Neural Network can take sequences of arbitrary length as input

#First we count the number of tokens in all the sequences in the data-set.
num_tokens = [len(tokens) for tokens in x_train_tokens + x_test_tokens]
num_tokens = np.array(num_tokens)

In [80]:
#The average number of tokens in a sequence is:
np.mean(num_tokens)

223.7972

In [81]:
#The maximum number of tokens in a sequence is:
np.max(num_tokens)

2193

In [83]:
#The max number of tokens we will allow is set to the average plus 2 standard deviations.
max_tokens = np.mean(num_tokens) + 2 * np.std(num_tokens)
max_tokens = int(max_tokens)
max_tokens

551

In [84]:
#This covers about 95% of the data-set.
np.sum(num_tokens < max_tokens) / len(num_tokens)

0.94484

In [86]:
#padding or truncating the sequences that have a different length, 
#we need to determine if we want to do this padding or truncating 'pre' or 'post'
pad = 'pre'
x_train_pad = pad_sequences(x_train_tokens, maxlen=max_tokens,padding=pad, truncating=pad)
x_test_pad = pad_sequences(x_test_tokens, maxlen=max_tokens,padding=pad, truncating=pad)

#We have now transformed the training-set into one big matrix of integers (tokens) with this shape:
x_train_pad.shape


(5000, 551)

In [87]:
#The matrix for the test-set has the same shape:
x_test_pad.shape

(5000, 551)

In [89]:
#For example, we had the following sequence of tokens above:
np.array(x_train_tokens[0])

array([  11,   17,    6,   40, 1041,  989,    7,    7,   36,    1,  973,
          4, 3168,   14, 1946, 4225,    5,    1, 1326,    1,   19,    6,
         32, 3453,    8, 1637, 8654,    7,    7, 1946, 4225,    6,   28,
          4, 1252,  280, 1021,    2, 8373,   34,  273, 1323,  291,    5,
       3318,    1,  732, 4225, 2027,  180,  969,    5,   76,    5,    1,
        879,   11,   19,  502, 4225,   80,    3,   31, 3394,   87,   16,
         75,  491,   34,   23,   70,   96,  989,    5,   76,  242,   16,
        232,   28,    6,   35,  859, 1307,   12,   27,    5,    3, 3939,
       1271,    7,    7,   82,  528,    4,    1,   17,   23,  205,   43,
          4,    1,  288, 4656,  294,  271,  833,   30,    1,   15, 1821,
         39,    1,   75,  229,   34, 2503, 8530,   24, 1422,    8,    1,
        142,    2,    1, 2382, 2027,   23, 7849,    6,  617,  177,    1,
          4,    1, 1314,   51,    1,  367,    6, 2694,   20,   38, 2368,
       2976,   18,  437,   93,  117,    1,  496,   

In [90]:
x_train_pad[0]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,   

In [91]:
#Tokenizer Inverse Map
idx = tokenizer.word_index
inverse_map = dict(zip(idx.values(), idx.keys()))

In [93]:
#Helper-function for converting a list of tokens back to a string of words.
def tokens_to_string(tokens):

    # Map from tokens back to words.
    words = [inverse_map[token] for token in tokens if token != 0]

    # Concatenate all words.
    text = " ".join(words)
    return text

In [95]:
print("Train-set:  ", (X_train[0]))
print("Train-token-set:  ", (tokens_to_string(x_train_tokens[0])))

Train-set:   This movie is just plain dumb.<br /><br />From the casting of Ralph Meeker as Mike Hammer to the fatuous climax, the film is an exercise in wooden predictability.<br /><br />Mike Hammer is one of detective fiction's true sociopaths. Unlike Marlow and Spade, who put pieces together to solve the mystery, Hammer breaks things apart to get to the truth. This film turns Hammer into a boob by surrounding him with bad guys who are ... well, too dumb to get away with anything. One is so poorly drawn that he succumbs to a popcorn attack.<br /><br />Other parts of the movie are right out of the Three Stooges play book. Velda's dance at the barre, for instance, or the bad guy who accidentally stabs his boss in the back. And the continuity breaks are shameful: Frau Blucher is running down the centerline of the road when the camera is tight on her lower legs but she's way over the side when the camera pulls back for a wider shot. The worst break, however, precedes the popcorn attack. T

In [96]:
#Create the Recurrent Neural Network¶

model = Sequential()
embedding_size = 8

model.add(Embedding(input_dim=num_words,
                    output_dim=embedding_size,
                    input_length=max_tokens,
                    name='layer_embedding'))

model.add(GRU(units=16, return_sequences=True))
model.add(GRU(units=8, return_sequences=True))
model.add(GRU(units=4))
model.add(Dense(1, activation='sigmoid'))
optimizer = Adam(lr=1e-3)

Instructions for updating:
keep_dims is deprecated, use keepdims instead


In [97]:
model.compile(loss='binary_crossentropy',
              optimizer=optimizer,
              metrics=['accuracy'])

Instructions for updating:
keep_dims is deprecated, use keepdims instead


In [98]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
layer_embedding (Embedding)  (None, 551, 8)            80000     
_________________________________________________________________
gru_1 (GRU)                  (None, None, 16)          1200      
_________________________________________________________________
gru_2 (GRU)                  (None, None, 8)           600       
_________________________________________________________________
gru_3 (GRU)                  (None, 4)                 156       
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 5         
Total params: 81,961
Trainable params: 81,961
Non-trainable params: 0
_________________________________________________________________


In [99]:
#Train the Recurrent Neural Network¶

model.fit(x_train_pad, y_train,
          validation_split=0.05, epochs=1, batch_size=64)

Train on 19000 samples, validate on 1000 samples
Epoch 1/3

Epoch 2/3

Epoch 3/3



<tensorflow.python.keras._impl.keras.callbacks.History at 0x2a990112748>

In [100]:
#Performance on Test-Set¶
result = model.evaluate(x_test_pad, y_test)




In [101]:
print("Accuracy: {0:.2%}".format(result[1]))


Accuracy: 82.56%


In [102]:
#Example of Mis-Classified Text
y_pred = model.predict(x=x_test_pad[0:1000])
y_pred = y_pred.T[0]

In [105]:
cls_pred = np.array([1.0 if p>0.5 else 0.0 for p in y_pred])
cls_true = np.array(y_test[0:1000])


incorrect = np.where(cls_pred != cls_true)
incorrect = incorrect[0]
len(incorrect)

184

In [106]:
idx = incorrect[0]

idx

29

In [108]:
#mis0classified text
text = X_test[idx]
text

'The key to The 40-Year-Old Virgin is not merely that Andy Stitzer is a 40-year-old virgin, but rather the manner in which Steve Carell presents him as one. In a genre of crass \\comedy\\" that has become typified by its lack of humor and engaging characters, The 40-Year-Old Virgin offers a colorful cast and an intelligent, heartfelt script that doesn\'t use its protagonist as the butt-end of cruel jokes. That Andy is still a virgin at forty years old is not as much a joke, in fact, as it is a curiosity.<br /><br />Carell, a veteran of Team Ferrell in Anchorman and an ex-Daily Show castmember, uses the concept of the film to expand his character Â– we get to understand why Andy is the way he is. It\'s the little things that make this film work. When Andy\'s co-worker at an electronics store asks him what he did for the weekend, Andy describes his failed efforts at cooking. When Andy rides his bike to work, he signals his turns. He doesn\'t just adorn his home with action figures Â– he 

In [110]:
#these are the predicted and true classes for the text:
y_pred[idx]

0.27526477

In [111]:
cls_true[idx]

1

In [116]:
#Lets us try on new data
tokens = tokenizer.texts_to_sequences(test)
tokens_pad = pad_sequences(tokens, maxlen=max_tokens,
                           padding=pad, truncating=pad)
tokens_pad.shape

(25000, 551)

In [117]:
model.predict(tokens_pad)

array([[0.9407548 ],
       [0.05553144],
       [0.8264502 ],
       ...,
       [0.13183978],
       [0.9346127 ],
       [0.20734578]], dtype=float32)

In [None]:
#https://www.kaggle.com/c/word2vec-nlp-tutorial/data