In [1]:
import matplotlib.pyplot as plt
import tensorflow as tf
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

  from ._conv import register_converters as _register_converters


In [2]:
from scipy.spatial.distance import cdist

# from tf.keras.models import Sequential  # This does not work!
from tensorflow.python.keras.models import Sequential
from tensorflow.python.keras.layers import Dense, GRU, Embedding
from tensorflow.python.keras.optimizers import Adam
from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences

In [102]:
df = pd.read_csv("training.txt",sep="	", header=None)
unlabeltext  = pd.read_fwf('testdata.txt')

#Inputs and Output
X = np.array(df[1])
y = np.array(df[0])
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = .20,random_state = 42)

unlabeltext.fillna('None',inplace=True)
#unlabeltext.replace('None', np.nan, inplace=True)
unlabeltext  = np.array(unlabeltext)
print(X)

['The Da Vinci Code book is just awesome.'
 "this was the first clive cussler i've ever read, but even books like Relic, and Da Vinci code were more plausible than this."
 'i liked the Da Vinci Code a lot.' ...
 'As I sit here, watching the MTV Movie Awards, I am reminded of how much I despised the movie Brokeback Mountain.'
 'Ok brokeback mountain is such a horrible movie.'
 'Oh, and Brokeback Mountain was a terrible movie.']


In [82]:
#cleaning data
#Tokenzier
num_words = 10000

tokenizer = Tokenizer(num_words=num_words)
tokenizer.fit_on_texts(X)
x_train_tokens = tokenizer.texts_to_sequences(X_train)
x_test_tokens  = tokenizer.texts_to_sequences(X_test)

#print(tokenizer.word_index)
print("Train-set:  ", (X_train[0]))
print("                                                                      ")
print("Train-tokenized-set:  ", (x_train_tokens[0]))

Train-set:   I really like The Da Vinci Code.
                                                                      
Train-tokenized-set:   [1, 32, 18, 2, 7, 6, 8]


In [101]:
#Padding and Truncating Data¶
#The Recurrent Neural Network can take sequences of arbitrary length as input

#First we count the number of tokens in all the sequences in the data-set.
num_tokens = [len(tokens) for tokens in x_train_tokens + x_test_tokens]
num_tokens = np.array(num_tokens)

print("The average number of tokens in a sequence is:  ", (np.mean(num_tokens)))
print("The maximum number of tokens in a sequence is:  ", (np.max(num_tokens)))

#The max number of tokens we will allow is set to the average plus 2 standard deviations.
max_tokens = np.mean(num_tokens) + 2 * np.std(num_tokens)
max_tokens = int(max_tokens)
print("The max number of tokens we will allow is set to the average plus 2 sd  ", (max_tokens))
print("This covers about 99% of the data-set:  ", (np.sum(num_tokens < max_tokens) / len(num_tokens)))


The average number of tokens in a sequence is:   11.072997976293726
The maximum number of tokens in a sequence is:   933
The max number of tokens we will allow is set to the average plus 2 sd   40
This covers about 99% of the data-set:   0.9985544955189362


In [66]:
#padding or truncating the sequences that have a different length, 
#we need to determine if we want to do this padding or truncating 'pre' or 'post'
pad = 'pre'
x_train_pad = pad_sequences(x_train_tokens, maxlen=max_tokens,padding=pad, truncating=pad)
x_test_pad  = pad_sequences(x_test_tokens,  maxlen=max_tokens,padding=pad, truncating=pad)

#We have now transformed the data into one big matrix of integers (tokens) with this shape:
print("The train-set is transformed into one big matrix of integers (tokens)", (x_train_pad.shape))
print("The test -set is transformed into one big matrix of integers (tokens)", (x_test_pad.shape))

#Padding result
print("                                                                      ")
print("Tokenized training data", (np.array(x_train_tokens[0])))
print("                                                                      ")
print("Padded    training data", (x_train_pad[0]))


The train-set is transformed into one big matrix of integers (tokens) (5534, 40)
The test -set is transformed into one big matrix of integers (tokens) (1384, 40)
                                                                      
Tokenized training data [ 1 32 18  2  7  6  8]
                                                                      
Padded    training data [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  1 32 18  2  7  6  8]


In [67]:
#Tokenizer Inverse Map: Converting tokenized back to original text.
idx = tokenizer.word_index
inverse_map = dict(zip(idx.values(), idx.keys()))

#Helper-function for converting a list of tokens back to a string of words.
def tokens_to_string(tokens):

    # Map from tokens back to words.
    words = [inverse_map[token] for token in tokens if token != 0]

    # Concatenate all words.
    text = " ".join(words)
    return text

print("Train-set:  ", (X_train[0]))
print("                                                                      ")
print("Tokenized text converted back to original: ", (tokens_to_string(x_train_tokens[0])))

Train-set:   I really like The Da Vinci Code.
                                                                      
Tokenized text converted back to original:  i really like the da vinci code


In [69]:
#Create the Recurrent Neural Network Model

model = Sequential()
embedding_size = 10

model.add(Embedding(input_dim=num_words,
                    output_dim=embedding_size,
                    input_length=max_tokens,
                    name='layer_embedding'))
model.add(GRU(units=16, return_sequences=True))
model.add(GRU(units=8, return_sequences=True))
model.add(GRU(units=4))
model.add(Dense(1, activation='sigmoid'))
optimizer = Adam(lr=1e-3)

tf.logging.set_verbosity(tf.logging.ERROR)

In [70]:
#Compiling RNN model

model.compile(loss='binary_crossentropy',
              optimizer=optimizer,
              metrics=['accuracy'])
model.summary()
tf.logging.set_verbosity(tf.logging.ERROR)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
layer_embedding (Embedding)  (None, 40, 10)            100000    
_________________________________________________________________
gru_4 (GRU)                  (None, None, 16)          1296      
_________________________________________________________________
gru_5 (GRU)                  (None, None, 8)           600       
_________________________________________________________________
gru_6 (GRU)                  (None, 4)                 156       
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 5         
Total params: 102,057
Trainable params: 102,057
Non-trainable params: 0
_________________________________________________________________


In [71]:
#Training the Recurrent Neural Network¶

model.fit(x_train_pad, y_train,
          validation_split=0.05, epochs=3, batch_size=64)

Train on 5257 samples, validate on 277 samples
Epoch 1/3

Epoch 2/3

Epoch 3/3



<tensorflow.python.keras._impl.keras.callbacks.History at 0x14e678db780>

In [72]:
#Performance on Test-Set¶
result = model.evaluate(x_test_pad, y_test)
print("Accuracy: {0:.2%}".format(result[1]))


Accuracy: 95.30%


In [73]:
#Predicted sentiment for the first 1000 texts in the test-set.
y_pred = model.predict(x=x_test_pad[0:1000])
y_pred = y_pred.T[0]

#These predicted numbers fall between 0.0 and 1.0.
#We use a cutoff / threshold and say that all values above 0.5 are taken to be 1.0
cls_pred = np.array([1.0 if p>0.5 else 0.0 for p in y_pred])

#The true "class" for the first 1000 texts in the test-set are needed for comparison.
cls_true = np.array(y_test[0:1000])

#We can then get indices for all the texts that were incorrectly classified by comparing all the "classes" of these two arrays.
incorrect = np.where(cls_pred != cls_true)
incorrect = incorrect[0]

#Of the 1000 texts used, how many were mis-classified?
print("Number of Mis-classified texts ", (len(incorrect)))

#Let us look at the first mis-classified text.
print("Index of first mis-classified text ", (incorrect[0]))
idx = incorrect[0]

Number of Mis-classified texts  50
Index of first mis-classified text  15


In [75]:
#Predicted and true classes for the text:

print("Predicted  label: ", (y_pred[idx]))
print("True class label: ", (cls_true[idx]))
print("                                                                      ")
print("Misclassified text: ", (X_test[idx]))

Predicted  label:  0.13240704
True class label:  1
                                                                      
Misclassified text:  Harry Potter is brilliant.


In [98]:
#Fitting our model on unlabelled text
tokens = tokenizer.texts_to_sequences(unlabeltext.ravel())
tokens_pad = pad_sequences(tokens, maxlen=max_tokens,
                           padding=pad, truncating=pad)

In [99]:
model.predict(tokens_pad)

array([[0.39652243],
       [0.0953111 ],
       [0.0953111 ],
       ...,
       [0.0953111 ],
       [0.0953111 ],
       [0.0953111 ]], dtype=float32)

In [44]:
#References:
#This is an in-class contest hosted by University of Michigan SI650 (Information Retrieval)
#https://www.kaggle.com/c/si650winter11/data
#https://github.com/Hvass-Labs/TensorFlow-Tutorials/blob/master/20_Natural_Language_Processing.ipynb