In [1]:
import matplotlib.pyplot as plt
import tensorflow as tf
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

  from ._conv import register_converters as _register_converters


In [2]:
from scipy.spatial.distance import cdist

# from tf.keras.models import Sequential  # This does not work!
from tensorflow.python.keras.models import Sequential
from tensorflow.python.keras.layers import Dense, GRU, Embedding
from tensorflow.python.keras.optimizers import Adam
from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences

In [3]:
df = pd.read_csv("training.txt",sep="	", header=None)
unlabeltext  = pd.read_fwf('testdata.txt')

#Inputs and Output
X = np.array(df[1])
y = np.array(df[0])
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = .20,random_state = 42)

unlabeltext.fillna('None',inplace=True)
#unlabeltext.replace('None', np.nan, inplace=True)
unlabeltext  = np.array(unlabeltext)

In [4]:
#cleaning data
#Tokenzier
num_words = 10000

tokenizer = Tokenizer(num_words=num_words)
tokenizer.fit_on_texts(X)
x_train_tokens = tokenizer.texts_to_sequences(X_train)
x_test_tokens  = tokenizer.texts_to_sequences(X_test)

#print(tokenizer.word_index)
print("Train-set:  ", (X_train[0]))
print("                                                                      ")
print("Train-tokenized-set:  ", (x_train_tokens[0]))

Train-set:   I really like The Da Vinci Code.
                                                                      
Train-tokenized-set:   [1, 32, 18, 2, 7, 6, 8]


In [5]:
#Padding and Truncating Data¶
#The Recurrent Neural Network can take sequences of arbitrary length as input

#First we count the number of tokens in all the sequences in the data-set.
num_tokens = [len(tokens) for tokens in x_train_tokens + x_test_tokens]
num_tokens = np.array(num_tokens)

print("The average number of tokens in a sequence is:  ", (np.mean(num_tokens)))
print("The maximum number of tokens in a sequence is:  ", (np.max(num_tokens)))

#The max number of tokens we will allow is set to the average plus 2 standard deviations.
max_tokens = np.mean(num_tokens) + 2 * np.std(num_tokens)
max_tokens = int(max_tokens)
print("The max number of tokens we will allow is set to the average plus 2 sd  ", (max_tokens))
print("This covers about 99% of the data-set:  ", (np.sum(num_tokens < max_tokens) / len(num_tokens)))


The average number of tokens in a sequence is:   11.072997976293726
The maximum number of tokens in a sequence is:   933
The max number of tokens we will allow is set to the average plus 2 sd   40
This covers about 99% of the data-set:   0.9985544955189362


In [6]:
#padding or truncating the sequences that have a different length, 
#we need to determine if we want to do this padding or truncating 'pre' or 'post'
pad = 'pre'
x_train_pad = pad_sequences(x_train_tokens, maxlen=max_tokens,padding=pad, truncating=pad)
x_test_pad  = pad_sequences(x_test_tokens,  maxlen=max_tokens,padding=pad, truncating=pad)

#We have now transformed the data into one big matrix of integers (tokens) with this shape:
print("The train-set is transformed into one big matrix of integers (tokens)", (x_train_pad.shape))
print("The test -set is transformed into one big matrix of integers (tokens)", (x_test_pad.shape))

#Padding result
print("                                                                      ")
print("Tokenized training data", (np.array(x_train_tokens[0])))
print("                                                                      ")
print("Padded    training data", (x_train_pad[0]))


The train-set is transformed into one big matrix of integers (tokens) (5534, 40)
The test -set is transformed into one big matrix of integers (tokens) (1384, 40)
                                                                      
Tokenized training data [ 1 32 18  2  7  6  8]
                                                                      
Padded    training data [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  1 32 18  2  7  6  8]


In [11]:
#Tokenizer Inverse Map: Converting tokenized back to original text.
word_index = tokenizer.word_index
inverse_map = dict(zip(word_index.values(), word_index.keys()))

#Helper-function for converting a list of tokens back to a string of words.
def tokens_to_string(tokens):

    # Map from tokens back to words.
    words = [inverse_map[token] for token in tokens if token != 0]

    # Concatenate all words.
    text = " ".join(words)
    return text

print("Train-set:  ", (X_train[0]))
print("                                                                      ")
print("Tokenized text converted back to original: ", (tokens_to_string(x_train_tokens[0])))

Train-set:   I really like The Da Vinci Code.
                                                                      
Tokenized text converted back to original:  i really like the da vinci code


In [None]:
#Creating mini-batches

def get_batches(x, y, batch_size):

    '''Create the batches for the training and validation data'''
    n_batches = len(x)//batch_size
    x, y = x[:n_batches*batch_size], y[:n_batches*batch_size]
    
    for ii in range(0, len(x), batch_size):
        yield x[ii:ii+batch_size], y[ii:ii+batch_size]

def get_test_batches(x, batch_size):

    '''Create the batches for the testing data'''
    n_batches = len(x)//batch_size
    x = x[:n_batches*batch_size]

    for ii in range(0, len(x), batch_size):
        yield x[ii:ii+batch_size]

In [48]:
def multilinear_regression():
    
    n_words = len(word_index)
    embed_size = 300
    lstm_size  = 128
    num_layers = 2
    fc_units  = 256
    keep_prob = 1

    #x = tf.placeholder(tf.float32,[None,n],name='x')
    #y = tf.placeholder(tf.float32,[None,n_classes],name='y')
    
    x = tf.placeholder(tf.int32, [None, None], name='x')
    y = tf.placeholder(tf.int32, [None, None], name='y')
    keep_prob = tf.placeholder(tf.float32, name='keep_prob')
    #weights={'w1':tf.Variable(tf.ones((n_input,n_hidden)))}
    #biases={'b1':tf.Variable(tf.zeros((n_hidden)))}
    
    weights = tf.truncated_normal_initializer(stddev=0.1)
    biases  = tf.zeros_initializer()
        
    embedding = tf.Variable(tf.random_uniform((n_words,embed_size), -1, 1))
    embed = tf.nn.embedding_lookup(embedding, x)
    
    lstm = tf.contrib.rnn.BasicLSTMCell(lstm_size)
    drop = tf.contrib.rnn.DropoutWrapper(lstm,output_keep_prob=keep_prob)
    cell = tf.contrib.rnn.MultiRNNCell([drop] * num_layers)
    cell_wrapped = tf.contrib.rnn.OutputProjectionWrapper(cell, output_size=1)
    
    outputs, final_state = tf.nn.dynamic_rnn(cell_wrapped,embed,dtype=tf.float32)

    dense = tf.contrib.layers.fully_connected(outputs[-1],
                    num_outputs = fc_units,
                    activation_fn = tf.sigmoid,
                    weights_initializer = weights,
                    biases_initializer = biases)

    dense = tf.contrib.layers.dropout(dense, keep_prob)
    
    predictions = tf.contrib.layers.fully_connected(dense, 
                          num_outputs = 1, 
                          activation_fn = tf.sigmoid,
                          weights_initializer = weights,
                          biases_initializer = biases)
    
    cost = tf.losses.mean_squared_error(y, predictions)
    optimizer = tf.train.AdamOptimizer(learning_rate).minimize(cost)
        
    return cost,weights,biases,x,y,predictions,optimizer

In [57]:
def run(epochs):
    
    loss,w,b,x,y,y_pred,optimizer = multilinear_regression()
    cost_hist = np.empty(shape=[1],dtype = float)

    with tf.Session() as session:
        
        init = tf.global_variables_initializer()
        session.run(init)
    
        feed_dict = {x:X_train,y:y_train}
        for i in range(epochs):
            session.run(optimizer,feed_dict)
            
            cost = session.run(loss,feed_dict)
            cost_hist = np.append(cost_hist,cost)
            
            #state = sess.run(model.initial_state)
            #feed_dict = {model.inputs: X_train,
             #               model.labels: y_train[:, None],
              #              model.keep_prob: dropout,
               #             model.initial_state: state}

            #y_pred_test = session.run(y_pred,{x:X_test})
            #accuracy = np.sum(np.square(y_pred_test - y_test)) / (2 * len(y_test))

        thetas = session.run(w)
        bias   = session.run(b)

    return thetas,cost_hist,bias

In [58]:
if __name__=='__main__':
    epochs = 10
    thetas,cost,bias=run(epochs) 
    print(thetas)
    print(bias)

ValueError: Variable rnn/output_projection_wrapper/multi_rnn_cell/cell_0/basic_lstm_cell/kernel already exists, disallowed. Did you mean to set reuse=True or reuse=tf.AUTO_REUSE in VarScope? Originally defined at:

  File "c:\users\harpreet singh\appdata\local\programs\python\python36-64\lib\site-packages\tensorflow\python\framework\ops.py", line 1625, in __init__
    self._traceback = self._graph._extract_stack()  # pylint: disable=protected-access
  File "c:\users\harpreet singh\appdata\local\programs\python\python36-64\lib\site-packages\tensorflow\python\framework\ops.py", line 3160, in create_op
    op_def=op_def)
  File "c:\users\harpreet singh\appdata\local\programs\python\python36-64\lib\site-packages\tensorflow\python\framework\op_def_library.py", line 787, in _apply_op_helper
    op_def=op_def)


In [62]:
def build_rnn(n_words, embed_size, batch_size, lstm_size,
              num_layers, dropout, learning_rate, multiple_fc, fc_units):
    
    inputs = tf.placeholder(tf.int32, [None, None], name='inputs')
    labels = tf.placeholder(tf.int32, [None, None], name='labels')
    keep_prob = tf.placeholder(tf.float32, name='keep_prob')
    #for dropout
        #embedding
    with tf.variable_scope('Embedding') as scope:
        embedding = tf.Variable(tf.random_uniform((n_words,embed_size), -1, 1))
        embed = tf.nn.embedding_lookup(embedding, inputs)
        
    # Build the RNN layers
    with tf.name_scope("RNN_layers"):
        lstm = tf.contrib.rnn.BasicLSTMCell(lstm_size)
        drop = tf.contrib.rnn.DropoutWrapper(lstm,output_keep_prob=keep_prob)
        cell = tf.contrib.rnn.MultiRNNCell([drop] * num_layers)

    # Set the initial state
        initial_state = cell.zero_state(batch_size, tf.float32)
        
     # Run the data through the RNN layers
        outputs, final_state = tf.nn.dynamic_rnn(cell,embed,initial_state=initial_state)
        
    with tf.name_scope("fully_connected"):
     # Initialize the weights and biases
        weights = tf.truncated_normal_initializer(stddev=0.1)
        biases = tf.zeros_initializer()

        dense = tf.contrib.layers.fully_connected(outputs[:, -1],
                    num_outputs = fc_units,
                    activation_fn = tf.sigmoid,
                    weights_initializer = weights,
                    biases_initializer = biases)

        dense = tf.contrib.layers.dropout(dense, keep_prob)
        # Depending on the iteration, use a second fully connected layer

        if multiple_fc == True:

            dense = tf.contrib.layers.fully_connected(dense,
                        num_outputs = fc_units,
                        activation_fn = tf.sigmoid,
                        weights_initializer = weights,
                        biases_initializer = biases)
            
            dense = tf.contrib.layers.dropout(dense, keep_prob)
            
    # Make the predictions
    with tf.name_scope('predictions'):

        predictions = tf.contrib.layers.fully_connected(dense, 
                          num_outputs = 1, 
                          activation_fn=tf.sigmoid,
                          weights_initializer = weights,
                          biases_initializer = biases)

        #tf.summary.histogram('predictions', predictions)

    # Calculate the cost
    with tf.name_scope('cost'):
        cost = tf.losses.mean_squared_error(labels, predictions)
        tf.summary.scalar('cost', cost)
        
    #return inputs,labels,keep_prob,embedding,weights,biases,predictions,cost

In [60]:
def run(model,epochs):
    
    #inputs,labels,keep_prob,embedding,weights,biases,predictions,cost=build_rnn(n_words, embed_size, batch_size, lstm_size,num_layers, dropout, learning_rate, multiple_fc, fc_units)
    # Train the model
    #optimizer = tf.train.AdamOptimizer(learning_rate).minimize(cost)
    
    LOSS = np.zeros(epochs)

    with tf.Session() as sess:
        
        init = tf.global_variables_initializer()
        sess.run(init)
    
        #feed_dict = {inputs:X_train,labels:y_train}

        
        for i in range(epochs):
            state = sess.run(model.initial_state)
            feed_dict = {model.inputs: X_train,
                            model.labels: y_train[:, None],
                            model.keep_prob: dropout,
                            model.initial_state: state}

            loss_val,i = session.run([cost,optimizer],feed_dict)
            #print('loss:',loss_val.mean())  
            LOSS[i]=loss_val.mean()
        #y_pred_batch = session.run(y_pred,{x:X_train})
        
        #thetas = session.run(w)
    return LOSS

In [61]:
# The default parameters of the model

n_words = len(word_index)
embed_size = 300
batch_size = 250
lstm_size  = 128
num_layers = 2
dropout = 0.5
learning_rate = 0.001
epochs = 100
multiple_fc = False
fc_units  = 256
keep_prob = 1

model=build_rnn(n_words, embed_size, batch_size, lstm_size,num_layers, dropout, learning_rate, multiple_fc, fc_units)

if __name__=='__main__':
    epochs = 1000
    cost=run(model,epochs) 
    print('Estimated parameters: ',thetas)
    fig, ax = plt.subplots(figsize=(12,8))  
    ax.plot(np.arange(epochs), cost, 'r')  
    ax.set_xlabel('Iterations')  
    ax.set_ylabel('Cost')  
    ax.set_title('Error vs. Training Epoch') 

ValueError: Variable rnn/multi_rnn_cell/cell_0/basic_lstm_cell/kernel already exists, disallowed. Did you mean to set reuse=True or reuse=tf.AUTO_REUSE in VarScope? Originally defined at:

  File "c:\users\harpreet singh\appdata\local\programs\python\python36-64\lib\site-packages\tensorflow\python\framework\ops.py", line 1625, in __init__
    self._traceback = self._graph._extract_stack()  # pylint: disable=protected-access
  File "c:\users\harpreet singh\appdata\local\programs\python\python36-64\lib\site-packages\tensorflow\python\framework\ops.py", line 3160, in create_op
    op_def=op_def)
  File "c:\users\harpreet singh\appdata\local\programs\python\python36-64\lib\site-packages\tensorflow\python\framework\op_def_library.py", line 787, in _apply_op_helper
    op_def=op_def)


In [72]:
#Performance on Test-Set¶
result = model.evaluate(x_test_pad, y_test)
print("Accuracy: {0:.2%}".format(result[1]))


Accuracy: 95.30%


In [73]:
#Predicted sentiment for the first 1000 texts in the test-set.
y_pred = model.predict(x=x_test_pad[0:1000])
y_pred = y_pred.T[0]

#These predicted numbers fall between 0.0 and 1.0.
#We use a cutoff / threshold and say that all values above 0.5 are taken to be 1.0
cls_pred = np.array([1.0 if p>0.5 else 0.0 for p in y_pred])

#The true "class" for the first 1000 texts in the test-set are needed for comparison.
cls_true = np.array(y_test[0:1000])

#We can then get indices for all the texts that were incorrectly classified by comparing all the "classes" of these two arrays.
incorrect = np.where(cls_pred != cls_true)
incorrect = incorrect[0]

#Of the 1000 texts used, how many were mis-classified?
print("Number of Mis-classified texts ", (len(incorrect)))

#Let us look at the first mis-classified text.
print("Index of first mis-classified text ", (incorrect[0]))
idx = incorrect[0]

Number of Mis-classified texts  50
Index of first mis-classified text  15


In [75]:
#Predicted and true classes for the text:

print("Predicted  label: ", (y_pred[idx]))
print("True class label: ", (cls_true[idx]))
print("                                                                      ")
print("Misclassified text: ", (X_test[idx]))

Predicted  label:  0.13240704
True class label:  1
                                                                      
Misclassified text:  Harry Potter is brilliant.


In [98]:
#Fitting our model on unlabelled text
tokens = tokenizer.texts_to_sequences(unlabeltext.ravel())
tokens_pad = pad_sequences(tokens, maxlen=max_tokens,
                           padding=pad, truncating=pad)

In [99]:
model.predict(tokens_pad)

array([[0.39652243],
       [0.0953111 ],
       [0.0953111 ],
       ...,
       [0.0953111 ],
       [0.0953111 ],
       [0.0953111 ]], dtype=float32)

In [44]:
#References:
#This is an in-class contest hosted by University of Michigan SI650 (Information Retrieval)
#https://www.kaggle.com/c/si650winter11/data
#https://github.com/Hvass-Labs/TensorFlow-Tutorials/blob/master/20_Natural_Language_Processing.ipynb