In [3]:
import matplotlib.pyplot as plt
import tensorflow as tf
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

  from ._conv import register_converters as _register_converters


In [4]:
from scipy.spatial.distance import cdist

# from tf.keras.models import Sequential  # This does not work!
from tensorflow.python.keras.models import Sequential
from tensorflow.python.keras.layers import Dense, GRU, Embedding
from tensorflow.python.keras.optimizers import Adam
from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences

In [5]:
df = pd.read_csv("training.txt",sep="	", header=None)
unlabeltext  = pd.read_fwf('testdata.txt')

#Inputs and Output labels
X = np.array(df[1])
y = np.array(df[0])
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = .20,random_state = 42)

unlabeltext.fillna('None',inplace=True)
#unlabeltext.replace('None', np.nan, inplace=True)
unlabeltext  = np.array(unlabeltext)

print(X)

['The Da Vinci Code book is just awesome.'
 "this was the first clive cussler i've ever read, but even books like Relic, and Da Vinci code were more plausible than this."
 'i liked the Da Vinci Code a lot.' ...
 'As I sit here, watching the MTV Movie Awards, I am reminded of how much I despised the movie Brokeback Mountain.'
 'Ok brokeback mountain is such a horrible movie.'
 'Oh, and Brokeback Mountain was a terrible movie.']


In [6]:
#cleaning 
#and Tokenzing the data

num_words = 10000

tokenizer = Tokenizer(num_words=num_words)
tokenizer.fit_on_texts(X)
x_train_tokens = tokenizer.texts_to_sequences(X_train)
x_test_tokens  = tokenizer.texts_to_sequences(X_test)

#print(tokenizer.word_index)
print("Train-set:  ", (X_train[0]))
print("                                                                      ")
print("Train-tokenized-set:  ", (x_train_tokens[0]))

Train-set:   I really like The Da Vinci Code.
                                                                      
Train-tokenized-set:   [1, 32, 18, 2, 7, 6, 8]


In [7]:
#Padding and Truncating Data¶
#The Recurrent Neural Network can take sequences of arbitrary length as input

#First we count the number of tokens in all the sequences in the data-set.
num_tokens = [len(tokens) for tokens in x_train_tokens + x_test_tokens]
num_tokens = np.array(num_tokens)

print("The average number of tokens in a sequence is:  ", (np.mean(num_tokens)))
print("The maximum number of tokens in a sequence is:  ", (np.max(num_tokens)))

#The max number of tokens we will allow is set to the average plus 2 standard deviations.
max_tokens = np.mean(num_tokens) + 2 * np.std(num_tokens)
max_tokens = int(max_tokens)
print("The max number of tokens we will allow is set to the average plus 2 sd  ", (max_tokens))
print("This covers about 99% of the data-set:  ", (np.sum(num_tokens < max_tokens) / len(num_tokens)))

The average number of tokens in a sequence is:   11.072997976293726
The maximum number of tokens in a sequence is:   933
The max number of tokens we will allow is set to the average plus 2 sd   40
This covers about 99% of the data-set:   0.9985544955189362


In [8]:
#padding or truncating the sequences that have a different length, 
#we need to determine if we want to do this padding or truncating 'pre' or 'post'
pad = 'pre'
x_train_pad = pad_sequences(x_train_tokens, maxlen=max_tokens,padding=pad, truncating=pad)
x_test_pad  = pad_sequences(x_test_tokens,  maxlen=max_tokens,padding=pad, truncating=pad)

#We have now transformed the data into one big matrix of integers (tokens) with this shape:
print("The train-set is transformed into one big matrix of integers (tokens)", (x_train_pad.shape))
print("The test -set is transformed into one big matrix of integers (tokens)", (x_test_pad.shape))

#Padding result
print("                                                                      ")
print("Tokenized training data", (np.array(x_train_tokens[0])))
print("                                                                      ")
print("Padded    training data", (x_train_pad[0]))

The train-set is transformed into one big matrix of integers (tokens) (5534, 40)
The test -set is transformed into one big matrix of integers (tokens) (1384, 40)
                                                                      
Tokenized training data [ 1 32 18  2  7  6  8]
                                                                      
Padded    training data [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  1 32 18  2  7  6  8]


In [9]:
#Creating mini-batches

def get_batches(x, y, batch_size):

    '''Create the batches for the training and validation data'''
    n_batches = len(x)//batch_size
    x, y = x[:n_batches*batch_size], y[:n_batches*batch_size]
    
    for ii in range(0, len(x), batch_size):
        yield x[ii:ii+batch_size], y[ii:ii+batch_size]

def get_test_batches(x, batch_size):

    '''Create the batches for the testing data'''
    n_batches = len(x)//batch_size
    x = x[:n_batches*batch_size]

    for ii in range(0, len(x), batch_size):
        yield x[ii:ii+batch_size]

In [10]:
def model():
    
    n_words = 10000
    embed_size = 300
    lstm_size  = 128
    num_layers = 2
    fc_units  = 256
    keep_prob = 1
    learning_rate = 0.01
    batch_size=100

    x = tf.placeholder(tf.int32, [None, None], name='x')
    y = tf.placeholder(tf.int32, [None, None], name='y')
    keep_prob = tf.placeholder(tf.float32, name='keep_prob')
    
    weights = tf.truncated_normal_initializer(stddev=0.1)
    biases  = tf.zeros_initializer()
        
    embedding = tf.Variable(tf.random_uniform((n_words,embed_size), -1, 1))
    embed = tf.nn.embedding_lookup(embedding, x)
    
    #lstm = tf.contrib.rnn.BasicLSTMCell(lstm_size,forget_bias=1.0)
    #drop = tf.contrib.rnn.DropoutWrapper(lstm,output_keep_prob=keep_prob)
    #cell = tf.contrib.rnn.MultiRNNCell([drop] * num_layers)
    #cell_wrapped = tf.contrib.rnn.OutputProjectionWrapper(cell, output_size=1)

    def lstm():
        lstm = tf.contrib.rnn.BasicLSTMCell(lstm_size)
        return tf.contrib.rnn.DropoutWrapper(lstm,output_keep_prob=keep_prob)
    
    cell = tf.contrib.rnn.MultiRNNCell([lstm() for _ in range(num_layers)])
    
    initial_state = cell.zero_state(batch_size,tf.float32)
    
    outputs, final_state = tf.nn.dynamic_rnn(cell,embed,initial_state= initial_state)

    dense = tf.contrib.layers.fully_connected(outputs[:,-1],
                    num_outputs = fc_units,
                    activation_fn = tf.sigmoid,
                    weights_initializer = weights,
                    biases_initializer = biases)

    dense = tf.contrib.layers.dropout(dense, keep_prob)
    
    predictions = tf.contrib.layers.fully_connected(dense, 
                          num_outputs = 1, 
                          activation_fn = tf.sigmoid,
                          weights_initializer = weights,
                          biases_initializer = biases)
    #predictions = tf.contrib.layers.fully_connected(outputs[:, -1], 1, activation_fn=tf.sigmoid)

    
    cost = tf.losses.mean_squared_error(y, predictions)
    optimizer = tf.train.AdamOptimizer(learning_rate).minimize(cost)
        
    return cost,weights,biases,x,y,predictions,optimizer

In [13]:
def run(epochs):
    
    loss,w,b,x,y,y_pred,optimizer = model()
    cost_hist = np.empty(shape=[1],dtype = float)

    with tf.Session() as session:
        
        init = tf.global_variables_initializer()
        session.run(init)
                
        for i in range(epochs):
            #feed = {x:x_train_pad,y:y_train}

            for ii, (x, y) in enumerate(get_batches(x_train_pad, y_train, 100), 1):
                feed = {x: x,
                    y: y[:, None]}
                
            session.run(optimizer,feed)
            
            cost = session.run(loss,feed_dict)
            cost_hist = np.append(cost_hist,cost)

            #y_pred_test = session.run(predictions,{x:X_test})
            #accuracy = np.sum(np.square(y_pred_test - y_test)) / (2 * len(y_test))

        thetas = session.run(weights)
        bias   = session.run(biases)

    return thetas,cost_hist,bias

In [14]:
if __name__=='__main__':
    epochs = 10
    thetas,cost,bias=run(epochs) 

ValueError: Variable rnn/multi_rnn_cell/cell_0/basic_lstm_cell/kernel already exists, disallowed. Did you mean to set reuse=True or reuse=tf.AUTO_REUSE in VarScope? Originally defined at:

  File "c:\users\harpreet singh\appdata\local\programs\python\python36-64\lib\site-packages\tensorflow\python\framework\ops.py", line 1625, in __init__
    self._traceback = self._graph._extract_stack()  # pylint: disable=protected-access
  File "c:\users\harpreet singh\appdata\local\programs\python\python36-64\lib\site-packages\tensorflow\python\framework\ops.py", line 3160, in create_op
    op_def=op_def)
  File "c:\users\harpreet singh\appdata\local\programs\python\python36-64\lib\site-packages\tensorflow\python\framework\op_def_library.py", line 787, in _apply_op_helper
    op_def=op_def)
