In [28]:
!pip3 install wget
import wget
wget.download("https://s3.amazonaws.com/chrisjermainebucket/text/war.txt")

100% [......................................................] 3288707 / 3288707

'war (1).txt'

In [29]:
wget.download("https://s3.amazonaws.com/chrisjermainebucket/text/william.txt")
wget.download("https://s3.amazonaws.com/chrisjermainebucket/text/Holmes.txt")

100% [........................................................] 594933 / 594933

'Holmes (1).txt'

In [7]:
import numpy as np
import tensorflow.compat.v1 as tf
import random
tf.disable_v2_behavior()

# the number of iterations to train for
numTrainingIters = 10000

# the number of hidden neurons that hold the state of the RNN
hiddenUnits = 500

# the number of classes that we are learning over
numClasses = 3

# the number of data points in a batch
batchSize = 100

# size of the sliding window
windowSize = 10

numOfChannels = 8

dataID_trained = set()

In [8]:
# this function takes a dictionary (called data) which contains 
# of (dataPointID, (classNumber, matrix)) entries.  Each matrix
# is a sequence of vectors; each vector has a one-hot-encoding of
# an ascii character, and the sequence of vectors corresponds to
# one line of text.  classNumber indicates which file the line of
# text came from.  
# 
# The argument maxSeqLen is the maximum length of a line of text
# seen so far.  fileName is the name of a file whose contents
# we want to add to data.  classNum is an indicator of the class
# we are going to associate with text from that file.  linesToUse
# tells us how many lines to sample from the file.
#
# The return val is the new maxSeqLen, as well as the new data
# dictionary with the additional lines of text added
def addToData (maxSeqLen, data, fileName, classNum, linesToUse):
    #
    # open the file and read it in
    with open(fileName) as f:
        content = f.readlines()
    #
    # sample linesToUse numbers; these will tell us what lines
    # from the text file we will use
    myInts = np.random.random_integers(0, len(content) - 1, linesToUse)
    #
    # i is the key of the next line of text to add to the dictionary
    # i:0 --> i:linesToUse
    i = len(data)
    #
    # loop thru and add the lines of text to the dictionary
    for whichLine in myInts.flat:
        #
        # get the line and ignore it if it has nothing in it
        line = content[whichLine]
        if line.isspace () or len(line) == 0:
            continue;
        #
        # take note if this is the longest line we've seen
        if len (line) > maxSeqLen:
            maxSeqLen = len (line)
        #
        # create the matrix that will hold this line
        temp = np.zeros((len(line), 256))
        #
        # j is the character we are on
        j = 0
        # 
        # loop thru the characters
        for ch in line:
            #
            # non-ascii? ignore
            if ord(ch) >= 256:
                continue
            #
            # one hot!
            temp[j][ord(ch)] = 1
            # 
            # move onto the next character
            j = j + 1
            #
        # remember the line of text
        data[i] = (classNum, temp)
        #
        # move onto the next line
        i = i + 1
    #
    # and return the dictionary with the new data
    return (maxSeqLen, data)

In [9]:
# this function takes as input a data set encoded as a dictionary
# (same encoding as the last function) and pre-pends every line of
# text with empty characters so that each line of text is exactly
# maxSeqLen characters in size
def pad (maxSeqLen, data):
   #
   # loop thru every line of text
   for i in data:
        #
        # access the matrix and the label
        temp = data[i][1]       # data[i][1] is a len(line) * 256 matrix
        label = data[i][0]
        # 
        # get the number of chatacters in this line
        len = temp.shape[0]
        #
        # and then pad so the line is the correct length
        padding = np.zeros ((maxSeqLen - len, 256)) 
        data[i] = (label, np.transpose (np.concatenate ((padding, temp), axis = 0)))
   #
   # return the new data set
   return data

In [10]:
# this generates a new batch of training data of size batchSize from the
# list of lines of text data. This version of generateData is useful for
# an RNN because the data set x is a NumPy array with dimensions
# [batchSize, 256, maxSeqLen]; it can be unstacked into a series of
# matrices containing one-hot character encodings for each data point
# using tf.unstack(inputX, axis=2)
def generateDataRNN (maxSeqLen, data):

    #
    # randomly sample batchSize lines of text
    keys = list(data.keys())
    # myInts = np.random.random_integers(0, len(data) - 1, batchSize)          # an array of length batchSize
    myInts = np.random.choice(keys, batchSize)

    #
    # stack all of the text into a matrix of one-hot characters
    x = np.stack (data[i][1] for i in myInts.flat)
    # print(x)
    #
    # and stack all of the labels into a vector of labels
    y = np.stack (np.array((data[i][0])) for i in myInts.flat)
    # print(y)
    
    #
    # return the pair
    return (x, y)

In [11]:
# this also generates a new batch of training data, but it represents
# the data as a NumPy array with dimensions [batchSize, 256 * maxSeqLen]
# where for each data point, all characters have been appended.  Useful
# for feed-forward network training
def generateDataFeedForward (maxSeqLen, data):
    #
    # randomly sample batchSize lines of text
    keys = list(data.keys())
    # myInts = np.random.random_integers(0, len(data) - 1, batchSize)
    myInts = np.random.choice(keys, batchSize)
    
    #
    # stack all of the text into a matrix of one-hot characters
    x = np.stack(data[i][1].flatten() for i in myInts.flat)
    #
    # and stack all of the labels into a vector of labels
    y = np.stack(np.array((data[i][0])) for i in myInts.flat)
    #
    # return the pair
    return (x, y)

## Task 2: Adding “Time Warping” to the RNN

In [36]:
# create the data dictionary

maxSeqLen = 0
data = {}

# load up the three data sets
(maxSeqLen, data) = addToData (maxSeqLen, data, "Holmes.txt", 0, 10000)
(maxSeqLen, data) = addToData (maxSeqLen, data, "war.txt", 1, 10000)
(maxSeqLen, data) = addToData (maxSeqLen, data, "william.txt", 2, 10000)

# pad each entry in the dictionary with empty characters as needed so
# that the sequences are all of the same length
# 裏面transpose變成 256 * maxSeqLen 的shape
data = pad (maxSeqLen, data)
allDataID = [i for i in range(len(data))]
# print(len(data))    25041
testDataID = random.sample(allDataID[0:7000], 1000) + random.sample(allDataID[10000:18000], 1000) + random.sample(allDataID[19500:], 1000)
trainDataID = list(set(allDataID) - set(testDataID))
data_train = dict()
data_test = dict()
for train_id in trainDataID:
    data_train[train_id] = data[train_id]
for test_id in testDataID:
    data_test[test_id] = data[test_id]

        
# now we build the TensorFlow computation... there are two inputs, 
# a batch of text lines and a batch of labels
inputX = tf.placeholder(tf.float32, [batchSize, 256, maxSeqLen])
inputY = tf.placeholder(tf.int32, [batchSize])

# this is the inital state of the RNN, before processing any data
initialState = tf.placeholder(tf.float32, [batchSize, hiddenUnits])

# the weight matrix that maps the inputs and hidden state to a set of values
# W is modified to include the hidden state at t-10 -> so 256 + hiddenUnits + hiddenUnits
W = tf.Variable(np.random.normal(0, 0.05, (256 + hiddenUnits + hiddenUnits, hiddenUnits)), dtype=tf.float32)

# biaes for the hidden values
b = tf.Variable(np.zeros((1, hiddenUnits)), dtype=tf.float32)

# weights and bias for the final classification
W2 = tf.Variable(np.random.normal (0, 0.05, (hiddenUnits, numClasses)),dtype=tf.float32)
b2 = tf.Variable(np.zeros((1,numClasses)), dtype=tf.float32)

# unpack the input sequences so that we have a series of matrices,
# each of which has a one-hot encoding of the current character from
# every input sequence
sequenceOfLetters = tf.unstack(inputX, axis=2)

# now we implement the forward pass
currentState = initialState
timeWarpStates = [initialState] * 10
for idx,timeTick in enumerate(sequenceOfLetters):
    # concatenate the state with the input, then compute the next state
    # Add time warpping -> append the state at t-10 as input
    inputPlusState = tf.concat([timeTick, currentState, timeWarpStates[idx%10]], 1)     # shape: batchsize * (256 + hiddenUnits + hiddenUnits)
    next_state = tf.tanh(tf.matmul(inputPlusState, W) + b)      # W shape: (256 + hiddenUnits + hiddenUnits) * hiddenUnits
    currentState = next_state
    timeWarpStates[idx%10] = next_state    # Update the previous 10 states for future time warp


# compute the set of outputs
outputs = tf.matmul(currentState, W2) + b2

predictions = tf.nn.softmax(outputs)

# compute the loss
losses = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=outputs, labels=inputY)
totalLoss = tf.reduce_mean(losses)  # take mean of losses

# use gradient descent to train
#trainingAlg = tf.train.GradientDescentOptimizer(0.02).minimize(totalLoss)
trainingAlg = tf.train.AdagradOptimizer(0.02).minimize(totalLoss)

# and train!!
with tf.Session() as sess:
    #
    # initialize everything
    sess.run(tf.global_variables_initializer())
    #
    # and run the training iters
    # _finalState = np.zeros((batchSize, hiddenUnits))
    for epoch in range(10000):
        # 
        # get some data
        x, y = generateDataRNN(maxSeqLen, data_train)

        #
        # do the training epoch
        _currentState = np.zeros((batchSize, hiddenUnits))
        _totalLoss, _trainingAlg, _currentState, _predictions, _outputs = sess.run(
                [totalLoss, trainingAlg, currentState, predictions, outputs],
                feed_dict={
                    inputX:x,
                    inputY:y,
                    initialState:_currentState
                })

        #
        # just FYI, compute the number of correct predictions
        numCorrect = 0
        for i in range (len(y)):
           maxPos = -1
           maxVal = 0.0
           for j in range (numClasses):
               if maxVal < _predictions[i][j]:
                   maxVal = _predictions[i][j]
                   maxPos = j
           if maxPos == y[i]:
               numCorrect = numCorrect + 1
        #
        # print out to the screen
        print("Step", epoch, "Loss", _totalLoss, "Correct", numCorrect, "out of", batchSize)


    
    # Testing
    _avgTestLoss = 0
    totalNumCorrect = 0
    x_test, y_test = generateDataRNN(maxSeqLen, data_test)
    _currentState = np.zeros((batchSize, hiddenUnits))
    for i in range(30):
        _testLoss, _testPredictions = sess.run([totalLoss, predictions], feed_dict = {
                            inputX:x_test,
                            inputY:y_test,
                            initialState:_currentState
                            })
        # just FYI, compute the number of correct predictions
        numCorrect = 0
        for i in range (len(y_test)):
            maxPos = -1
            maxVal = 0.0
            for j in range (numClasses):
                if maxVal < _testPredictions[i][j]:
                    maxVal = _testPredictions[i][j]
                    maxPos = j
            if maxPos == y_test[i]:
                numCorrect = numCorrect + 1
        totalNumCorrect += numCorrect
        _avgTestLoss += _testLoss
        
    _avgTestLoss /= 30
    print("Loss for 3000 randomly chosen documents is ", _avgTestLoss, " number of correct labels is ", totalNumCorrect, " out of 3000")
    


  myInts = np.random.random_integers(0, len(content) - 1, linesToUse)
  myInts = np.random.random_integers(0, len(content) - 1, linesToUse)
  myInts = np.random.random_integers(0, len(content) - 1, linesToUse)
2021-12-05 12:14:43.392784: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2021-12-05 12:14:43.392840: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)
2021-12-05 12:14:43.695296: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.
2021-12-05 12:14:43.915980: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer

Step 0 Loss 1.2628633 Correct 30 out of 100
Step 1 Loss 1.6495037 Correct 29 out of 100
Step 2 Loss 1.4240527 Correct 26 out of 100
Step 3 Loss 1.9104358 Correct 35 out of 100
Step 4 Loss 3.0538805 Correct 32 out of 100
Step 5 Loss 5.4023643 Correct 36 out of 100
Step 6 Loss 1.896705 Correct 29 out of 100
Step 7 Loss 1.3688481 Correct 36 out of 100
Step 8 Loss 2.712275 Correct 26 out of 100
Step 9 Loss 3.1859426 Correct 34 out of 100
Step 10 Loss 2.9094408 Correct 42 out of 100
Step 11 Loss 1.5881045 Correct 36 out of 100
Step 12 Loss 2.4083629 Correct 33 out of 100
Step 13 Loss 3.7269158 Correct 38 out of 100
Step 14 Loss 2.9805605 Correct 21 out of 100
Step 15 Loss 1.6653514 Correct 32 out of 100
Step 16 Loss 2.877238 Correct 29 out of 100
Step 17 Loss 3.437582 Correct 26 out of 100
Step 18 Loss 1.3295876 Correct 39 out of 100
Step 19 Loss 2.2800493 Correct 38 out of 100
Step 20 Loss 1.2402952 Correct 32 out of 100
Step 21 Loss 1.8895581 Correct 41 out of 100
Step 22 Loss 1.2401823 C

2021-12-05 12:24:55.176513: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.


Loss for 3000 randomly chosen documents is  0.3361250162124634  number of correct labels is  2730  out of 3000


## Task 3: Implementing a Feed-Forward Network
- change the code so that it no longer implements an RNN, but instead it implements a simple feed-forward network with one hidden layer
- For the RNN, our line of text representation was a matrix (a sequence of vectors, where each vector was a one-hot encoding of a character). 
- Now, our text representation will be a single vector, where each vector has all of the vectors encoding each of the characters, appended end-on-end. Note that I’ve already supplied you with a function that supplies minibatches of data in this format, so you can just use my function to create batches of training data for learning. All you need to do is to ﬁgure out how to modify the network to make use of this function.
- this feed-forward network has even higher-accuracy than the RNN with time-warping. Probably better than a human could do!

In [20]:
hiddenUnits = 1000 # Reset number of hiddenUnit to 1000 from 500(for time warp)

# create the data dictionary
maxSeqLen = 0
data = {}

# load up the three data sets
(maxSeqLen, data) = addToData (maxSeqLen, data, "Holmes.txt", 0, 10000)
(maxSeqLen, data) = addToData (maxSeqLen, data, "war.txt", 1, 10000)
(maxSeqLen, data) = addToData (maxSeqLen, data, "william.txt", 2, 10000)

# pad each entry in the dictionary with empty characters as needed so
# that the sequences are all of the same length
# 裏面transpose變成 256 * maxSeqLen 的shape
data = pad (maxSeqLen, data)
allDataID = [i for i in range(len(data))]
# print(len(data))    25041
testDataID = random.sample(allDataID[0:8000], 1000) + random.sample(allDataID[10000:18000], 1000) + random.sample(allDataID[19500:], 1000)
trainDataID = list(set(allDataID) - set(testDataID))
data_train = dict()
data_test = dict()
for train_id in trainDataID:
    data_train[train_id] = data[train_id]
for test_id in testDataID:
    data_test[test_id] = data[test_id]

        
# now we build the TensorFlow computation... there are two inputs, 
# a batch of text lines and a batch of labels
inputX = tf.placeholder(tf.float32, [batchSize, 256 * maxSeqLen])
inputY = tf.placeholder(tf.int32, [batchSize])

# this is the inital state of the RNN, before processing any data
initialState = tf.placeholder(tf.float32, [batchSize, hiddenUnits])

# the weight matrix that maps the inputs and hidden state to a set of values
W = tf.Variable(np.random.normal(0, 0.05, (256 * maxSeqLen + hiddenUnits, hiddenUnits)), dtype=tf.float32)
print(f"shape of W: {W.shape}")

# biaes for the hidden values
b = tf.Variable(np.zeros((1, hiddenUnits)), dtype=tf.float32)

# weights and bias for the final classification
W2 = tf.Variable(np.random.normal (0, 0.05, (hiddenUnits, numClasses)),dtype=tf.float32)
b2 = tf.Variable(np.zeros((1,numClasses)), dtype=tf.float32)

# unpack the input sequences so that we have a series of matrices,
# each of which has a one-hot encoding of the current character from
# every input sequence
# sequenceOfLetters = tf.unstack(inputX, axis=2)

# now we implement the forward pass
currentState = initialState
inputPlusState = tf.concat([inputX, currentState], 1)     # shape: batchsize * (256 + hiddenUnits)
print(f"shape of inputPlusState: {inputPlusState.shape}")
next_state = tf.tanh(tf.matmul(inputPlusState, W) + b)      # W shape: (256 + hiddenUnits + hiddenUnits) * hiddenUnits
currentState = next_state
# for idx,timeTick in enumerate(sequenceOfLetters):
#     # concatenate the state with the input, then compute the next state
#     # Add time warpping -> append the state at t-10 as input
#     inputPlusState = tf.concat([timeTick, currentState], 1)     # shape: batchsize * (256 + hiddenUnits + hiddenUnits)
#     next_state = tf.tanh(tf.matmul(inputPlusState, W) + b)      # W shape: (256 + hiddenUnits + hiddenUnits) * hiddenUnits
#     currentState = next_state


# compute the set of outputs
outputs = tf.matmul(currentState, W2) + b2

predictions = tf.nn.softmax(outputs)

# compute the loss
losses = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=outputs, labels=inputY)
totalLoss = tf.reduce_mean(losses)  # take mean of losses

# use gradient descent to train
#trainingAlg = tf.train.GradientDescentOptimizer(0.02).minimize(totalLoss)
trainingAlg = tf.train.AdagradOptimizer(0.02).minimize(totalLoss)

# and train!!
with tf.Session() as sess:
    #
    # initialize everything
    sess.run(tf.global_variables_initializer())
    #
    # and run the training iters
    # _finalState = np.zeros((batchSize, hiddenUnits))
    for epoch in range(1000):
        # 
        # get some data
        x, y = generateDataFeedForward(maxSeqLen, data_train)

        #
        # do the training epoch
        _currentState = np.zeros((batchSize, hiddenUnits))
        _totalLoss, _trainingAlg, _currentState, _predictions, _outputs = sess.run(
                [totalLoss, trainingAlg, currentState, predictions, outputs],
                feed_dict={
                    inputX:x,
                    inputY:y,
                    initialState:_currentState
                })

        #
        # just FYI, compute the number of correct predictions
        numCorrect = 0
        for i in range (len(y)):
           maxPos = -1
           maxVal = 0.0
           for j in range (numClasses):
               if maxVal < _predictions[i][j]:
                   maxVal = _predictions[i][j]
                   maxPos = j
           if maxPos == y[i]:
               numCorrect = numCorrect + 1
        #
        # print out to the screen
        print("Step", epoch, "Loss", _totalLoss, "Correct", numCorrect, "out of", batchSize)


    
    # Testing
    _avgTestLoss = 0
    totalNumCorrect = 0
    x_test, y_test = generateDataFeedForward(maxSeqLen, data_test)
    _currentState = np.zeros((batchSize, hiddenUnits))
    for i in range(30):
        _testLoss, _testPredictions = sess.run([totalLoss, predictions], feed_dict = {
                            inputX:x_test,
                            inputY:y_test,
                            initialState:_currentState
                            })
        # just FYI, compute the number of correct predictions
        numCorrect = 0
        for i in range (len(y_test)):
            maxPos = -1
            maxVal = 0.0
            for j in range (numClasses):
                if maxVal < _testPredictions[i][j]:
                    maxVal = _testPredictions[i][j]
                    maxPos = j
            if maxPos == y_test[i]:
                numCorrect = numCorrect + 1
        totalNumCorrect += numCorrect
        _avgTestLoss += _testLoss
        
    _avgTestLoss /= 30
    print("Loss for 3000 randomly chosen documents is ", _avgTestLoss, " number of correct labels is ", totalNumCorrect, " out of 3000")
    


  myInts = np.random.random_integers(0, len(content) - 1, linesToUse)
  myInts = np.random.random_integers(0, len(content) - 1, linesToUse)
  myInts = np.random.random_integers(0, len(content) - 1, linesToUse)


shape of W: (20968, 1000)
shape of inputPlusState: (100, 20968)


2021-12-05 14:51:38.918137: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2021-12-05 14:51:38.918165: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)
2021-12-05 14:51:39.490422: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.
2021-12-05 14:51:45.252339: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.


Step 0 Loss 1.2389629 Correct 31 out of 100
Step 1 Loss 1.1556064 Correct 41 out of 100
Step 2 Loss 1.1692396 Correct 39 out of 100
Step 3 Loss 1.2006941 Correct 35 out of 100
Step 4 Loss 1.150702 Correct 35 out of 100
Step 5 Loss 1.0554899 Correct 43 out of 100
Step 6 Loss 1.1095992 Correct 40 out of 100
Step 7 Loss 1.1090609 Correct 49 out of 100
Step 8 Loss 1.1089151 Correct 39 out of 100
Step 9 Loss 1.113307 Correct 43 out of 100
Step 10 Loss 1.0816805 Correct 47 out of 100
Step 11 Loss 1.0842494 Correct 46 out of 100
Step 12 Loss 0.9850821 Correct 53 out of 100
Step 13 Loss 0.9981682 Correct 53 out of 100
Step 14 Loss 0.8927123 Correct 65 out of 100
Step 15 Loss 0.9926849 Correct 50 out of 100
Step 16 Loss 1.0413277 Correct 46 out of 100
Step 17 Loss 0.9572999 Correct 55 out of 100
Step 18 Loss 1.0111554 Correct 47 out of 100
Step 19 Loss 0.96373504 Correct 53 out of 100
Step 20 Loss 0.89880806 Correct 54 out of 100
Step 21 Loss 0.9376332 Correct 55 out of 100
Step 22 Loss 0.94275

2021-12-05 14:52:20.509453: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.


Loss for 3000 randomly chosen documents is  0.25178956985473633  number of correct labels is  2760  out of 3000


## Task 4: Modifying the “Time Warping” RNN to Use a Convolution

 - Rather than processing one character at a time, you will process ten characters at a time. That is, consider the sentence “This is my cool string”. You will process not the sequence of characters ‘T’, ‘h’, ‘i’, etc., but instead you will process the sequence of strings “This is my”, “his is my ”, “is is my c”, “s is my co”, and so on.
 - The idea in “convolutional ﬁltering” is that you process each sequence by mapping it down to a single value (if you view the sequence of 10 characters as a 256 × 10-dimensional vector, you can map it down to a single value using a dot product with a 256 × 10-dimensional vector (this is a convolutional ﬁlter; “convolutional” refers to the sliding window). If you multiply with a matrix that contains eight, 256 × 10-dimensional vectors, you process the window with eight ﬁlers and obtains eight values. It is this resulting vector of eight values that is input into the RNN at each time tick, rather than the single character.

In [13]:
# create the data dictionary

maxSeqLen = 0
data = {}

# load up the three data sets
(maxSeqLen, data) = addToData (maxSeqLen, data, "Holmes.txt", 0, 10000)
(maxSeqLen, data) = addToData (maxSeqLen, data, "war.txt", 1, 10000)
(maxSeqLen, data) = addToData (maxSeqLen, data, "william.txt", 2, 10000)

# pad each entry in the dictionary with empty characters as needed so
# that the sequences are all of the same length
# 裏面transpose變成 256 * maxSeqLen 的shape
data = pad (maxSeqLen, data)
allDataID = [i for i in range(len(data))]
# print(len(data))    25041
testDataID = random.sample(allDataID[0:7000], 1000) + random.sample(allDataID[10000:18000], 1000) + random.sample(allDataID[19500:], 1000)
trainDataID = list(set(allDataID) - set(testDataID))
data_train = dict()
data_test = dict()
for train_id in trainDataID:
    data_train[train_id] = data[train_id]
for test_id in testDataID:
    data_test[test_id] = data[test_id]

        
# now we build the TensorFlow computation... there are two inputs, 
# a batch of text lines and a batch of labels
inputX = tf.placeholder(tf.float32, [batchSize, 256, maxSeqLen])
inputY = tf.placeholder(tf.int32, [batchSize])

# this is the inital state of the RNN, before processing any data
initialState = tf.placeholder(tf.float32, [batchSize, hiddenUnits])  # (100, 500)

# the weight matrix that maps the inputs and hidden state to a set of values
# W is modified to include the hidden state at t-10 -> so 256 + hiddenUnits + hiddenUnits
W = tf.Variable(np.random.normal(0, 0.05, (8 + hiddenUnits + hiddenUnits, hiddenUnits)), dtype=tf.float32)

# weight matrix for 8 channels of filter
W_filters = [tf.Variable(np.random.normal(0, 0.05, (256, 10)), dtype=tf.float32) for i in range(numOfChannels)]
for i in range(numOfChannels):
    W_filters[i] = tf.stack([W_filters[i]]*100)


# biaes for the hidden values
b = tf.Variable(np.zeros((1, hiddenUnits)), dtype=tf.float32)

# weights and bias for the final classification
W2 = tf.Variable(np.random.normal (0, 0.05, (hiddenUnits, numClasses)),dtype=tf.float32)
b2 = tf.Variable(np.zeros((1,numClasses)), dtype=tf.float32)

# unpack the input sequences so that we have a series of matrices,
# each of which has a one-hot encoding of the current character from
# every input sequence
sequenceOfLetters = tf.unstack(inputX, axis=2)      # sequenceOfLetters[0].shape = (100, 256)

# now we implement the forward pass
currentState = initialState
timeWarpStates = [initialState] * 10
for idx,timeTick in enumerate(sequenceOfLetters):
    if idx + 10 < len(sequenceOfLetters):
        currentWindow = sequenceOfLetters[idx:idx+10]       # [(100, 256), ..., (100, 256)]
        currentWindow = tf.stack(currentWindow, axis=2)     # shape of currentWindow: (100, 256, 10)
        dotResult = [0] * numOfChannels
        for channel in range(numOfChannels):
            c = tf.reduce_sum( tf.multiply( currentWindow, W_filters[channel]), axis=[1,2],keep_dims=False )
            dotResult[channel] = c

        dotResult = tf.stack(dotResult, axis=1)
        # concatenate the state with the input, then compute the next state
        # Add time warpping -> append the state at t-10 as input
        inputPlusState = tf.concat([dotResult, currentState, timeWarpStates[idx%10]], 1)     # shape: batchsize * (256 + hiddenUnits + hiddenUnits)
        next_state = tf.tanh(tf.matmul(inputPlusState, W) + b)      # W shape: (256 + hiddenUnits + hiddenUnits) * hiddenUnits
        currentState = next_state
        timeWarpStates[idx%10] = next_state    # Update the previous 10 states for future time warp


# compute the set of outputs
outputs = tf.matmul(currentState, W2) + b2

predictions = tf.nn.softmax(outputs)

# compute the loss
losses = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=outputs, labels=inputY)
totalLoss = tf.reduce_mean(losses)  # take mean of losses

# use gradient descent to train
#trainingAlg = tf.train.GradientDescentOptimizer(0.02).minimize(totalLoss)
trainingAlg = tf.train.AdagradOptimizer(0.02).minimize(totalLoss)

# and train!!
with tf.Session() as sess:
    #
    # initialize everything
    sess.run(tf.global_variables_initializer())
    #
    # and run the training iters
    # _finalState = np.zeros((batchSize, hiddenUnits))
    for epoch in range(10000):
        # 
        # get some data
        x, y = generateDataRNN(maxSeqLen, data_train)

        #
        # do the training epoch
        _currentState = np.zeros((batchSize, hiddenUnits))
        _totalLoss, _trainingAlg, _currentState, _predictions, _outputs = sess.run(
                [totalLoss, trainingAlg, currentState, predictions, outputs],
                feed_dict={
                    inputX:x,
                    inputY:y,
                    initialState:_currentState
                })

        #
        # just FYI, compute the number of correct predictions
        numCorrect = 0
        for i in range (len(y)):
           maxPos = -1
           maxVal = 0.0
           for j in range (numClasses):
               if maxVal < _predictions[i][j]:
                   maxVal = _predictions[i][j]
                   maxPos = j
           if maxPos == y[i]:
               numCorrect = numCorrect + 1
        #
        # print out to the screen
        print("Step", epoch, "Loss", _totalLoss, "Correct", numCorrect, "out of", batchSize)


    
    # Testing
    _avgTestLoss = 0
    totalNumCorrect = 0
    x_test, y_test = generateDataRNN(maxSeqLen, data_test)
    _currentState = np.zeros((batchSize, hiddenUnits))
    for i in range(30):
        _testLoss, _testPredictions = sess.run([totalLoss, predictions], feed_dict = {
                            inputX:x_test,
                            inputY:y_test,
                            initialState:_currentState
                            })
        # just FYI, compute the number of correct predictions
        numCorrect = 0
        for i in range (len(y_test)):
            maxPos = -1
            maxVal = 0.0
            for j in range (numClasses):
                if maxVal < _testPredictions[i][j]:
                    maxVal = _testPredictions[i][j]
                    maxPos = j
            if maxPos == y_test[i]:
                numCorrect = numCorrect + 1
        totalNumCorrect += numCorrect
        _avgTestLoss += _testLoss
        
    _avgTestLoss /= 30
    print("Loss for 3000 randomly chosen documents is ", _avgTestLoss, " number of correct labels is ", totalNumCorrect, " out of 3000")
    


  myInts = np.random.random_integers(0, len(content) - 1, linesToUse)
  myInts = np.random.random_integers(0, len(content) - 1, linesToUse)
  myInts = np.random.random_integers(0, len(content) - 1, linesToUse)
2021-12-06 20:56:23.267598: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2021-12-06 20:56:23.267625: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)
2021-12-06 20:56:23.524697: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.
2021-12-06 20:56:23.686950: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer

Step 0 Loss 1.2138585 Correct 37 out of 100
Step 1 Loss 1.392019 Correct 29 out of 100
Step 2 Loss 2.338253 Correct 40 out of 100
Step 3 Loss 1.4862965 Correct 31 out of 100
Step 4 Loss 1.9058383 Correct 34 out of 100
Step 5 Loss 5.2703404 Correct 35 out of 100
Step 6 Loss 3.7493734 Correct 41 out of 100
Step 7 Loss 2.436378 Correct 35 out of 100
Step 8 Loss 4.108468 Correct 29 out of 100
Step 9 Loss 2.5080469 Correct 37 out of 100
Step 10 Loss 3.4213953 Correct 32 out of 100
Step 11 Loss 2.4537735 Correct 29 out of 100
Step 12 Loss 1.8301388 Correct 23 out of 100
Step 13 Loss 1.8656504 Correct 44 out of 100
Step 14 Loss 1.1835592 Correct 41 out of 100
Step 15 Loss 1.1118584 Correct 34 out of 100
Step 16 Loss 1.2389128 Correct 39 out of 100
Step 17 Loss 1.4700769 Correct 29 out of 100
Step 18 Loss 1.2581239 Correct 42 out of 100
Step 19 Loss 1.1120366 Correct 39 out of 100
Step 20 Loss 1.1023881 Correct 39 out of 100
Step 21 Loss 1.2419515 Correct 38 out of 100
Step 22 Loss 1.3946911 C

2021-12-06 21:33:12.865154: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.


Loss for 3000 randomly chosen documents is  0.3585411608219147  number of correct labels is  2700  out of 3000
