In [1]:
import numpy as np
import tensorflow as tf


In [2]:
with open ("F:/ML/LSTM_Next_word_Prediction/RNN.txt" , 'r', encoding='utf-8') as file:
    rnn=file.read()


In [3]:
filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'

In [4]:
tokenizer=tf.keras.preprocessing.text.Tokenizer(filters=filters)
#A TensorFlow tokenizer is a tool used to split text into smaller units called tokens
tokenizer.fit_on_texts([rnn])

In [5]:
tokenizer.get_config() # Shows word counts, indices, etc.

{'num_words': None,
 'filters': '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
 'lower': True,
 'split': ' ',
 'char_level': False,
 'oov_token': None,
 'document_count': 1,
 'word_counts': '{"recurrent": 2, "neural": 2, "networks": 1, "rnns": 2, "are": 2, "a": 5, "type": 1, "of": 5, "network": 4, "designed": 1, "to": 8, "process": 1, "sequential": 1, "data": 2, "such": 1, "as": 4, "text": 1, "or": 1, "time": 2, "series": 1, "information": 4, "in": 4, "an": 1, "rnn": 2, "each": 1, "output": 1, "depends": 1, "not": 1, "just": 1, "on": 3, "the": 11, "current": 1, "input": 1, "but": 1, "also": 1, "previous": 1, "hidden": 1, "state": 1, "which": 1, "allows": 1, "retain": 3, "some": 1, "memory": 3, "earlier": 3, "inputs": 3, "however": 1, "standard": 1, "suffer": 1, "from": 4, "limitation": 2, "known": 1, "short": 2, "term": 4, "this": 4, "means": 1, "that": 1, "while": 1, "they": 3, "can": 1, "remember": 1, "recent": 1, "reasonably": 1, "well": 1, "struggle": 1, "much": 1, "sequence": 2, "becomes

In [6]:
tokenizer.word_index # Returns dictionary of words & their indices

{'the': 1,
 'to': 2,
 'a': 3,
 'of': 4,
 'network': 5,
 'as': 6,
 'information': 7,
 'in': 8,
 'from': 9,
 'term': 10,
 'this': 11,
 'on': 12,
 'retain': 13,
 'memory': 14,
 'earlier': 15,
 'inputs': 16,
 'they': 17,
 'long': 18,
 'and': 19,
 'recurrent': 20,
 'neural': 21,
 'rnns': 22,
 'are': 23,
 'data': 24,
 'time': 25,
 'rnn': 26,
 'limitation': 27,
 'short': 28,
 'sequence': 29,
 'problem': 30,
 'architectures': 31,
 'networks': 32,
 'type': 33,
 'designed': 34,
 'process': 35,
 'sequential': 36,
 'such': 37,
 'text': 38,
 'or': 39,
 'series': 40,
 'an': 41,
 'each': 42,
 'output': 43,
 'depends': 44,
 'not': 45,
 'just': 46,
 'current': 47,
 'input': 48,
 'but': 49,
 'also': 50,
 'previous': 51,
 'hidden': 52,
 'state': 53,
 'which': 54,
 'allows': 55,
 'some': 56,
 'however': 57,
 'standard': 58,
 'suffer': 59,
 'known': 60,
 'means': 61,
 'that': 62,
 'while': 63,
 'can': 64,
 'remember': 65,
 'recent': 66,
 'reasonably': 67,
 'well': 68,
 'struggle': 69,
 'much': 70,
 'become

In [7]:
input_sequences=[]
for sentence in rnn.split('\n'):   # Split text by lines
    #print(sentence)
    token_list=tokenizer.texts_to_sequences([sentence])[0] # Convert words to numbers
    #print(token_list)
    for i in range (1, len(token_list)):
        sequence=token_list[:i+1]    # Slice sequence (e.g., [1, 2], [1, 2, 3], ...)
        input_sequences.append(sequence) 

In [8]:
input_sequences[:5] 

[[20, 21],
 [20, 21, 32],
 [20, 21, 32, 22],
 [20, 21, 32, 22, 23],
 [20, 21, 32, 22, 23, 3]]

In [9]:
max_length=max([len(input_sequences) for input_sequences in input_sequences])
max_length

33

In [10]:
#Add padding to make all in same length
input_sequences=np.array(tf.keras.preprocessing.sequence.pad_sequences(input_sequences,maxlen=max_length, padding='pre'))


In [11]:
input_sequences[1]

array([ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 20, 21, 32])

In [12]:
# distinguish the fetures and labels
x=input_sequences[:,:-1]
y=input_sequences[:,-1]

In [13]:
x[0]

array([ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 20])

In [14]:
y

array([ 21,  32,  22,  23,   3,  33,   4,  21,   5,  34,   2,  35,  36,
        24,  37,   6,  38,  39,  25,  40,   7,  41,  26,  42,  43,  44,
        45,  46,  12,   1,  47,  48,  49,  50,  12,   1,  51,  52,  53,
        54,  55,   1,   5,   2,  13,  56,  14,   4,  15,  16,  58,  22,
        59,   9,   3,  27,  60,   6,  28,  10,  14,  61,  62,  63,  17,
        64,  65,  66,  16,  67,  68,  17,  69,   2,  13,   7,   9,  70,
        15,   8,   1,  29,  27,  71,   3,  30,  72,  18,  10,  73,  23,
        74,  75,  76,   1,  77,   4,   3,  78,   2,  79,  80,   1,  81,
        82,  83,   4,  11,  84,  85,   8,   1,  86,  87,  30,  88,  89,
        90,   8,  91,  92,  93,  94,   6,  17,  95,  96,  97,  98,  25,
        99, 100, 101, 102, 103,   9, 104,   3, 105,   7,   9,  15,  16,
       106, 107,  19,   1,   5, 108,   2, 109,  18,  10, 110, 111,  11,
       112,  26,  31, 113,  18,  28,  10,  14, 114,  19, 115,  20, 116,
       117, 118, 119,  31, 121, 122, 123,   2, 124,   1, 125,   

In [15]:
no_of_classes=len(tokenizer.word_index)+1
no_of_classes

137

In [16]:
y=np.array(tf.keras.utils.to_categorical(y,num_classes=no_of_classes))
y[0]

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0.])

In [17]:
model = tf.keras.models.Sequential([
    tf.keras.layers.Embedding(input_dim=no_of_classes,  
                             output_dim=80, 
                             input_length=max_length-1),
    tf.keras.layers.LSTM(100),
    tf.keras.layers.Dense(no_of_classes, activation='softmax')
])

model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

# To properly build the model, we can either:
# 1. Call model.build() with input shape
model.build(input_shape=(None, max_length-1))




In [18]:
model.summary()

In [19]:
history=model.fit(x,y,epochs=100)

Epoch 1/100
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 12ms/step - accuracy: 0.0088 - loss: 4.9198
Epoch 2/100
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - accuracy: 0.0688 - loss: 4.9048
Epoch 3/100
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - accuracy: 0.0485 - loss: 4.8851
Epoch 4/100
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - accuracy: 0.0592 - loss: 4.8063
Epoch 5/100
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - accuracy: 0.0426 - loss: 4.7118
Epoch 6/100
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - accuracy: 0.0806 - loss: 4.6180
Epoch 7/100
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - accuracy: 0.0580 - loss: 4.5358
Epoch 8/100
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - accuracy: 0.0725 - loss: 4.4584
Epoch 9/100
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[3

In [20]:
input_text="RNN and"
token_list=tokenizer.texts_to_sequences([input_text])[0]
print(token_list)

[26, 19]


In [21]:
token_list=tf.keras.preprocessing.sequence.pad_sequences(input_sequences,maxlen=max_length-1, padding='pre')
print(token_list)


[[  0   0   0 ...   0  20  21]
 [  0   0   0 ...  20  21  32]
 [  0   0   0 ...  21  32  22]
 ...
 [  0   0   0 ...  12 134  29]
 [  0   0   0 ... 134  29 135]
 [  0   0   0 ...  29 135 136]]


In [22]:
predicted=np.argmax(model.predict(token_list), axis=-1)
print(predicted)

[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
[ 32  22  23   3  33   4  21   5  34   2  35  36  24  37   6  38  39  25
  40   7   7  26  42  43  44  45  46  12   1  47  48  49  50  12   1  51
  52  53  54  55   1   5   2  13  56  14   4  15  16  16  22  59   9   3
  27  60   6  28  10  14  14  62  63  17  64  65  66  16  67  68  17  69
   2  13   7   9  70  15   8   1  29  29   3   3  30  72  18  10  73  23
  74  75  76   1  77   4   3  78   2  79  80   1  81  82  82   4  11  84
  85   8   1  86  87  30  88  89  90   8  91  92  93  94   6  17  95  96
  97  98  25  99 100 101 102 103   9 104 104 105   7   9  15  16 106 107
  19   1   5 108   2 109  18  10 110 110  11 112  26  31 113  18  28  10
  14 114  19 115  20 116 117 118 119 119 121 122 123   2 124   1 125   4
   7 126   1   5   2  13 127  24 128 129 130 130 132 133  12 134  29 135
 136 136]


In [23]:
for word, index in tokenizer.word_index.items():
    if index == predicted:
        print(word)

ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()