### Impoting Libraries

In [10]:
import numpy as np 
import tensorflow as tf 
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential 
from tensorflow.keras.layers import Embedding, LSTM, Dense

### Dataset

In [13]:
with open('../dataset/younus.txt', 'r', encoding='utf-8') as myText: 
    dt=myText.read()

In [14]:
dt

'Yunus identifies as a Muslim and has expressed the importance that salah and Ishq-e-Muhammadi holds to him in his personal life.[24] His father, Haji Muhammad Dula Mia Saudagar, completed Hajj three times and was a disciple of two prominent Sufis of Chittagong. Yunus continues to actively display a normative orthodox Sunnite theological creed, whilst rejecting superstition.[145] He encourages the public to engage in Dua directly to Allah,[146][147] whom Yunus publicly recognises as the supreme source of assistance and support,[148] and as the master of Divine Decree.[149] Yunus has also referred to the Qur\'an as the "guide for mankind" and acknowledged the concept of ummah in his public speeches.[145]\n\nIn 1967, while Yunus attended Vanderbilt University, he met Vera Forostenko, a student of Russian literature at Vanderbilt University and daughter of Russian immigrants to Trenton, New Jersey, United States. They were married in 1970.[18][24] Yunus\'s marriage with Vera ended within 

In [16]:
mytokenizer=Tokenizer() 
mytokenizer.fit_on_texts([dt])
total_words=len(mytokenizer.word_index)+1

In [17]:
mytokenizer.word_index

{'in': 1,
 'of': 2,
 'a': 3,
 'the': 4,
 'yunus': 5,
 'and': 6,
 'to': 7,
 'as': 8,
 'was': 9,
 '24': 10,
 'university': 11,
 'his': 12,
 'at': 13,
 'muhammad': 14,
 'vera': 15,
 'new': 16,
 'physics': 17,
 'has': 18,
 'that': 19,
 'chittagong': 20,
 '145': 21,
 'he': 22,
 'public': 23,
 'for': 24,
 'vanderbilt': 25,
 'russian': 26,
 'daughter': 27,
 'jersey': 28,
 'married': 29,
 '18': 30,
 "yunus's": 31,
 'their': 32,
 'baby': 33,
 'monica': 34,
 'bangladesh': 35,
 'later': 36,
 'professor': 37,
 'brother': 38,
 'education': 39,
 'science': 40,
 'identifies': 41,
 'muslim': 42,
 'expressed': 43,
 'importance': 44,
 'salah': 45,
 'ishq': 46,
 'e': 47,
 'muhammadi': 48,
 'holds': 49,
 'him': 50,
 'personal': 51,
 'life': 52,
 'father': 53,
 'haji': 54,
 'dula': 55,
 'mia': 56,
 'saudagar': 57,
 'completed': 58,
 'hajj': 59,
 'three': 60,
 'times': 61,
 'disciple': 62,
 'two': 63,
 'prominent': 64,
 'sufis': 65,
 'continues': 66,
 'actively': 67,
 'display': 68,
 'normative': 69,
 'orth

In [30]:
my_input_sequences = []

for line in dt.split('\n'):   
    # print(line)
    token_list = mytokenizer.texts_to_sequences([line])[0] 
    # print(token_list) 
    for i in range(1,len(token_list)): 
        my_n_gram_sequence = token_list[:i+1]
        my_input_sequences.append(my_n_gram_sequence) 
        print(my_n_gram_sequence)


[5, 41]
[5, 41, 8]
[5, 41, 8, 3]
[5, 41, 8, 3, 42]
[5, 41, 8, 3, 42, 6]
[5, 41, 8, 3, 42, 6, 18]
[5, 41, 8, 3, 42, 6, 18, 43]
[5, 41, 8, 3, 42, 6, 18, 43, 4]
[5, 41, 8, 3, 42, 6, 18, 43, 4, 44]
[5, 41, 8, 3, 42, 6, 18, 43, 4, 44, 19]
[5, 41, 8, 3, 42, 6, 18, 43, 4, 44, 19, 45]
[5, 41, 8, 3, 42, 6, 18, 43, 4, 44, 19, 45, 6]
[5, 41, 8, 3, 42, 6, 18, 43, 4, 44, 19, 45, 6, 46]
[5, 41, 8, 3, 42, 6, 18, 43, 4, 44, 19, 45, 6, 46, 47]
[5, 41, 8, 3, 42, 6, 18, 43, 4, 44, 19, 45, 6, 46, 47, 48]
[5, 41, 8, 3, 42, 6, 18, 43, 4, 44, 19, 45, 6, 46, 47, 48, 49]
[5, 41, 8, 3, 42, 6, 18, 43, 4, 44, 19, 45, 6, 46, 47, 48, 49, 7]
[5, 41, 8, 3, 42, 6, 18, 43, 4, 44, 19, 45, 6, 46, 47, 48, 49, 7, 50]
[5, 41, 8, 3, 42, 6, 18, 43, 4, 44, 19, 45, 6, 46, 47, 48, 49, 7, 50, 1]
[5, 41, 8, 3, 42, 6, 18, 43, 4, 44, 19, 45, 6, 46, 47, 48, 49, 7, 50, 1, 12]
[5, 41, 8, 3, 42, 6, 18, 43, 4, 44, 19, 45, 6, 46, 47, 48, 49, 7, 50, 1, 12, 51]
[5, 41, 8, 3, 42, 6, 18, 43, 4, 44, 19, 45, 6, 46, 47, 48, 49, 7, 50, 1, 12, 51,

In [35]:
max_sequence_len = max([len(seq) for seq in my_input_sequence])
input_sequences = np.array(pad_sequences(my_input_sequence, maxlen=max_sequence_len, padding='pre'))

In [47]:
input_sequences[0]

array([ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  5, 41], dtype=int32)

# X=input_sequences[:, :-1] 
y=input_sequences[:,-1]

In [44]:
X[0]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5], dtype=int32)

In [50]:
y

array([ 41,   8,   3,  42,   6,  18,  43,   4,  44,  19,  45,   6,  46,
        47,  48,  49,   7,  50,   1,  12,  51,  52,  10,  12,  53,  54,
        14,  55,  56,  57,  58,  59,  60,  61,   6,   9,   3,  62,   2,
        63,  64,  65,   2,  20,   5,  66,   7,  67,  68,   3,  69,  70,
        71,  72,  73,  74,  75,  76,  21,  22,  77,   4,  23,   7,  78,
         1,  79,  80,   7,  81,  82,  83,  84,   5,  85,  86,   8,   4,
        87,  88,   2,  89,   6,  90,  91,   6,   8,   4,  92,   2,  93,
        94,  95,   5,  18,  96,  97,   7,   4,  98,   8,   4,  99,  24,
       100,   6, 101,   4, 102,   2, 103,   1,  12,  23, 104,  21, 105,
       106,   5, 107,  25,  11,  22, 108,  15, 109,   3, 110,   2,  26,
       111,  13,  25,  11,   6,  27,   2,  26, 112,   7, 113,  16,  28,
       114, 115, 116, 117,  29,   1, 118,  30,  10,  31, 119, 120,  15,
       121, 122, 123,   2,   4, 124,   2,  32,  33, 125,  34,   5,   1,
       126,   1,  20, 127,   8,  15, 128,   7,  16,  28, 129,  1

In [52]:
y=np.array(tf.keras.utils.to_categorical(y, num_classes=total_words))

In [55]:
y[0]

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0.])

In [85]:
model = Sequential()
model.add(Embedding(input_dim=total_words, output_dim=100))
model.build(input_shape=(None, max_sequence_len - 1))  # Correct usage of build()
model.add(LSTM(150))
model.add(Dense(total_words, activation='softmax'))
model.summary()


In [None]:
# https://github.com/UnfoldDataScience/YouTube-Videos-files/blob/main/LSTM%20video%20files/LSTM%20implementation.ipynb

In [88]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(X, y, epochs=100, verbose=1)

Epoch 1/100
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 220ms/step - accuracy: 0.0881 - loss: 4.6175
Epoch 2/100
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 212ms/step - accuracy: 0.1242 - loss: 4.4359
Epoch 3/100
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 158ms/step - accuracy: 0.1016 - loss: 4.3431
Epoch 4/100
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 156ms/step - accuracy: 0.1147 - loss: 4.1816
Epoch 5/100
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 349ms/step - accuracy: 0.1361 - loss: 3.9699
Epoch 6/100
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 185ms/step - accuracy: 0.1236 - loss: 3.8667
Epoch 7/100
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 173ms/step - accuracy: 0.1639 - loss: 3.7586
Epoch 8/100
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 177ms/step - accuracy: 0.1390 - loss: 3.6284
Epoch 9/100
[1m10/10[0m [32m━

<keras.src.callbacks.history.History at 0x11d709da310>

In [91]:
input_text = "Monica Yunus"
predict_next_words= 20

for _ in range(predict_next_words):
    token_list = mytokenizer.texts_to_sequences([input_text])[0]
    print(token_list)
    token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
    predicted = np.argmax(model.predict(token_list), axis=-1)
    output_word = ""
    for word, index in mytokenizer.word_index.items():
        if index == predicted:
            output_word = word
            break
    input_text += " " + output_word

print(input_text)

[34, 5]
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 681ms/step
[34, 5, 41]
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 132ms/step
[34, 5, 41, 8]
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 72ms/step
[34, 5, 41, 8, 3]
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 98ms/step
[34, 5, 41, 8, 3, 42]
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 86ms/step
[34, 5, 41, 8, 3, 42, 6]
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 83ms/step
[34, 5, 41, 8, 3, 42, 6, 18]
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 87ms/step
[34, 5, 41, 8, 3, 42, 6, 18, 43]
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 72ms/step
[34, 5, 41, 8, 3, 42, 6, 18, 43, 4]
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 87ms/step
[34, 5, 41, 8, 3, 42, 6, 18, 43, 4, 44]
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 78ms/step
[34, 5, 41, 8, 3, 42, 6, 18, 43, 4, 44, 