In [2]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

In [3]:
with open("text.txt", "r", encoding="utf-8") as myfile:
    mytext = myfile.read()


mytokenizer = Tokenizer()
mytokenizer.fit_on_texts([mytext])
total_words = len(mytokenizer.word_index) + 1
mytokenizer.word_index

{'the': 1,
 'to': 2,
 'india': 3,
 'in': 4,
 'a': 5,
 'and': 6,
 'of': 7,
 "'s": 8,
 'n': 9,
 'mr': 10,
 'modi': 11,
 'us': 12,
 'is': 13,
 'has': 14,
 'that': 15,
 'biden': 16,
 'washington': 17,
 'it': 18,
 'as': 19,
 'with': 20,
 'not': 21,
 'for': 22,
 'on': 23,
 'an': 24,
 'says': 25,
 'will': 26,
 'are': 27,
 'visit': 28,
 'indian': 29,
 'prime': 30,
 'minister': 31,
 'nthe': 32,
 'china': 33,
 'have': 34,
 'trade': 35,
 'by': 36,
 'relationship': 37,
 'this': 38,
 'been': 39,
 'about': 40,
 'strategic': 41,
 'president': 42,
 'more': 43,
 'up': 44,
 'during': 45,
 'also': 46,
 'make': 47,
 'sirohi': 48,
 'nbut': 49,
 'air': 50,
 'force': 51,
 'they': 52,
 'state': 53,
 'his': 54,
 'world': 55,
 'ties': 56,
 'two': 57,
 'at': 58,
 'white': 59,
 'house': 60,
 'be': 61,
 'lot': 62,
 'technology': 63,
 'jet': 64,
 'but': 65,
 'semiconductor': 66,
 'now': 67,
 'first': 68,
 '2023': 69,
 'administration': 70,
 'was': 71,
 'global': 72,
 'one': 73,
 'he': 74,
 'without': 75,
 'narendra

In [4]:
my_input_sequences = []
for line in mytext.split("\n"):
    # print(line)
    token_list = mytokenizer.texts_to_sequences([line])[0]
    # print(token_list)
    for i in range(1, len(token_list)):
        my_n_gram_sequence = token_list[: i + 1]
        # print(my_n_gram_sequence)
        my_input_sequences.append(my_n_gram_sequence)
        # print(input_sequences)
max_sequence_len = max([len(seq) for seq in my_input_sequences])
input_sequences = np.array(
    pad_sequences(my_input_sequences, maxlen=max_sequence_len, padding="pre")
)

In [5]:
X = input_sequences[:, :-1]
y = input_sequences[:, -1]

y = np.array(tf.keras.utils.to_categorical(y, num_classes=total_words))


model = Sequential()
model.add(Embedding(total_words, 100, input_length=max_sequence_len - 1))
model.add(LSTM(150))
model.add(Dense(total_words, activation="softmax"))
print(model.summary())


model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])
model.fit(X, y, epochs=100, verbose=1)

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 1460, 100)         60200     
                                                                 
 lstm (LSTM)                 (None, 150)               150600    
                                                                 
 dense (Dense)               (None, 602)               90902     
                                                                 
Total params: 301,702
Trainable params: 301,702
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/100


2024-01-19 12:22:21.899707: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2024-01-19 12:22:21.900404: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2024-01-19 12:22:21.900795: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
 5/46 [==>...........................] - ETA: 39s - loss: 5.1702 - accuracy: 0.0688

In [None]:
def nwp(input_text: str, predict_next_words: int) -> str:
    for _ in range(predict_next_words):
        token_list = mytokenizer.texts_to_sequences([input_text])[0]
        print(token_list)
        token_list = pad_sequences(
            [token_list], maxlen=max_sequence_len - 1, padding="pre"
        )
        predicted = np.argmax(model.predict(token_list), axis=-1)
        output_word = ""
        for word, index in mytokenizer.word_index.items():
            if index == predicted:
                output_word = word
                break
        input_text += " " + output_word
        return input_text

In [None]:
print(nwp("Hello sir", 10))