In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf

In [2]:
# Load the entire corpus into memory as a single string.
with open('IndiaUS.txt', mode='r', encoding='utf-8') as corpus_file:
    mytext = corpus_file.read()

In [3]:
from tensorflow.keras.preprocessing.text import Tokenizer

In [4]:
# Learn the word -> integer-id vocabulary from the whole corpus at once.
mytokenizer = Tokenizer()
mytokenizer.fit_on_texts([mytext])

# One extra slot on top of the learned words; the padded sequences built
# later in the notebook use id 0 as filler.
vocabulary = mytokenizer.word_index
total_words = 1 + len(vocabulary)

In [5]:
# Display the vocabulary size (number of words in word_index, plus one).
total_words

599

In [6]:
# Collect every prefix of length >= 2 from each tokenized line; each prefix
# later becomes one training example (all tokens but the last -> last token).
my_input_seq = []

for raw_line in mytext.split('\n'):
    print(raw_line)
    encoded_line = mytokenizer.texts_to_sequences([raw_line])[0]
    print(encoded_line)

    # Prefix end positions run from 2 up to the full length of the line.
    my_input_seq.extend(
        encoded_line[:end] for end in range(2, len(encoded_line) + 1)
    )

Following a lavish state visit by Indian Prime Minister Narendra Modi to Washington, US President Joe Biden has called his country's partnership with India among the "most consequential in the world". The BBC's Vikas Pandey and Soutik Biswas explore the factors that contribute to the visit's potential in strengthening the ties between the two nations.
[99, 4, 177, 50, 34, 35, 28, 29, 30, 71, 11, 2, 22, 9, 36, 72, 14, 12, 178, 51, 100, 101, 16, 6, 179, 1, 102, 180, 3, 1, 73, 1, 181, 182, 183, 5, 184, 185, 186, 1, 187, 13, 188, 2, 1, 189, 74, 3, 103, 1, 52, 104, 1, 53, 105]

[]
The US's relationship with India - the world's most populous country - is "stronger, closer and more dynamic than any time in history", Mr Biden said at the completion of a pomp-filled state visit by Mr Modi to the White House.
[1, 190, 37, 16, 6, 1, 191, 102, 192, 193, 10, 194, 106, 5, 41, 195, 107, 196, 108, 3, 197, 8, 14, 109, 54, 1, 198, 7, 4, 199, 200, 50, 34, 35, 8, 11, 2, 1, 55, 56]

[]
The remark may not b

In [7]:
# Measure the longest *tokenized* sequence instead of counting
# whitespace-separated words: the tokenizer does its own splitting and
# filtering, so the two counts can differ per line. Using the word count
# would make the padding step truncate or over-pad the n-gram sequences.
# default=0 keeps the original result for an empty corpus.
max_sequence_len = max((len(seq) for seq in my_input_seq), default=0)

print(max_sequence_len)

83


In [8]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Pad at the front so every n-gram has the same length and its final
# (target) token stays in the last column.
padded = pad_sequences(my_input_seq, maxlen=max_sequence_len, padding='pre')
input_sequences = np.array(padded)

In [9]:
# Preview the padded sequence matrix (one row per n-gram).
print(input_sequences)

[[  0   0   0 ...   0  99   4]
 [  0   0   0 ...  99   4 177]
 [  0   0   0 ...   4 177  50]
 ...
 [  0   0   0 ... 176 598  25]
 [  0   0   0 ... 598  25  59]
 [  0   0   0 ...  25  59  46]]


In [10]:
# All columns except the last are the model input; the last column holds
# the id of the word to predict.
X, y = input_sequences[:, :-1], input_sequences[:, -1]

In [11]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,Embedding,LSTM

In [12]:
# Create an empty Sequential container.
# NOTE(review): a later cell assigns a new Sequential to `model`, so this
# instance ends up unused; candidate for removal.
model = Sequential()

In [13]:
# Display the model object's repr (no layers have been added at this point).
model

<Sequential name=sequential, built=False>

In [14]:
# Display the vocabulary size, which is used below as the embedding input
# dimension and the output layer width.
# NOTE(review): repeats an earlier display cell; candidate for removal.
total_words

599

In [22]:
import keras

# Each training row is a padded n-gram minus its final (target) token, so the
# model input has max_sequence_len - 1 positions. The values are integer
# token ids, so the input is declared as int32 rather than float32.
input_layer = keras.Input(shape=(max_sequence_len-1,), dtype="int32")

model = Sequential()
# Making the Input the first layer fixes the input shape and builds the
# model, so no separate model.build(...) call is needed.
model.add(input_layer)
# Map each token id to a 100-dimensional dense vector.
model.add(tf.keras.layers.Embedding(input_dim=total_words, output_dim=100))
# Summarise the whole sequence into a single 150-unit state.
model.add(tf.keras.layers.LSTM(150))
# One softmax probability per vocabulary entry for the next word.
model.add(tf.keras.layers.Dense(total_words, activation='softmax'))

model.summary()

In [23]:

# Check the input shape the model expects (None is the batch dimension).
model.input_shape

(None, 82)

In [24]:
# Check the output shape: one score per vocabulary entry for each input row.
model.output_shape

(None, 599)

In [25]:
# One-hot encode the target word ids so each row has total_words columns,
# matching the width of the model's softmax output.
one_hot_targets = tf.keras.utils.to_categorical(y, num_classes=total_words)
Y = np.array(one_hot_targets)

In [26]:

# The targets are one-hot vectors, hence categorical cross-entropy.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
# Kept as the cell's last expression so the returned History object is shown.
model.fit(X, Y, epochs=100, verbose=1)

Epoch 1/100
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 47ms/step - accuracy: 0.0444 - loss: 6.2865
Epoch 2/100
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 59ms/step - accuracy: 0.0612 - loss: 5.7199
Epoch 3/100
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 63ms/step - accuracy: 0.0610 - loss: 5.6448
Epoch 4/100
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 62ms/step - accuracy: 0.0566 - loss: 5.7065
Epoch 5/100
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 59ms/step - accuracy: 0.0746 - loss: 5.5057
Epoch 6/100
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 50ms/step - accuracy: 0.0517 - loss: 5.4899
Epoch 7/100
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 48ms/step - accuracy: 0.0825 - loss: 5.2965
Epoch 8/100
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 52ms/step - accuracy: 0.0958 - loss: 5.0810
Epoch 9/100
[1m43/43[0m [32m━━━━━━━━━

<keras.src.callbacks.history.History at 0x2512d5ab530>

In [27]:
# Greedy next-word generation: repeatedly feed the text produced so far back
# into the model and append the most probable next word.
input_text = "Joe biden"
predict_next_words= 15

for _ in range(predict_next_words):
    # Re-encode the full text generated so far.
    token_list = mytokenizer.texts_to_sequences([input_text])[0]
    print(token_list)
    # Pad to the input length the model was trained on.
    token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
    # predict() returns one row of probabilities per input row; take the
    # single row and reduce it to a plain int so it can be used as a dict key
    # (the original compared a 1-element array against each index).
    predicted = int(np.argmax(model.predict(token_list)[0]))
    # index_word is the tokenizer's id -> word map; ids with no word (such as
    # the padding id 0) fall back to an empty string, as before.
    output_word = mytokenizer.index_word.get(predicted, "")
    input_text += " " + output_word

print(input_text)

[72, 14]
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 158ms/step
[72, 14, 5]
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
[72, 14, 5, 28]
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
[72, 14, 5, 28, 29]
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step
[72, 14, 5, 28, 29, 30]
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
[72, 14, 5, 28, 29, 30, 71]
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
[72, 14, 5, 28, 29, 30, 71, 11]
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
[72, 14, 5, 28, 29, 30, 71, 11, 536]
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
[72, 14, 5, 28, 29, 30, 71, 11, 536, 3]
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
[72, 14, 5, 28, 29, 30, 71, 11, 536, 3, 23]
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
[72, 14, 5, 28, 2

In [28]:
# Persist the trained model to disk.
# NOTE(review): the .h5 extension presumably selects the legacy HDF5 format
# in this Keras version; consider the native .keras format once every
# consumer of 'my_model.h5' has been checked — TODO confirm.
model.save('my_model.h5')



In [29]:
import pickle

# Persist the fitted tokenizer so the same word -> id mapping can be reused
# alongside the saved model.
with open('tokenizer.pickle', mode='wb') as tokenizer_file:
    pickle.dump(mytokenizer, tokenizer_file, protocol=pickle.HIGHEST_PROTOCOL)