In [1]:
import pandas as pd
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

In [2]:
with open('1000_sentences.txt','r') as file:
    text = file.read().lower()

In [3]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts([text])
total_words = len(tokenizer.word_index) +1
total_words

511

In [4]:
tokenizer.word_index

{'i': 1,
 'the': 2,
 'let': 3,
 'us': 4,
 'to': 5,
 'a': 6,
 'am': 7,
 'for': 8,
 'this': 9,
 'on': 10,
 'my': 11,
 'need': 12,
 'new': 13,
 'our': 14,
 'with': 15,
 'of': 16,
 'make': 17,
 'going': 18,
 'go': 19,
 'will': 20,
 'you': 21,
 'up': 22,
 'year': 23,
 'more': 24,
 'project': 25,
 'some': 26,
 'we': 27,
 'in': 28,
 'meeting': 29,
 'take': 30,
 'have': 31,
 'before': 32,
 'how': 33,
 'week': 34,
 'next': 35,
 'planning': 36,
 'at': 37,
 'day': 38,
 'check': 39,
 'me': 40,
 'during': 41,
 'work': 42,
 'sure': 43,
 'working': 44,
 'today': 45,
 'and': 46,
 'try': 47,
 'together': 48,
 'team': 49,
 'taking': 50,
 'trip': 51,
 'every': 52,
 'is': 53,
 'by': 54,
 'learning': 55,
 'out': 56,
 'get': 57,
 'that': 58,
 'looking': 59,
 'can': 60,
 'create': 61,
 'month': 62,
 'vacation': 63,
 'family': 64,
 'kids': 65,
 'tech': 66,
 'morning': 67,
 'walk': 68,
 'set': 69,
 'want': 70,
 'plan': 71,
 'business': 72,
 'end': 73,
 'improving': 74,
 'skills': 75,
 'creative': 76,
 'schedul

In [5]:
input_sequence = []
for line in text.split('\n'):
    token_list = tokenizer.texts_to_sequences([line])[0]
    for i in range(1,len(token_list)):
        n_gram_sequence = token_list[:i+1]
        input_sequence.append(n_gram_sequence)

In [6]:
input_sequence

[[1, 100],
 [1, 100, 22],
 [1, 100, 22, 37],
 [1, 100, 22, 37, 175],
 [1, 100, 22, 37, 175, 7],
 [1, 100, 22, 37, 175, 7, 52],
 [1, 100, 22, 37, 175, 7, 52, 38],
 [1, 176],
 [1, 176, 101],
 [1, 176, 101, 11],
 [1, 176, 101, 11, 67],
 [1, 176, 101, 11, 67, 15],
 [1, 176, 101, 11, 67, 15, 6],
 [1, 176, 101, 11, 67, 15, 6, 177],
 [1, 176, 101, 11, 67, 15, 6, 177, 16],
 [1, 176, 101, 11, 67, 15, 6, 177, 16, 178],
 [1, 179],
 [1, 179, 5],
 [1, 179, 5, 17],
 [1, 179, 5, 17, 11],
 [1, 179, 5, 17, 11, 180],
 [1, 179, 5, 17, 11, 180, 102],
 [1, 179, 5, 17, 11, 180, 102, 181],
 [1, 179, 5, 17, 11, 180, 102, 181, 102],
 [1, 179, 5, 17, 11, 180, 102, 181, 102, 1],
 [1, 179, 5, 17, 11, 180, 102, 181, 102, 1, 100],
 [1, 179, 5, 17, 11, 180, 102, 181, 102, 1, 100, 22],
 [1, 7],
 [1, 7, 18],
 [1, 7, 18, 5],
 [1, 7, 18, 5, 30],
 [1, 7, 18, 5, 30, 6],
 [1, 7, 18, 5, 30, 6, 182],
 [1, 7, 18, 5, 30, 6, 182, 183],
 [3, 4],
 [3, 4, 19],
 [3, 4, 19, 8],
 [3, 4, 19, 8, 6],
 [3, 4, 19, 8, 6, 68],
 [3, 4, 19, 8

In [7]:
max_sequence_length = max([len(x) for x in input_sequence])
max_sequence_length 

13

In [8]:
input_sequence = np.array(pad_sequences(input_sequence,maxlen = max_sequence_length,padding = 'pre'))
input_sequence

array([[  0,   0,   0, ...,   0,   1, 100],
       [  0,   0,   0, ...,   1, 100,  22],
       [  0,   0,   0, ..., 100,  22,  37],
       ...,
       [  0,   0,   0, ...,   1,   7,  95],
       [  0,   0,   0, ...,   7,  95,  16],
       [  0,   0,   0, ...,  95,  16, 510]])

In [9]:
import tensorflow as tf

In [10]:
x = input_sequence[:,:-1]
y = input_sequence[:,-1]

In [11]:
x,y

(array([[  0,   0,   0, ...,   0,   0,   1],
        [  0,   0,   0, ...,   0,   1, 100],
        [  0,   0,   0, ...,   1, 100,  22],
        ...,
        [  0,   0,   0, ...,   0,   1,   7],
        [  0,   0,   0, ...,   1,   7,  95],
        [  0,   0,   0, ...,   7,  95,  16]]),
 array([100,  22,  37, ...,  95,  16, 510]))

In [12]:
y = tf.keras.utils.to_categorical(y,num_classes=total_words)
y

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 1.]])

In [13]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size = 0.2)

In [14]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM,Embedding,Dense,Dropout

model = Sequential()
model.add(Embedding(total_words,100,input_length = max_sequence_length -1))
model.add(LSTM(150,return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(100))
model.add(Dense(total_words,activation='softmax'))

model.compile(loss = 'categorical_crossentropy',optimizer='adam',metrics = ['accuracy'])
model.build(input_shape=(None,max_sequence_length -1))
model.summary()



In [18]:
history = model.fit(x_train,y_train,epochs = 50,validation_data = (x_test,y_test),verbose = 1)

Epoch 1/50
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 48ms/step - accuracy: 0.8059 - loss: 0.6939 - val_accuracy: 0.1916 - val_loss: 8.2536
Epoch 2/50
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 44ms/step - accuracy: 0.8050 - loss: 0.6899 - val_accuracy: 0.1882 - val_loss: 8.3107
Epoch 3/50
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 45ms/step - accuracy: 0.8083 - loss: 0.7027 - val_accuracy: 0.1812 - val_loss: 8.3120
Epoch 4/50
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 45ms/step - accuracy: 0.8136 - loss: 0.6872 - val_accuracy: 0.1916 - val_loss: 8.3171
Epoch 5/50
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 44ms/step - accuracy: 0.8048 - loss: 0.7056 - val_accuracy: 0.1916 - val_loss: 8.3532
Epoch 6/50
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 46ms/step - accuracy: 0.7996 - loss: 0.7361 - val_accuracy: 0.1916 - val_loss: 8.3594
Epoch 7/50
[1m36/36[0m [32m━━━━

In [20]:
input_text = input("input text:")
words = int(input("input number of words:"))
for a in range(1,words):
    token_list = tokenizer.texts_to_sequences([input_text])[0]
    token_list = pad_sequences([token_list],maxlen=max_sequence_length-1,padding = 'pre')
    predicted = np.argmax(model.predict(token_list),axis =-1)

    for word,index in tokenizer.word_index.items():
        if index == predicted:
            input_text = input_text+ " " + word
            
    
print(input_text)

input text: I
input number of words: 2


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 110ms/step
I am
