In [1]:
import pandas as pd
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

In [2]:
with open('1000_sentences.txt','r') as file:
    text = file.read().lower()

In [3]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts([text])
total_words = len(tokenizer.word_index) +1
total_words

988

In [4]:
tokenizer.word_index

{'the': 1,
 'to': 2,
 'a': 3,
 'and': 4,
 'i': 5,
 'us': 6,
 'let': 7,
 'of': 8,
 'am': 9,
 'for': 10,
 'is': 11,
 'on': 12,
 'in': 13,
 'with': 14,
 'this': 15,
 'new': 16,
 'can': 17,
 'my': 18,
 'need': 19,
 'you': 20,
 'our': 21,
 'time': 22,
 'your': 23,
 'are': 24,
 'essential': 25,
 'an': 26,
 'team': 27,
 'by': 28,
 'learning': 29,
 'was': 30,
 'helps': 31,
 'help': 32,
 'code': 33,
 'up': 34,
 'have': 35,
 'improve': 36,
 'thinking': 37,
 'often': 38,
 'make': 39,
 'check': 40,
 'project': 41,
 'over': 42,
 'going': 43,
 'into': 44,
 'it': 45,
 'that': 46,
 'health': 47,
 'diseases': 48,
 'used': 49,
 'clear': 50,
 'as': 51,
 'go': 52,
 'how': 53,
 'will': 54,
 'different': 55,
 'we': 56,
 'planning': 57,
 'local': 58,
 'solar': 59,
 'system': 60,
 'study': 61,
 'games': 62,
 'require': 63,
 'experience': 64,
 'programming': 65,
 'lead': 66,
 'every': 67,
 'during': 68,
 'work': 69,
 'year': 70,
 'more': 71,
 'some': 72,
 'goals': 73,
 'small': 74,
 'development': 75,
 'day': 

In [5]:
input_sequence = []
for line in text.split('\n'):
    token_list = tokenizer.texts_to_sequences([line])[0]
    for i in range(1,len(token_list)):
        n_gram_sequence = token_list[:i+1]
        input_sequence.append(n_gram_sequence)

In [6]:
input_sequence

[[5, 522],
 [5, 522, 34],
 [5, 522, 34, 124],
 [5, 522, 34, 124, 596],
 [5, 522, 34, 124, 596, 9],
 [5, 522, 34, 124, 596, 9, 67],
 [5, 522, 34, 124, 596, 9, 67, 76],
 [5, 523],
 [5, 523, 125],
 [5, 523, 125, 18],
 [5, 523, 125, 18, 213],
 [5, 523, 125, 18, 213, 14],
 [5, 523, 125, 18, 213, 14, 3],
 [5, 523, 125, 18, 213, 14, 3, 597],
 [5, 523, 125, 18, 213, 14, 3, 597, 8],
 [5, 523, 125, 18, 213, 14, 3, 597, 8, 598],
 [5, 126],
 [5, 126, 2],
 [5, 126, 2, 39],
 [5, 126, 2, 39, 18],
 [5, 126, 2, 39, 18, 599],
 [5, 126, 2, 39, 18, 599, 51],
 [5, 126, 2, 39, 18, 599, 51, 600],
 [5, 126, 2, 39, 18, 599, 51, 600, 51],
 [5, 126, 2, 39, 18, 599, 51, 600, 51, 5],
 [5, 126, 2, 39, 18, 599, 51, 600, 51, 5, 522],
 [5, 126, 2, 39, 18, 599, 51, 600, 51, 5, 522, 34],
 [5, 9],
 [5, 9, 43],
 [5, 9, 43, 2],
 [5, 9, 43, 2, 77],
 [5, 9, 43, 2, 77, 3],
 [5, 9, 43, 2, 77, 3, 601],
 [5, 9, 43, 2, 77, 3, 601, 602],
 [7, 6],
 [7, 6, 52],
 [7, 6, 52, 10],
 [7, 6, 52, 10, 3],
 [7, 6, 52, 10, 3, 214],
 [7, 6, 52

In [7]:
max_sequence_length = max([len(x) for x in input_sequence])
max_sequence_length 

14

In [8]:
input_sequence = np.array(pad_sequences(input_sequence,maxlen = max_sequence_length,padding = 'pre'))
input_sequence

array([[  0,   0,   0, ...,   0,   5, 522],
       [  0,   0,   0, ...,   5, 522,  34],
       [  0,   0,   0, ..., 522,  34, 124],
       ...,
       [  0,   0,   0, ...,  75,   8,  16],
       [  0,   0,   0, ...,   8,  16, 520],
       [  0,   0,   0, ...,  16, 520, 521]])

In [9]:
import tensorflow as tf

In [10]:
x = input_sequence[:,:-1]
y = input_sequence[:,-1]

In [11]:
x,y

(array([[  0,   0,   0, ...,   0,   0,   5],
        [  0,   0,   0, ...,   0,   5, 522],
        [  0,   0,   0, ...,   5, 522,  34],
        ...,
        [  0,   0,   0, ...,   1,  75,   8],
        [  0,   0,   0, ...,  75,   8,  16],
        [  0,   0,   0, ...,   8,  16, 520]]),
 array([522,  34, 124, ...,  16, 520, 521]))

In [12]:
y = tf.keras.utils.to_categorical(y,num_classes=total_words)
y

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [13]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size = 0.2)

In [14]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM,Embedding,Dense,Dropout

model = Sequential()
model.add(Embedding(total_words,100,input_length = max_sequence_length -1))
model.add(LSTM(150,return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(100))
model.add(Dense(total_words,activation='softmax'))

model.compile(loss = 'categorical_crossentropy',optimizer='adam',metrics = ['accuracy'])
model.build(input_shape=(None,max_sequence_length -1))
model.summary()



In [18]:
history = model.fit(x_train,y_train,epochs = 50,validation_data = (x_test,y_test),verbose = 1)

Epoch 1/50
[1m93/93[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 45ms/step - accuracy: 0.8990 - loss: 0.3708 - val_accuracy: 0.5824 - val_loss: 5.2487
Epoch 2/50
[1m93/93[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 43ms/step - accuracy: 0.8992 - loss: 0.3731 - val_accuracy: 0.5811 - val_loss: 5.2602
Epoch 3/50
[1m93/93[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 43ms/step - accuracy: 0.9011 - loss: 0.3739 - val_accuracy: 0.5824 - val_loss: 5.2508
Epoch 4/50
[1m93/93[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 43ms/step - accuracy: 0.9006 - loss: 0.3652 - val_accuracy: 0.5865 - val_loss: 5.2543
Epoch 5/50
[1m93/93[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 43ms/step - accuracy: 0.9009 - loss: 0.3527 - val_accuracy: 0.5824 - val_loss: 5.2690
Epoch 6/50
[1m93/93[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 43ms/step - accuracy: 0.8998 - loss: 0.3599 - val_accuracy: 0.5797 - val_loss: 5.3128
Epoch 7/50
[1m93/93[0m [32m━━━━

In [29]:
input_text = input("input text:")
words = int(input("input number of words:"))
for a in range(1,words):
    token_list = tokenizer.texts_to_sequences([input_text])[0]
    token_list = pad_sequences([token_list],maxlen=max_sequence_length-1,padding = 'pre')
    predicted = np.argmax(model.predict(token_list),axis =-1)

    for word,index in tokenizer.word_index.items():
        if index == predicted:
            input_text = input_text+ " " + word
            
print(input_text)

input text: Chemistry is the study of matter and its
input number of words: 2


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 111ms/step
Chemistry is the study of matter and its interactions
