In [3]:
import pandas as pd
import numpy as np

In [4]:
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam

In [5]:
df = pd.read_csv('/content/grammer.csv')
text = [x for x,y in zip(df.input.values, df.labels.values) if y==1]


In [6]:
corpus = [line.split(" ") for line in text]
corpus = corpus[:2000]

In [7]:
tokenizer = Tokenizer(oov_token="<OOV>")
tokenizer.fit_on_texts(corpus)
print(tokenizer.word_index)

{'<OOV>': 1, '.': 2, 'i': 3, ',': 4, 'the': 5, 'to': 6, 'a': 7, 'is': 8, 'and': 9, 'in': 10, 'it': 11, 'of': 12, 'my': 13, 'not': 14, 'for': 15, 'was': 16, '!': 17, 'have': 18, 'that': 19, '': 20, 'you': 21, 'this': 22, 'so': 23, 'but': 24, '?': 25, 'with': 26, 'english': 27, 'are': 28, 'we': 29, 'me': 30, 'do': 31, 'am': 32, 'will': 33, 'be': 34, 'about': 35, 'at': 36, 'time': 37, 'he': 38, '-': 39, 'very': 40, 'on': 41, 'like': 42, 'they': 43, 'there': 44, 'can': 45, 'people': 46, 'think': 47, 'one': 48, 'as': 49, 'when': 50, 'had': 51, 'if': 52, 'because': 53, 'want': 54, 'or': 55, 'some': 56, 'more': 57, 'an': 58, 'all': 59, 'by': 60, 'japanese': 61, 'go': 62, 'today': 63, 'could': 64, 'good': 65, 'what': 66, 'she': 67, "'": 68, 'them': 69, 'their': 70, 'now': 71, 'who': 72, 'day': 73, 'from': 74, 'know': 75, 'would': 76, 'many': 77, 'up': 78, 'has': 79, 'did': 80, 'our': 81, 'get': 82, 'his': 83, 'went': 84, 'been': 85, 'also': 86, 'which': 87, 'how': 88, 'your': 89, 'her': 90, 'f

In [8]:
total_words = len(tokenizer.word_index) + 1
print(f"total words:{total_words}")

total words:3284


In [9]:
sequences = tokenizer.texts_to_sequences(corpus)
padded = pad_sequences(sequences,padding="post")

In [10]:
input_sequences = []
labels = []
for line in corpus:
    token_list = tokenizer.texts_to_sequences([line])[0]
    for i in range(1, len(token_list)):
        n_gram_sequence = token_list[:i]
        input_sequences.append(n_gram_sequence)
        labels.append(token_list[i])

In [11]:
max_sequence_len = max([len(x) for x in input_sequences])
input_sequences = np.array(pad_sequences(input_sequences,maxlen = max_sequence_len, padding = 'pre'))

In [12]:
xs = input_sequences
ys = tf.keras.utils.to_categorical(labels, num_classes=total_words)

In [13]:
# #train Neural Network
model = Sequential()
model.add(Embedding(input_dim =total_words, output_dim=240,input_shape =(max_sequence_len,)))
model.add(Bidirectional(LSTM(150)))
model.add(Dense(total_words,activation = "softmax"))


  super().__init__(**kwargs)


In [17]:
adam = Adam(learning_rate=0.01)
model.compile(loss = 'categorical_crossentropy', optimizer=adam, metrics=['accuracy'])
history = model.fit(xs, ys, epochs=50)


Epoch 1/50
[1m666/666[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m210s[0m 311ms/step - accuracy: 0.2168 - loss: 4.7789
Epoch 2/50
[1m666/666[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m260s[0m 308ms/step - accuracy: 0.2582 - loss: 3.8364
Epoch 3/50
[1m666/666[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m204s[0m 306ms/step - accuracy: 0.2625 - loss: 3.8712
Epoch 4/50
[1m666/666[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m263s[0m 307ms/step - accuracy: 0.2784 - loss: 3.7829
Epoch 5/50
[1m666/666[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m263s[0m 308ms/step - accuracy: 0.3042 - loss: 3.3772
Epoch 6/50
[1m666/666[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m207s[0m 310ms/step - accuracy: 0.3226 - loss: 3.2168
Epoch 7/50
[1m666/666[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m263s[0m 311ms/step - accuracy: 0.3387 - loss: 3.0833
Epoch 8/50
[1m666/666[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m205s[0m 308ms/step - accuracy: 0.3530 - loss: 2.9587
Epoch 9/

In [None]:
while True:
    seed_text = input("Enter sentence:")
    if seed_text == "exit": #enter q00 to stop
        break

    next_words = 10

    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen = max_sequence_len-1, padding='pre')
        predicted = np.argmax(model.predict(token_list), axis=-1)
        output_word = ""
        for word, index in tokenizer.word_index.items():
            if index == predicted:
                output_word = word
                break

        seed_text+= " " + output_word
        if output_word == '.':
            break

    print(seed_text)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 484ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44ms/step
i need to go any of working .
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 43ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━