In [3]:
#import required libraries
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, GRU

In [4]:
#read dataset
with open('dialogues.txt', 'r', encoding='utf-8') as file:
    text = file.read()

In [5]:
#Tokenizer process
tokenizer = Tokenizer()
#fit
tokenizer.fit_on_texts([text])
#assign length of word index
total_words = len(tokenizer.word_index) + 1

In [6]:
#chek the tokens
tokenizer.word_index

{'the': 1,
 'you': 2,
 'i': 3,
 'to': 4,
 'a': 5,
 'of': 6,
 'it': 7,
 'in': 8,
 'and': 9,
 'that': 10,
 'is': 11,
 'we': 12,
 'your': 13,
 'this': 14,
 'me': 15,
 'for': 16,
 'what': 17,
 'have': 18,
 'on': 19,
 'not': 20,
 'he': 21,
 'be': 22,
 'do': 23,
 'no': 24,
 'know': 25,
 'my': 26,
 'was': 27,
 "it's": 28,
 "i'm": 29,
 'with': 30,
 'but': 31,
 "don't": 32,
 'him': 33,
 'are': 34,
 'can': 35,
 'get': 36,
 'if': 37,
 "you're": 38,
 'about': 39,
 'here': 40,
 'just': 41,
 'one': 42,
 'they': 43,
 'all': 44,
 'his': 45,
 'at': 46,
 'up': 47,
 'go': 48,
 'so': 49,
 'out': 50,
 'now': 51,
 'right': 52,
 'there': 53,
 'as': 54,
 'how': 55,
 'well': 56,
 'come': 57,
 'like': 58,
 'back': 59,
 "he's": 60,
 'who': 61,
 "that's": 62,
 'mr': 63,
 'see': 64,
 'think': 65,
 'from': 66,
 'did': 67,
 'need': 68,
 "can't": 69,
 'gonna': 70,
 'an': 71,
 'why': 72,
 'them': 73,
 'when': 74,
 'take': 75,
 'us': 76,
 'our': 77,
 'her': 78,
 'would': 79,
 'man': 80,
 'will': 81,
 'want': 82,
 'then

In [7]:
#declare ngrams
input_sequences = []
#split the sentence from '\n'
for line in text.split('\n'):
    #get tokens
    token_list = tokenizer.texts_to_sequences([line])[0]
    for i in range(1, len(token_list)):
        n_gram_sequence = token_list[:i+1]
        input_sequences.append(n_gram_sequence)

In [8]:
setence_token = input_sequences[3] # [1, 1561, 5, 129, 34]
sentence = []
for token in setence_token:
    sentence.append(list((tokenizer.word_index).keys())[list((tokenizer.word_index).values()).index(token)])
print(sentence)

['bruce', 'rachel', 'let', 'me', 'see']


In [9]:
#maximum sentence length
max_sequence_len = max([len(seq) for seq in input_sequences])
# input sequences
input_sequences = np.array(pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre'))

In [10]:
X = input_sequences[:, :-1]
y = input_sequences[:, -1]

In [11]:
#convert one-hot-encode
y = np.array(tf.keras.utils.to_categorical(y, num_classes=total_words))

In [12]:
#create model
model = Sequential()
model.add(Embedding(total_words, 100, input_length=max_sequence_len-1))
model.add(GRU(150))
model.add(Dense(total_words, activation='softmax'))
print(model.summary())



None


In [129]:
#compile the model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
#fit the model
model.fit(X, y, epochs=5, verbose=1)

Epoch 1/5
[1m1916/1916[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m132s[0m 68ms/step - accuracy: 0.6409 - loss: 1.6760
Epoch 2/5
[1m1916/1916[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m128s[0m 67ms/step - accuracy: 0.6647 - loss: 1.5743
Epoch 3/5
[1m1916/1916[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m125s[0m 65ms/step - accuracy: 0.6740 - loss: 1.4960
Epoch 4/5
[1m1916/1916[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m126s[0m 66ms/step - accuracy: 0.6816 - loss: 1.4352
Epoch 5/5
[1m1916/1916[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m126s[0m 66ms/step - accuracy: 0.6951 - loss: 1.3695


<keras.src.callbacks.history.History at 0x1ebd4b65130>

In [248]:
model.save("nolan_69.h5")



In [236]:
seed_text = '''

batman is the hero


'''.strip()
seed_text

'batman is the hero'

In [247]:
#determine a text

# predict word number
next_words = 1

for _ in range(next_words):
    #convert to token
    token_list = tokenizer.texts_to_sequences([seed_text])[0]
    #path sequences
    token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
    #model prediction
    predicted = np.argmax(model.predict(token_list), axis=-1)
    output_word = ""
    # get predict words
    for word, index in tokenizer.word_index.items():
        if index == predicted:
            output_word = word
            break
    seed_text += " " + output_word

seed_text

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step


"batman is the hero we deserved to understand the bomb of quantum mechanics of what's"