In [93]:
from tensorflow.keras import layers, models, callbacks as c
from tensorflow.keras.preprocessing.text import Tokenizer
import numpy as np
import matplotlib.pyplot as plt
from nltk.tokenize import RegexpTokenizer
from tensorflow.keras.utils import to_categorical

In [53]:
# Relative path to the plain-text corpus used to train the next-word model.
path = "../input/next-word-dataset/content.txt"

# Read the whole file in one go and lower-case it: the tokenisation regex used
# later in this notebook only matches [a-z], so upper-case letters would be lost.
# The encoding is stated explicitly so the result does not depend on the
# machine's locale default.
with open(path, encoding="utf-8") as f:
    content = f.read().lower()

# len() of a str is a character count, so report it as such.
print(f"This document contains {len(content)} characters")


In [54]:
# Keep only runs of lower-case ASCII letters; everything else (digits,
# punctuation, whitespace) is discarded between matches.
splitter = RegexpTokenizer(r"[a-z]+")
tokens = splitter.tokenize(content)

# Space-joined view of the token list, printed as a quick sanity check of the
# cleaned text (first 899 characters only).
tokenized_text = " ".join(tokens)
preview = tokenized_text[:899]
print(preview)

In [70]:
# Learn the word -> integer id mapping from the token list.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(tokens)

# Every token is handed over as its own "text", so the result contains one
# (short) id sequence per token rather than a single flat list of ids.
encoded_sequence = tokenizer.texts_to_sequences(tokens)

vocabulary = tokenizer.word_index
print(f"{len(vocabulary)} unique words found!")

In [74]:
# Number of consecutive words the model sees before predicting the next one.
feature_length = 6

# encoded_sequence was produced by calling texts_to_sequences() with one token
# per "text", so each entry is a one-element list. Flatten it to a 1-D array of
# word ids; otherwise X ends up shaped (samples, feature_length, 1) instead of
# the (samples, feature_length) the Embedding layer is declared with.
# Assumes every token mapped to exactly one id (true when the tokenizer was
# fitted on these same tokens).
word_ids = np.asarray(encoded_sequence, dtype=np.int64).reshape(-1)

# Slide a window over the ids: feature_length inputs followed by one target.
# max(..., 0) keeps a corpus shorter than the window from producing a negative
# sample count.
n_samples = max(len(word_ids) - feature_length, 0)
X = np.array(
    [word_ids[start:start + feature_length] for start in range(n_samples)],
    dtype=np.int64,
).reshape(-1, feature_length)  # reshape keeps the 2-D shape even when empty
Y = word_ids[feature_length:]  # the word that follows each window

print(f"We have a {len(X)} sample for X")
print(f"We have a {len(Y)} sample for Y")

In [90]:
# Show the dimensions of the input windows as the cell output.
np.asarray(X).shape

In [75]:
# Informational: how many distinct word ids occur as prediction targets.
n_target_words = len(np.unique(Y))
print(f"{n_target_words} of words in the targets values")

# Size used below for the Embedding input_dim and the softmax output width.
# Keras Tokenizer ids start at 1 (0 is reserved), so the largest id equals
# len(word_index) and the layers need len(word_index) + 1 slots. Counting the
# distinct target ids instead undercounts by at least one, which puts the
# highest ids out of range for the embedding and makes the output layer
# narrower than the one-hot labels.
unique_words_length = len(tokenizer.word_index) + 1

In [78]:
# One-hot encode the targets. The width is pinned to the tokenizer vocabulary
# size (+1 because word ids start at 1) instead of being inferred as max(Y) + 1,
# so it is always the full vocabulary regardless of which ids happen to appear
# among the targets.
# NOTE(review): this materialises a dense (samples x vocabulary) matrix; for a
# large vocabulary, integer targets with a sparse loss would use far less memory.
y = to_categorical(Y, num_classes=len(tokenizer.word_index) + 1)

In [92]:
# Next-word classifier: embed the word ids, run two 1-D convolutions (with a
# max-pool in between) over the window, then classify with a dense softmax head.
model = models.Sequential([
    layers.Embedding(
        input_dim=unique_words_length,
        output_dim=128,
        input_length=feature_length,
    ),
    layers.Conv1D(filters=128, kernel_size=3, activation="relu"),
    layers.MaxPool1D(pool_size=2),
    layers.Conv1D(filters=64, kernel_size=2, activation="relu"),
    layers.Flatten(),
    layers.Dense(units=128, activation="relu"),
    # One output unit per class, normalised to a probability distribution.
    layers.Dense(units=unique_words_length, activation="softmax"),
])
model.summary()

In [None]:
# Stop training once the monitored metric has not improved for 3 epochs.
callbacks_1 = c.EarlyStopping(
    patience=3,
)
# Lower the learning rate after 2 epochs without improvement.
callbacks_2 = c.ReduceLROnPlateau(
    patience=2,
)
# Persist the model to disk during training.
callbacks_3 = c.ModelCheckpoint(
    filepath='/tmp/checkpoint',
)

In [None]:
# Multi-class setup: one-hot targets with categorical cross-entropy, Adam
# optimiser, accuracy reported alongside the loss.
model.compile(
    optimizer="adam",
    loss="categorical_crossentropy",
    metrics=["accuracy"],
)

# Train for at most 20 epochs, holding out 20% of the samples for validation.
training_callbacks = [callbacks_1, callbacks_2, callbacks_3]
model.fit(
    X,
    y,
    batch_size=128,
    epochs=20,
    validation_split=0.2,
    callbacks=training_callbacks,
)
