In [1]:
import numpy as np
import matplotlib.pyplot as plt

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, SimpleRNN, LSTM, Embedding,Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
import re
import pickle


In [2]:
# Load the preprocessed training arrays and the fitted tokenizer from disk.
X = np.load('../data/X.npy')
y = np.load('../data/y.npy')
# NOTE(security): pickle.load can execute arbitrary code while unpickling;
# only point this at a tokenizer file produced by this project's own pipeline.
with open('../data/tokenizer.pkl', 'rb') as f:
    tokenizer = pickle.load(f)

# Fail fast on malformed inputs instead of erroring in the middle of training.
assert X.ndim == 2, f"X must be 2-D (samples, sequence_length), got shape {X.shape}"
assert len(X) == len(y), f"X and y disagree on sample count: {len(X)} vs {len(y)}"
assert X.size > 0, "X is empty; nothing to train on"

# Calculate variables needed for the model
# +1 presumably reserves index 0 for padding (tokenizer word indices are
# assumed to start at 1 -- TODO confirm against the preprocessing notebook).
vocab_size = len(tokenizer.word_index) + 1
max_sequence_length = X.shape[1]
embedding_dim = 100 # Define embedding dimension

# Every token id (inputs) and class id (labels) must be a valid index into the
# vocabulary, otherwise the Embedding lookup / sparse loss cannot work.
assert 0 <= X.min() and X.max() < vocab_size, "X contains token ids outside the tokenizer vocabulary"
assert 0 <= y.min() and y.max() < vocab_size, "y contains labels outside the tokenizer vocabulary"

print(f"Vocab size: {vocab_size}")
print(f"Sequence length: {max_sequence_length}")
print(f"Embedding dim: {embedding_dim}")

Vocab size: 12848
Sequence length: 50
Embedding dim: 100


**Train an LSTM to predict the next word in a sequence of words**

In [3]:
# Next-word predictor: token ids -> embeddings -> LSTM -> distribution over the vocabulary.
model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=embedding_dim),
    # Many-to-one: return_sequences=False keeps only the final LSTM output,
    # a single vector from which the next word is predicted.
    LSTM(units=128, return_sequences=False),
    Dropout(0.2),
    # One softmax unit per vocabulary entry.
    Dense(units=vocab_size, activation='softmax'),
])

# Build with the known sequence length so the architecture can be inspected below.
model.build(input_shape=(None, max_sequence_length))

# y is not one-hot encoded, hence the sparse variant of categorical cross-entropy.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)
model.summary()

In [None]:
# Train for 5 epochs with 20% of the samples held out for validation;
# the returned History object feeds the accuracy plot further down.
history = model.fit(
    x=X,
    y=y,
    batch_size=64,
    epochs=5,
    validation_split=0.2,
)

Epoch 1/5
[1m2533/2533[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m675s[0m 263ms/step - accuracy: 0.0435 - loss: 6.7304 - val_accuracy: 0.0627 - val_loss: 6.7943
Epoch 2/5
[1m2533/2533[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m893s[0m 346ms/step - accuracy: 0.0796 - loss: 6.2089 - val_accuracy: 0.0775 - val_loss: 6.7288
Epoch 3/5
[1m1523/2533[0m [32m━━━━━━━━━━━━[0m[37m━━━━━━━━[0m [1m4:27[0m 264ms/step - accuracy: 0.0906 - loss: 5.9254

In [None]:
# Training vs. validation accuracy per epoch, read from the History returned by model.fit.
fig, ax = plt.subplots()

# Number epochs from 1 so the x-axis matches the "Epoch 1/5" lines in the training log.
epochs_run = range(1, len(history.history['accuracy']) + 1)

ax.plot(epochs_run, history.history['accuracy'], label='accuracy')
ax.plot(epochs_run, history.history['val_accuracy'], label='val_accuracy')

# A figure should stand on its own: title, axis labels, and whole-number epoch ticks.
ax.set(title='LSTM next-word prediction accuracy', xlabel='Epoch', ylabel='Accuracy')
ax.set_xticks(list(epochs_run))
ax.legend()
plt.show()

In [None]:
# Persist the trained model next to the notebook's other artifacts.
# NOTE(review): the .h5 extension selects the HDF5 format; newer Keras releases
# recommend '.keras' -- confirm nothing loads this exact path before changing it.
model.save(filepath='../saved_models/lstm_model.h5')