In [2]:
import numpy as np
import matplotlib.pyplot as plt

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, SimpleRNN, LSTM, Embedding,Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
import re
import pickle


In [3]:
#load data from files
X = np.load('../data/X.npy')
y = np.load('../data/y.npy')
with open('../data/tokenizer.pkl', 'rb') as f:
    tokenizer = pickle.load(f)

# Calculate variables needed for the model
vocab_size = len(tokenizer.word_index) + 1
max_sequence_length = X.shape[1]
embedding_dim = 100 # Define embedding dimension

print(f"Vocab size: {vocab_size}")
print(f"Sequence length: {max_sequence_length}")
print(f"Embedding dim: {embedding_dim}")

Vocab size: 12848
Sequence length: 50
Embedding dim: 100


# **Train Simple RNN Model**

In [4]:
#now we will train a simple RNN to predict the next word in a sequence of words
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim))
model.add(SimpleRNN(units=128, return_sequences=True))
model.add(SimpleRNN(units=128))
model.add(Dense(units=vocab_size, activation='softmax'))

# Build the model to verify architecture
model.build(input_shape=(None, max_sequence_length))

# Using sparse_categorical_crossentropy since y is not one-hot encoded
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

In [None]:
# Train the model
history = model.fit(X, y, epochs=10, batch_size=32, validation_split=0.2)

Epoch 1/10
[1m5065/5065[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m384s[0m 76ms/step - accuracy: 0.0426 - loss: 6.7694 - val_accuracy: 0.0586 - val_loss: 7.0077
Epoch 2/10
[1m5065/5065[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m517s[0m 102ms/step - accuracy: 0.0736 - loss: 6.2781 - val_accuracy: 0.0680 - val_loss: 7.0744
Epoch 3/10
[1m5065/5065[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m458s[0m 90ms/step - accuracy: 0.0848 - loss: 6.0159 - val_accuracy: 0.0729 - val_loss: 7.1399
Epoch 4/10
[1m5065/5065[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m729s[0m 135ms/step - accuracy: 0.0939 - loss: 5.7901 - val_accuracy: 0.0770 - val_loss: 7.1618
Epoch 5/10
[1m5065/5065[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m871s[0m 160ms/step - accuracy: 0.1013 - loss: 5.5724 - val_accuracy: 0.0752 - val_loss: 7.1416
Epoch 6/10
[1m2536/5065[0m [32m━━━━━━━━━━[0m[37m━━━━━━━━━━[0m [1m5:36[0m 133ms/step - accuracy: 0.1121 - loss: 5.2117

In [None]:
# Plot training history
plt.plot(history.history['accuracy'], label='accuracy')
plt.plot(history.history['val_accuracy'], label='val_accuracy')
plt.legend()
plt.show()

In [None]:
# Save the model
model.save('../saved_models/simple_rnn_model.h5')