<a href="https://colab.research.google.com/github/L0ki2026/Neural-Networks/blob/main/language_model_rnn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import nltk
import re
from nltk.corpus import gutenberg
from nltk.tokenize import word_tokenize
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

nltk.download('punkt_tab')
nltk.download('gutenberg')

# Load and clean text
text = gutenberg.raw('austen-emma.txt').lower()
text = re.sub(r"[^a-zA-Z\s']", '', text)
tokens = word_tokenize(text)

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package gutenberg to /root/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!


In [3]:
T = 5  # sequence length

tokenizer = Tokenizer()
tokenizer.fit_on_texts(tokens)
sequences = tokenizer.texts_to_sequences(tokens)  # [[105], [28], ...]

word_ids = [item[0] for item in sequences if item]  # flatten

X = []
y = []
for i in range(len(word_ids) - T):
    X.append(word_ids[i:i+T-1])  # input: T-1 words
    y.append(word_ids[i+T-1])    # label: Tth word

X = np.array(X)
y = np.array(y)

In [4]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense

vocab_size = len(tokenizer.word_index) + 1
embedding_dim = 50

model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=T-1),
    SimpleRNN(128, return_sequences=False),
    Dense(vocab_size, activation='softmax')
])

model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()



In [5]:
model.fit(X, y, epochs=10, batch_size=128)

Epoch 1/10
[1m1245/1245[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 4ms/step - accuracy: 0.0366 - loss: 6.7646
Epoch 2/10
[1m1245/1245[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 3ms/step - accuracy: 0.1056 - loss: 5.5915
Epoch 3/10
[1m1245/1245[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 3ms/step - accuracy: 0.1349 - loss: 5.1763
Epoch 4/10
[1m1245/1245[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 4ms/step - accuracy: 0.1520 - loss: 4.8965
Epoch 5/10
[1m1245/1245[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - accuracy: 0.1653 - loss: 4.6756
Epoch 6/10
[1m1245/1245[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 3ms/step - accuracy: 0.1734 - loss: 4.4859
Epoch 7/10
[1m1245/1245[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 3ms/step - accuracy: 0.1879 - loss: 4.3062
Epoch 8/10
[1m1245/1245[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - accuracy: 0.2015 - loss: 4.1505
Epoch 9/10
[1m1245/1245

<keras.src.callbacks.history.History at 0x7f623e59c510>

In [6]:
def generate_text_rnn(seed_text, num_words=20):
    result = seed_text.split()

    for _ in range(num_words):
        token_list = tokenizer.texts_to_sequences([result[-(T-1):]])[0]
        token_list = pad_sequences([token_list], maxlen=T-1)
        predicted_id = np.argmax(model.predict(token_list, verbose=0))
        next_word = tokenizer.index_word.get(predicted_id, '[UNK]')
        result.append(next_word)

    return ' '.join(result)

In [7]:
print(generate_text_rnn("emma was very"))

emma was very much pleased with her and as to be acquainted with her to be sure of the first time and the
