In [None]:
! pip install wikipedia

## Imports

In [None]:
import tensorflow as tf
import wikipedia
import re

## Loading a Wikipedia Article and Preprocessing it

In [3]:
docs = wikipedia.page("Egypt")

docs = re.sub(r'[^a-zA-Z\s]', '', docs.content)  # Remove punctuation and digits
docs = docs.lower().strip()
docs = re.sub(r'\s+', ' ', docs)
words = re.findall(r'\b\w+\b', docs)

## Extracting the Unique Words in the Document (Article)

In [4]:
# The unique words in the document
vocab = sorted(set(words))
print(f'{len(vocab)} unique words')

3261 unique words


## Mapping and Splitting

In [5]:
# Mapping from words to indices and vice versa
word_to_idx = {word: idx for idx, word in enumerate(vocab)}
idx_to_word = {idx: word for word, idx in word_to_idx.items()}

# Convert text to sequences of word indices
word_indices = [word_to_idx[word] for word in words]

## Preparing the Dataset to Feed the Model with it

In [6]:
sequence_length = 100
BATCH_SIZE = 64

# Create sequences of word indices
sequences = [word_indices[i:i+sequence_length+1] for i in range(len(word_indices)-sequence_length)]

# Pad sequences to have the same length
padded_sequences = tf.keras.preprocessing.sequence.pad_sequences(sequences, padding='post')

# Convert padded_sequences to a dataset
dataset = tf.data.Dataset.from_tensor_slices((padded_sequences[:, :-1], padded_sequences[:, 1:]))

# Shuffle and batch the dataset
dataset = dataset.shuffle(buffer_size=len(sequences)).batch(BATCH_SIZE)

## Defining and Training the Word-Based RNN Model

In [7]:
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=len(vocab), output_dim=256),
    tf.keras.layers.SimpleRNN(1024, return_sequences=True, return_state=False, recurrent_initializer='glorot_uniform'),
    tf.keras.layers.Dense(len(vocab))
])

model.compile(optimizer='adam', loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True))
history = model.fit(dataset, epochs=10)

Epoch 1/10
[1m  3/207[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m10s[0m 52ms/step - loss: 8.0462

I0000 00:00:1715275988.351917     116 device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.
W0000 00:00:1715275988.415370     116 graph_launch.cc:671] Fallback to op-by-op mode because memset node breaks graph update


[1m207/207[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 59ms/step - loss: 5.9373
Epoch 2/10
[1m  2/207[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m10s[0m 51ms/step - loss: 1.1399

W0000 00:00:1715276000.431542     116 graph_launch.cc:671] Fallback to op-by-op mode because memset node breaks graph update
W0000 00:00:1715276000.472133     116 graph_launch.cc:671] Fallback to op-by-op mode because memset node breaks graph update


[1m207/207[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 51ms/step - loss: 0.6532
Epoch 3/10
[1m207/207[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 51ms/step - loss: 0.1470
Epoch 4/10
[1m207/207[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 51ms/step - loss: 0.0942
Epoch 5/10
[1m207/207[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 51ms/step - loss: 0.0723
Epoch 6/10
[1m207/207[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 51ms/step - loss: 0.0604
Epoch 7/10
[1m207/207[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 51ms/step - loss: 0.0538
Epoch 8/10
[1m207/207[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 51ms/step - loss: 0.0495
Epoch 9/10
[1m207/207[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 51ms/step - loss: 0.0462
Epoch 10/10
[1m207/207[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 51ms/step - loss: 0.0452


## Defining the Word-Based Text Generation Function

In [17]:
# Reset the states of the model
def reset_states(model):
    for layer in model.layers:
        if hasattr(layer, 'reset_states'):
            layer.reset_states()

def generate_text(model, start_string, num_generate=100, temperature=1.0):
    input_eval = [word_to_idx[word] for word in start_string.split()]
    input_eval = tf.expand_dims(input_eval, 0)

    text_generated = []

    reset_states(model)
    
    for i in range(num_generate):
        predictions = model(input_eval)
        predictions = tf.squeeze(predictions, 0) / temperature
        predicted_id = tf.random.categorical(predictions, num_samples=1)[-1,0].numpy()
        
        input_eval = tf.expand_dims([predicted_id], 0)
        text_generated.append(idx_to_word[predicted_id])

    return start_string + ' '.join(text_generated)

## Generating Text Using the Trained Model

In [18]:
start_string = 'egypt'
generated_text = generate_text(model, start_string=start_string, num_generate=100, temperature=0.6)
print(generated_text)

egyptis a referendum during the s giving a surprise attack on january which was invaded egypt has been competitive in the british protectorate of the ottoman turks greeks bedouin arab world the french forces had ruled egypt was forced to the establishment of the next six million egyptians represented by the egyptian deep and the gaza strip in egypt has two strands of the century egypt has been described egypt was known as the total inhabited area of the egyptian museum and alifa rifaat who had captured alexandria in the gaza strip in the largest collection of the government as
