In [None]:
! pip install wikipedia

## Imports

In [None]:
import tensorflow as tf
import wikipedia
import re

## Loading a Wikipedia Article and Preprocessing it

In [3]:
# Loading a wikipedia article about Egypt
docs = wikipedia.page("Egypt")

# Preprocessing the article
docs = re.sub(r'[^a-zA-Z\s]', '', docs.content)  # Remove punctuation and digits
docs = docs.lower().strip()
docs = re.sub(r'\s+', ' ', docs)

## Extracting the Unique Characters in the Document (Article)

In [4]:
vocab = sorted(set(docs))
print(f'{len(vocab)} unique characters')

27 unique characters


## Encoding the Unique Characters

In [5]:
chars = tf.strings.unicode_split(docs, input_encoding='UTF-8')
print(chars)

tf.Tensor([b'e' b'g' b'y' ... b'y' b'p' b't'], shape=(82864,), dtype=string)


## Mapping From Unique Characters to Indices

In [6]:
ids_from_chars = tf.keras.layers.StringLookup(vocabulary=list(vocab), mask_token=None)
ids = ids_from_chars(chars)
print(ids)

tf.Tensor([ 6  8 26 ... 26 17 21], shape=(82864,), dtype=int64)


## Mapping the Indices Back to Characters

In [7]:
chars_from_ids = tf.keras.layers.StringLookup(vocabulary=ids_from_chars.get_vocabulary(), invert=True, mask_token=None)
chars = chars_from_ids(ids)
print(chars)

tf.Tensor([b'e' b'g' b'y' ... b'y' b'p' b't'], shape=(82864,), dtype=string)


## Preparing the Dataset to Feed the Model with it

In [8]:
# Creates a TensorFlow dataset from the Ids tensor. Each element in Ids will be treated as a separate element in the dataset.
char_dataset = tf.data.Dataset.from_tensor_slices(ids)

# Batch the dataset such that each element in the dataset is of length 100
sequence_length = 100
sequences = char_dataset.batch(sequence_length + 1, drop_remainder=True)

# split the chunk of given text into input and target sequences so that the model can learn to predict the next character in the sequence.
def split_input_target(chunk):
    input_text = chunk[:-1]
    target_text = chunk[1:]
    return input_text, target_text

# Applying the split_input_target function to the dataset
dataset = sequences.map(split_input_target)

# Batch the dataset
BATCH_SIZE = 64
dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)

## Defining and Training the Character-Based RNN Model

In [13]:
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(len(vocab)+1, output_dim=256),
    tf.keras.layers.SimpleRNN(1024, return_sequences=True, return_state=False, recurrent_initializer='glorot_uniform'),
    tf.keras.layers.Dense(len(vocab)+1)
])
model.compile(optimizer='adam', loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True))
history = model.fit(dataset, epochs=40)
model.summary()

Epoch 1/40
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 896ms/step - loss: 3.3752
Epoch 2/40
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 931ms/step - loss: 2.7992
Epoch 3/40
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 912ms/step - loss: 2.5752
Epoch 4/40
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 941ms/step - loss: 2.4193
Epoch 5/40
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 892ms/step - loss: 2.3272
Epoch 6/40
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 890ms/step - loss: 2.2596
Epoch 7/40
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 923ms/step - loss: 2.2180
Epoch 8/40
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 894ms/step - loss: 2.1702
Epoch 9/40
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 929ms/step - loss: 2.1325
Epoch 10/40
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 897ms

## Defining the Character-Based Text Generation Function

In [14]:
# Reset the states of the model
def reset_states(model):
    for layer in model.layers:
        if hasattr(layer, 'reset_states'):
            layer.reset_states()

# Define a function to generate text
def generate_text(model, start_string, num_generate=1000, temperature=1.0):
    input_eval = [ids_from_chars([s]) for s in start_string]
    input_eval = tf.expand_dims(input_eval, 0)
    text_generated = []

    reset_states(model)

    for i in range(num_generate):
        predictions = model(input_eval)
        predictions = tf.squeeze(predictions, 0)
        # Use a categorical distribution to predict the character returned by the model
        predictions = predictions / temperature
        predicted_id = tf.random.categorical(predictions, num_samples=1)[-1,0].numpy()

        # Pass the predicted character as the next input to the model
        input_eval = tf.expand_dims([predicted_id], 0)
        text_generated.append(chars_from_ids([predicted_id]).numpy()[0].decode('utf-8'))

    return start_string + ''.join(text_generated)

## Generating Text Using the Trained Model

In [20]:
start_string = 'The Pyramids'
generated_text = generate_text(model, start_string=start_string, num_generate=500, temperature=0.6)
print(generated_text)

The Pyramids w ad athenthe thed alathethexeded the rid coris d aten n atusthes tet ede rist therod tw mond ithed aks s e lin ce s o asthen tites an s hand ed res cestind aly th a fralind on cere ly thende athalis thaly tithubby ches basth tin ale n ced the wen at hend ive d it tin athend tivexe gheneratha megutis in thind thestend onexa axalis the jorinirin rok ff the this thas azare t wathe ce as thes heres wats are pre w a tinthathe atin t ach ithin porind ond ome sthaks se alin cond w we cona tw in ineri
