<a href="https://colab.research.google.com/github/MohamedSamirHassanPhD/Samir-character-based-LM-using-keras/blob/main/Samir_character_based_LM_using%C2%A0keras.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Install TensorFlow for Keras
!pip install tensorflow




In [2]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Example corpus
text = '''Mohamed  quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog.'''

# Create a set of characters
chars = sorted(set(text))
print(f'Number of unique characters: {len(chars)}')

# Create mappings from characters to integers and vice versa
char_to_int = {ch: i for i, ch in enumerate(chars)}
int_to_char = {i: ch for i, ch in enumerate(chars)}

# Convert text to integers
encoded_text = [char_to_int[ch] for ch in text]
print(f'Encoded text: {encoded_text[:50]}')


Number of unique characters: 30
Encoded text: [2, 18, 11, 4, 16, 8, 7, 0, 0, 20, 24, 12, 6, 14, 0, 5, 21, 18, 26, 17, 0, 9, 18, 27, 0, 13, 24, 16, 19, 22, 0, 18, 25, 8, 21, 0, 23, 11, 8, 0, 15, 4, 29, 28, 0, 7, 18, 10, 1, 0]


In [3]:
seq_length = 20  # length of the input sequences
X = []
y = []

# Create input/output pairs
for i in range(0, len(encoded_text) - seq_length, 1):
    X.append(encoded_text[i:i + seq_length])
    y.append(encoded_text[i + seq_length])

# Reshape X for LSTM input
X = np.array(X)
X = X / float(len(chars))  # Normalize the input data (scale between 0 and 1)

# One-hot encode the output labels
y = tf.keras.utils.to_categorical(y, num_classes=len(chars))

print(f'X shape: {X.shape}, y shape: {y.shape}')


X shape: (74, 20), y shape: (74, 30)


In [4]:
# Build the LSTM model
model = tf.keras.Sequential([
    tf.keras.layers.LSTM(128, input_shape=(X.shape[1], 1), return_sequences=True),
    tf.keras.layers.LSTM(128),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(len(chars), activation='softmax')  # Output layer with softmax
])

# Compile the model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Reshape X for LSTM input (it expects 3D input)
X = X.reshape((X.shape[0], X.shape[1], 1))

# Train the model
model.fit(X, y, epochs=50, batch_size=64)


  super().__init__(**kwargs)


Epoch 1/50
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 56ms/step - accuracy: 0.1834 - loss: 3.3943
Epoch 2/50
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 51ms/step - accuracy: 0.2028 - loss: 3.3640
Epoch 3/50
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 54ms/step - accuracy: 0.2081 - loss: 3.3202
Epoch 4/50
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 51ms/step - accuracy: 0.2028 - loss: 3.2442
Epoch 5/50
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 55ms/step - accuracy: 0.1924 - loss: 3.1788 
Epoch 6/50
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 49ms/step - accuracy: 0.2081 - loss: 3.1746
Epoch 7/50
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 55ms/step - accuracy: 0.2028 - loss: 3.1624
Epoch 8/50
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 50ms/step - accuracy: 0.2028 - loss: 3.1026
Epoch 9/50
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m 

<keras.src.callbacks.history.History at 0x797e3db227b0>

In [5]:
# Function to generate text based on the trained model
def generate_text(model, seed_text, length=100):
    result = seed_text
    # Generate characters one by one
    for _ in range(length):
        # Convert the seed text into integer sequence
        input_seq = [char_to_int[ch] for ch in seed_text]
        input_seq = np.array(input_seq) / float(len(chars))  # Normalize
        input_seq = input_seq.reshape(1, len(input_seq), 1)

        # Predict the next character
        predicted_probs = model.predict(input_seq, verbose=0)
        predicted_char_index = np.argmax(predicted_probs)

        # Convert the predicted index to a character
        predicted_char = int_to_char[predicted_char_index]
        result += predicted_char

        # Update the seed text to include the new character
        seed_text = seed_text[1:] + predicted_char

    return result

# Generate text with a seed
seed_text = "The quick brown fox"
generated_text = generate_text(model, seed_text, length=100)
print(f'Generated Text: {generated_text}')


Generated Text: The quick brown fox                                                                                                    
