<a href="https://colab.research.google.com/github/GustavBoye/DRED_Autoencoder/blob/main/PredictNextWord.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [18]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Conv1D, Flatten, Dense, Dropout, BatchNormalization, Add, Input, LeakyReLU
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Load text from a file
with open("dataset.txt", "r", encoding="utf-8") as file:
    conversation_text = file.read()

print(f"Text length: {len(conversation_text)}")

# 📌 Preprocess text: Convert to lowercase and remove unwanted characters
text = conversation_text.lower()
text = ''.join(c for c in text if c.isalnum() or c in " .,!?")

# 📌 Tokenize the text: Convert words to tokens
tokenizer = Tokenizer(char_level=False)  # Use word-level tokenization
tokenizer.fit_on_texts([text])
sequences = tokenizer.texts_to_sequences([text])[0]

# 📌 Prepare training data (sequence of tokens → next token)
SEQ_LENGTH = 128  # Shorter sequences for quick testing, increase for better training
X_train = []
y_train = []

for i in range(len(sequences) - SEQ_LENGTH):
    seq_in = sequences[i:i+SEQ_LENGTH]   # Sequence of tokens
    seq_out = sequences[i+SEQ_LENGTH]    # Next token

    X_train.append(seq_in)
    y_train.append(seq_out)

# 📌 Convert to numpy arrays
X_train = np.array(X_train)
y_train = np.array(y_train)

# 📌 One-hot encode output labels
y_train = to_categorical(y_train, num_classes=len(tokenizer.word_index) + 1)  # +1 for zero-indexing

# 📌 Reshape input for CNN: [samples, time steps, features]
X_train = X_train.reshape((X_train.shape[0], SEQ_LENGTH, 1))

# 📌 Define the function to build the CNN model with ResNet and Strides
def residual_block(x, filters, kernel_size, strides=1):
    """Residual block with Conv1D, BatchNorm, and LeakyReLU"""
    shortcut = x

    # First conv layer
    y = Conv1D(filters=filters, kernel_size=kernel_size, strides=strides, padding='same')(x)
    y = BatchNormalization()(y)
    y = LeakyReLU(alpha=0.2)(y)

    # Second conv layer
    y = Conv1D(filters=filters, kernel_size=kernel_size, strides=1, padding='same')(y)  # Keep spatial size
    y = BatchNormalization()(y)

    # Adjust shortcut if needed
    if x.shape[-1] != filters or strides > 1:
        shortcut = Conv1D(filters=filters, kernel_size=1, strides=strides, padding='same')(x)
        shortcut = BatchNormalization()(shortcut)

    # Merge and activate
    y = Add()([shortcut, y])
    y = LeakyReLU(alpha=0.2)(y)

    return y

def build_resnet_model(input_shape, num_classes):
    input_layer = Input(shape=input_shape)

    x = residual_block(input_layer, filters=32, kernel_size=3, strides=2)
    x = residual_block(x, filters=64, kernel_size=3, strides=2)
    x = residual_block(x, filters=128, kernel_size=3, strides=2)
    x = residual_block(x, filters=256, kernel_size=3, strides=2)

    x = Flatten()(x)  # Better than Flatten for generalization
    x = Dense(128, activation='relu')(x)
    output_layer = Dense(num_classes, activation='softmax')(x)

    model = Model(inputs=input_layer, outputs=output_layer)
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

    return model

# 📌 Build and train the model
model = build_resnet_model(input_shape=(SEQ_LENGTH, 1), num_classes=len(tokenizer.word_index) + 1)

# 📌 Train model (increase epochs for better results)
model.fit(X_train, y_train, epochs=50, batch_size=256, verbose=1)

# 📌 Function to generate continuous text
def generate_text(start_sequence, num_tokens=200):
    sequence = tokenizer.texts_to_sequences([start_sequence.lower()])[0]
    if len(sequence) < SEQ_LENGTH:
        return "Input too short!"

    generated_text = start_sequence
    for _ in range(num_tokens):
        X_test = np.array([sequence[-SEQ_LENGTH:]]).reshape((1, SEQ_LENGTH, 1))
        prediction = model.predict(X_test, verbose=0)
        next_token = np.argmax(prediction)
        next_word = tokenizer.index_word.get(next_token, '')  # Retrieve the word from token index
        generated_text += ' ' + next_word
        sequence.append(next_token)

    return generated_text

# 📌 Example usage
start_text = conversation_text[-SEQ_LENGTH-12:]  # Use the last part of the conversation
predicted_text = generate_text(start_text, num_tokens=200)
print(f"Generated text:\n{predicted_text}")


Text length: 277916
Epoch 1/50
[1m175/175[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m85s[0m 418ms/step - accuracy: 0.0836 - loss: 7.0909
Epoch 2/50
[1m175/175[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m80s[0m 410ms/step - accuracy: 0.1035 - loss: 6.3951
Epoch 3/50
[1m175/175[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m84s[0m 419ms/step - accuracy: 0.1045 - loss: 6.1508
Epoch 4/50
[1m175/175[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m83s[0m 426ms/step - accuracy: 0.1070 - loss: 5.8137
Epoch 5/50
[1m175/175[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m80s[0m 413ms/step - accuracy: 0.1075 - loss: 5.3637
Epoch 6/50
[1m175/175[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m71s[0m 407ms/step - accuracy: 0.1256 - loss: 4.7275
Epoch 7/50
[1m175/175[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m83s[0m 413ms/step - accuracy: 0.2281 - loss: 3.9695
Epoch 8/50
[1m175/175[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 409ms/step - accuracy: 0.2948 - loss: 3.4

In [None]:
model.fit(X_train, y_train, epochs=15, batch_size=256, verbose=1)

# save model
model.save('model.keras')

Epoch 1/15
[1m175/175[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m72s[0m 413ms/step - accuracy: 1.0000 - loss: 0.0013
Epoch 2/15
[1m 63/175[0m [32m━━━━━━━[0m[37m━━━━━━━━━━━━━[0m [1m45s[0m 404ms/step - accuracy: 1.0000 - loss: 0.0011

In [19]:
# 📌 Example usage
print(SEQ_LENGTH)
start_text = conversation_text[-SEQ_LENGTH-800:]
print(start_text)
print(len(start_text))
predicted_text = generate_text(start_text, num_tokens=400)
print(f"Generated text:\n{predicted_text}")

128
oject Gutenberg™ electronic works

Professor Michael S. Hart was the originator of the Project
Gutenberg™ concept of a library of electronic works that could be
freely shared with anyone. For forty years, he produced and
distributed Project Gutenberg™ eBooks with only a loose network of
volunteer support.

Project Gutenberg™ eBooks are often created from several printed
editions, all of which are confirmed as not protected by copyright in
the U.S. unless a copyright notice is included. Thus, we do not
necessarily keep eBooks in compliance with any particular paper
edition.

Most people start at our website which has the main PG search
facility: www.gutenberg.org.

This website includes information about Project Gutenberg™,
including how to make donations to the Project Gutenberg Literary
Archive Foundation, how to help produce our new eBooks, and how to
subscribe to our email newsletter to hear about new eBooks.



928
Generated text:
oject Gutenberg™ electronic works

Professor Mi