<a href="https://colab.research.google.com/github/Koks-creator/TextGenerationModel/blob/main/TextGenModelFinal.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>


**1. Download data and create datasets**

---




In [None]:
import tensorflow as tf
from tensorflow.python.framework.ops import EagerTensor
import numpy as np
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from typing import Union, Tuple
import matplotlib.pyplot as plt
import pickle

In [None]:
# "https://www.gutenberg.org/files/1342/1342-0.txt",  # Pride and Prejudice
# "https://www.gutenberg.org/files/84/84-0.txt",      # Frankenstein
# "https://www.gutenberg.org/files/1661/1661-0.txt",  # Sherlock Holmes
# NOTE: the WikiText-2 training file is large; expanding it into overlapping
# character windows later in the notebook can use a lot of RAM.
dataset_url = "https://raw.githubusercontent.com/pytorch/examples/main/word_language_model/data/wikitext-2/train.txt"
# get_file downloads the URL into the local Keras cache and returns the path of the cached file.
filepath = tf.keras.utils.get_file("wikitext2.txt", dataset_url)

In [None]:
# PARAMS
AUTOTUNE = tf.data.AUTOTUNE
VAL_TEST_SIZE = 0.2   # fraction of all windows held out for validation + test
TEST_SIZE = 0.5       # fraction of that hold-out used as the test set
PREFIX = "wikitext2"  # file-name prefix for every saved artifact

# HYPER PARAMS
RNN_UNITS_LIST = (256, 256)  # one entry per LSTM layer, in stacking order
# Derived instead of hand-maintained: CharRNN pairs layers with unit counts,
# so a mismatch between the two settings would silently drop layers.
RNN_LAYERS = len(RNN_UNITS_LIST)
SEQ_LENGTH = 40 # len of context
BATCH_SIZE = 32
EMBEDDING_DIM = 512
EPOCHS = 10

In [None]:
# Read the raw bytes and decode them explicitly; the context manager makes sure
# the file handle is closed (the bare open(...).read() left it open).
with open(filepath, 'rb') as corpus_file:
    text = corpus_file.read().decode(encoding='utf-8')

# Keep only the first third of the corpus (the full file is large).
text = text[0:len(text) // 3]
print(f"len: {len(text):,} chars")
print(text[:500])

Długość: 3,593,479 znaków
 
 = Valkyria Chronicles III = 
 
 Senjō no Valkyria 3 : <unk> Chronicles ( Japanese : 戦場のヴァルキュリア3 , lit . Valkyria of the Battlefield 3 ) , commonly referred to as Valkyria Chronicles III outside Japan , is a tactical role @-@ playing video game developed by Sega and Media.Vision for the PlayStation Portable . Released in January 2011 in Japan , it is the third game in the Valkyria series . <unk> the same fusion of tactical and real @-@ time gameplay as its predecessors , the story runs paralle


Create vocabulary

In [None]:
# Vocabulary = every distinct character of the corpus, in sorted order.
# sorted() already returns a list, so no intermediate list() is needed.
chars = sorted(set(text))
vocab_size = len(chars)
print(f"Unique chars: {vocab_size}")

Unikalne znaki: 186


In [None]:
# Position in `chars` is the integer id of a character.
idx_to_char = dict(enumerate(chars))
# `chars` holds unique characters, so inverting the mapping loses nothing.
char_to_idx = {symbol: index for index, symbol in idx_to_char.items()}

In [None]:
# Encode the whole corpus as a 1-D array of character ids.
text_as_int = np.array([char_to_idx[symbol] for symbol in text])

In [None]:
def create_training_data(text: np.ndarray, seq_length: int) -> Tuple[np.ndarray, np.ndarray]:
    """
    Build next-character training pairs with a sliding window of stride 1.

    For every start offset i the input is text[i:i + seq_length] and the
    target is the same window shifted one position to the right. Example with
    seq_length=10 over "Hello world":
    input:  "Hello worl"
    target: "ello world"

    Args:
        text: 1-D sequence of encoded characters.
        seq_length: number of characters per window; must be >= 1.

    Returns:
        (examples, targets): two arrays of shape (num_windows, seq_length),
        where num_windows = max(len(text) - seq_length, 0).

    Raises:
        ValueError: if seq_length is smaller than 1.
    """
    if seq_length < 1:
        raise ValueError(f"seq_length must be >= 1, got {seq_length}")

    text = np.asarray(text)

    # Not enough characters for even one (input, target) pair: return empty
    # arrays that still carry the expected second dimension.
    if len(text) <= seq_length:
        empty = np.empty((0, seq_length), dtype=text.dtype)
        return empty, empty.copy()

    # Each row holds seq_length + 1 consecutive characters: the first
    # seq_length are the input, the last seq_length are the target.
    # This replaces a Python loop that created one slice object per window.
    windows = np.lib.stride_tricks.sliding_window_view(text, seq_length + 1)

    # sliding_window_view returns a read-only view; copy so callers get
    # ordinary, writeable arrays exactly like before.
    return windows[:, :-1].copy(), windows[:, 1:].copy()

# Slice the encoded corpus into (input window, shifted target window) pairs.
X, y = create_training_data(text=text_as_int, seq_length=SEQ_LENGTH)

In [None]:
# Neighbouring windows differ by a single character, so a shuffled split puts
# near-duplicates of almost every training window into validation and test and
# inflates their scores. Split contiguously instead: the leading part of the
# corpus is used for training, the hold-out comes from the remaining tail.
X_train, X_val_test, y_train, y_val_test = train_test_split(
    X, y, test_size=VAL_TEST_SIZE, shuffle=False
)

X_val, X_test, y_val, y_test = train_test_split(
    X_val_test, y_val_test, test_size=TEST_SIZE, shuffle=False
)

# The training windows are now in corpus order. Permute them once so that a
# bounded shuffle buffer downstream does not yield batches drawn from a single
# region of the text. Inputs and targets are reordered with the same index.
SPLIT_SEED = 42
train_order = np.random.default_rng(SPLIT_SEED).permutation(len(X_train))
X_train = X_train[train_order]
y_train = y_train[train_order]

In [None]:
# Sanity check: inputs and targets of each split must have matching lengths.
for features, labels in ((X_train, y_train), (X_val, y_val), (X_test, y_test)):
    print(len(features), len(labels))

4312142 4312142
539018 539018
539018 539018


In [None]:
# Decode the second window back to text: first the target, then the input.
# The target is the input shifted by one character.
for window in (y[1], X[1]):
    print("".join(idx_to_char[char_id] for char_id in window))

 = Valkyria Chronicles III = 
 
 Senjō n

 = Valkyria Chronicles III = 
 
 Senjō 


In [None]:
# The arrays already live in memory, so no cache() is used. Placed after
# shuffle(), cache() would also store the first epoch's batches and replay the
# same order in every later epoch.
dataset_train = tf.data.Dataset.from_tensor_slices((X_train, y_train))
dataset_train = dataset_train.shuffle(10000).batch(BATCH_SIZE).prefetch(AUTOTUNE)

# Validation and test are only evaluated, so their order does not matter and
# they are not shuffled.
dataset_val = tf.data.Dataset.from_tensor_slices((X_val, y_val))
dataset_val = dataset_val.batch(BATCH_SIZE).prefetch(AUTOTUNE)

# Targets must be y_test; pairing X_test with itself would score the model
# against its own inputs.
dataset_test = tf.data.Dataset.from_tensor_slices((X_test, y_test))
dataset_test = dataset_test.batch(BATCH_SIZE).prefetch(AUTOTUNE)

**2. Build model**

---



In [None]:
"""
    Character-level RNN for text generation.

    How generation works:

    START: “Hello”

    Iteration 1:
      Input: “Hello” → Model → Predictions for each position
      We take the last [-1] → Probabilities of the next character
      We draw: ‘ ’
      Result: “Hello ”

    Iteration 2:
      Input: ‘ ’ (+ hidden states) → Model → Prediction
      We draw: 'w'
      Result: “Hello w”

    Iteration 3:
      Input: ‘w’ (+ hidden states) → Model → Prediction
      We draw: ‘o’
      Result: “Hello wo”

    ...and so on until generation_length...

    RESULT: "Hello world! This is generated..."

    Key points:
    - The first iteration processes the entire start_string
    - Subsequent iterations: only 1 character + hidden states from the previous step (the model “remembers” the context through LSTM states, not by reprocessing the entire text).
    - Temperature controls randomness (lower = more predictable)
"""

class CharRNN(tf.keras.Model):
    """Character-level language model: Embedding -> stacked LSTM layers -> Dense.

    The final Dense layer has ``vocab_size`` units and no activation, so the
    model returns one vector of logits per input position.

    Args:
        vocab_size: number of distinct characters (embedding rows and output logits).
        embedding_dim: size of each character embedding vector.
        rnn_units: number of units of each LSTM layer, in stacking order.
        num_layers: how many leading entries of ``rnn_units`` to use.

    Raises:
        ValueError: if ``num_layers`` is negative or larger than ``len(rnn_units)``.
    """

    def __init__(self,
                 vocab_size: int,
                 embedding_dim: int = 128,
                 rnn_units: tuple = (128, 64),
                 num_layers: int = 2):
        super().__init__()

        # Fail loudly instead of silently building fewer layers than requested.
        if num_layers < 0 or num_layers > len(rnn_units):
            raise ValueError(
                f"num_layers={num_layers} must be between 0 and "
                f"len(rnn_units)={len(rnn_units)}"
            )

        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.num_layers = num_layers

        # return_sequences=True: every position gets an output (needed for the
        # per-position targets). return_state=True: the final (h, c) states are
        # returned as well so generation can carry them from step to step.
        self.lstm_layers = [
            tf.keras.layers.LSTM(
                units,
                return_sequences=True,
                return_state=True,
                dropout=0.4
            )
            for units in rnn_units[:num_layers]
        ]

        self.dense = tf.keras.layers.Dense(vocab_size)

    def call(self, inputs, training=False):
        """Forward pass used for training and evaluation.

        Args:
            inputs: integer character ids; assumed shape (batch, seq_len).
            training: forwarded to every sub-layer.

        Returns:
            Logits from the final Dense layer, one vector per input position.
        """
        x = self.embedding(inputs, training=training)
        for lstm in self.lstm_layers:
            # The returned states are not needed here; every call starts fresh.
            x, _, _ = lstm(x, training=training)
        return self.dense(x, training=training)

    def generate_step(self, inputs, states=None):
        """Forward pass used for generation; carries the LSTM states along.

        Args:
            inputs: integer character ids; assumed shape (batch, seq_len).
            states: flat list ``[h_0, c_0, h_1, c_1, ...]`` as returned by a
                previous call, or None/empty to start from default states.

        Returns:
            (logits, states): the Dense output and the new flat state list in
            the same ``[h, c]``-per-layer layout.
        """
        x = self.embedding(inputs, training=False)

        new_states = []
        for i, lstm in enumerate(self.lstm_layers):
            h_idx, c_idx = 2 * i, 2 * i + 1
            # Use the stored states for this layer only when both are present.
            layer_states = None
            if states and c_idx < len(states):
                layer_states = [states[h_idx], states[c_idx]]

            x, h, c = lstm(x, initial_state=layer_states, training=False)
            new_states.extend([h, c])

        return self.dense(x), new_states

    def build(self, input_shape):
        """Intentionally does nothing.

        Defining ``build`` prevents:
        'UserWarning: `build()` was called on layer 'char_rnn_7', however the layer does not have a `build()` method implemented and it looks like it has unbuilt state'

        The sub-layers create their own weights the first time they are called.
        """
        return

    def generate(self, start_string: str, generation_length: int = 100, temperature: float = 1.0) -> str:
        """
        Generates text character by character.

        The whole ``start_string`` is processed in the first step; every later
        step feeds only the character sampled last, together with the LSTM
        states returned by the previous step.

        Uses the module-level ``char_to_idx`` / ``idx_to_char`` mappings.

        Args:
            start_string: non-empty prompt; every character must be in the vocabulary.
            generation_length: number of characters to append to the prompt.
            temperature: controls "creativity"; logits are divided by it
                - 0.5 = more predictable
                - 1.0 = balanced
                - 2.0 = more random

        Returns:
            ``start_string`` followed by the generated characters.

        Raises:
            ValueError: if ``start_string`` is empty or ``temperature`` is not positive.
            KeyError: if ``start_string`` contains characters missing from ``char_to_idx``.
        """
        if not start_string:
            raise ValueError("start_string must contain at least one character")
        if temperature <= 0:
            raise ValueError(f"temperature must be > 0, got {temperature}")

        unknown = sorted({ch for ch in start_string if ch not in char_to_idx})
        if unknown:
            raise KeyError(f"start_string contains characters missing from the vocabulary: {unknown}")

        # Convert the prompt to ids and add the batch dimension.
        input_eval = tf.expand_dims([char_to_idx[ch] for ch in start_string], 0)

        generated = []
        states = None

        for _ in range(generation_length):
            predictions, states = self.generate_step(input_eval, states=states)

            # Keep only the logits of the last position (batch dimension is
            # kept, as tf.random.categorical expects a 2-D input) and scale.
            last_logits = predictions[:, -1, :] / temperature

            # Sample from the distribution (not argmax!): a higher logit means
            # a higher chance, but any character can be drawn.
            predicted_id = int(
                tf.random.categorical(last_logits, num_samples=1)[-1, 0].numpy()
            )

            # The next input is just the generated character.
            input_eval = tf.expand_dims([predicted_id], 0)

            generated.append(idx_to_char[predicted_id])

        return start_string + ''.join(generated)

In [None]:
# Instantiate the network from the hyper-parameters defined at the top.
model = CharRNN(
    vocab_size=vocab_size, embedding_dim=EMBEDDING_DIM,
    rnn_units=RNN_UNITS_LIST, num_layers=RNN_LAYERS,
)

256
256


In [None]:
# The model's Dense output has no activation, so the loss receives raw logits.
sparse_ce_loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(optimizer='adam', loss=sparse_ce_loss, metrics=['accuracy'])

**3. Train model**

---



In [None]:
# Each epoch takes a long time, so keep the best weights on disk and stop once
# validation loss stops improving; an interrupted run then still leaves a
# usable checkpoint behind.
history = model.fit(
    dataset_train,
    validation_data=dataset_val,
    epochs=EPOCHS,
    callbacks=[
        # Weights only: this notebook restores the model from weights, not
        # from a serialized model file.
        tf.keras.callbacks.ModelCheckpoint(
            f"{PREFIX}_best.weights.h5",
            monitor="val_loss",
            save_best_only=True,
            save_weights_only=True,
        ),
        # Patience must be smaller than EPOCHS to ever trigger.
        tf.keras.callbacks.EarlyStopping(
            monitor="val_loss",
            patience=3,
            restore_best_weights=True,
        ),
    ],
)

Epoch 1/10
[1m89836/89836[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1039s[0m 11ms/step - accuracy: 0.5402 - loss: 1.5850 - val_accuracy: 0.6203 - val_loss: 1.2773
Epoch 2/10
[1m89836/89836[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1026s[0m 11ms/step - accuracy: 0.6030 - loss: 1.3399 - val_accuracy: 0.6274 - val_loss: 1.2499
Epoch 3/10
[1m89836/89836[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1067s[0m 12ms/step - accuracy: 0.6087 - loss: 1.3174 - val_accuracy: 0.6307 - val_loss: 1.2386
Epoch 4/10
[1m89836/89836[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1014s[0m 11ms/step - accuracy: 0.6114 - loss: 1.3070 - val_accuracy: 0.6318 - val_loss: 1.2335
Epoch 5/10
[1m89836/89836[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1015s[0m 11ms/step - accuracy: 0.6129 - loss: 1.3013 - val_accuracy: 0.6330 - val_loss: 1.2285
Epoch 6/10
[1m89836/89836[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1017s[0m 11ms/step - accuracy: 0.6138 - loss: 1.2976 - val_accuracy: 0.6338

KeyboardInterrupt: 

**4. Show training history**

---



In [None]:
# One figure per tracked metric: training curve first, validation curve second.
for metric_name, figure_title in (("loss", "Loss history"), ("accuracy", "accuracy history")):
    plt.plot(history.history[metric_name])
    plt.plot(history.history[f"val_{metric_name}"])
    plt.title(figure_title)
    plt.ylabel("Value")
    plt.xlabel("Epoch")
    plt.legend(['Train', 'Validation'], loc='upper left')
    plt.show()

NameError: name 'history' is not defined

**5. Test and save artifacts**

---



In [None]:
# Sample a continuation of a fixed prompt for each temperature to try.
for temp in (0.5,):
    print(f"\nTemperature = {temp}:")
    generated = model.generate(start_string="Come and see", generation_length=100, temperature=temp)
    print(generated)


Temperature = 0.5:
Come and seen a company of the city . The prediction that the first form of the area of the Catechism . 
 
 = = 


In [None]:
# Persist both lookup tables, one pickle file per mapping.
for mapping_name, mapping in (("char_to_idx", char_to_idx), ("idx_to_char", idx_to_char)):
    with open(f"{PREFIX}_{mapping_name}.pkl", "wb") as mapping_file:
        pickle.dump(mapping, mapping_file)

In [None]:
# Weights are passed positionally, so they are stored in model order and are
# read back later under the keys arr_0, arr_1, ...
weights = model.get_weights()
np.savez(f"{PREFIX}_numpy_weights.npz", *weights)

# Everything needed to rebuild an identically shaped CharRNN and to
# encode/decode text for it.
model_config = dict(
    char_to_idx=char_to_idx,
    idx_to_char=idx_to_char,
    vocab_size=vocab_size,
    embedding_dim=EMBEDDING_DIM,
    rnn_units=RNN_UNITS_LIST,
    num_layers=RNN_LAYERS,
)
with open(f"{PREFIX}_config.pkl", "wb") as config_file:
    pickle.dump(model_config, config_file)

**6. Read artifacts, build model and test again**

In [None]:
def _read_pickle(path):
    """Return the object stored in the pickle file at `path`."""
    # These files were written by this notebook; never unpickle untrusted files.
    with open(path, "rb") as pickle_file:
        return pickle.load(pickle_file)


char_to_idx = _read_pickle(f"{PREFIX}_char_to_idx.pkl")
idx_to_char = _read_pickle(f"{PREFIX}_idx_to_char.pkl")
config = _read_pickle(f"{PREFIX}_config.pkl")

In [None]:
# `config` was already loaded from the config pickle in the previous cell, so
# it is not read a second time here.
model_new = CharRNN(
    vocab_size=config["vocab_size"],
    embedding_dim=config["embedding_dim"],
    rnn_units=config["rnn_units"],
    num_layers=config["num_layers"]
)

# Call the model once on a dummy batch so every layer creates its weights;
# set_weights() needs them to exist.
dummy_input = tf.zeros((1, 10), dtype=tf.int32)
_ = model_new(dummy_input)

# np.load on an .npz keeps the archive open; the context manager closes it
# once all arrays have been read, in their saved order arr_0, arr_1, ...
with np.load(f"{PREFIX}_numpy_weights.npz") as loaded:
    weights = [loaded[f'arr_{i}'] for i in range(len(loaded.files))]

model_new.set_weights(weights)

for temp in [.5]:
    print(f"\nTemperature = {temp}:")
    generated = model_new.generate(
        start_string="those who baelieve",
        generation_length=100,
        temperature=temp
    )
    print(generated)

256
256
Liczba wag w modelu: 9
Liczba wag do wczytania: 9

Temperature = 0.5:
those who baelieve my labours and the extreme delight of the events of the only crimes, and
I felt a lovely discovery 
