<a href="https://colab.research.google.com/github/Galahexolion/Giliojo-mokymo-sistem-taikymai/blob/main/Lab8_LSTM/Lab8_LSTM_Teksto_generavimas.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# --- 1. DUOMENŲ PARUOŠIMAS ---

import numpy as np
import random
import io
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# Fetch the corpus file and read it as lower-cased text
path = keras.utils.get_file(
    "nietzsche.txt",
    origin="https://s3.amazonaws.com/text-datasets/nietzsche.txt",
)
with open(path, encoding="utf-8") as f:
    text = f.read().lower()
print(f"Teksto ilgis: {len(text)} simbolių")

# Build the vocabulary: every distinct character, sorted, plus lookup
# tables in both directions (character -> index and index -> character)
chars = sorted(set(text))
print(f"Unikalių simbolių: {len(chars)}")
char_indices = {c: i for i, c in enumerate(chars)}
indices_char = dict(enumerate(chars))

# Slice the text into overlapping fixed-length windows for training.
# Each window in `sentences` is paired with the character that immediately
# follows it in the text (`next_chars`), which is the prediction target.
maxlen = 40  # Window length in characters
step = 3     # Offset between the starts of consecutive windows
sentences = [text[i : i + maxlen] for i in range(0, len(text) - maxlen, step)]
next_chars = [text[i + maxlen] for i in range(0, len(text) - maxlen, step)]
print(f"Mokymo sekų skaičius: {len(sentences)}")

# Vectorization (one-hot encoding)
# This uses a lot of memory, so only part of the data is used for the
# demonstration (e.g. the first 50000 sequences).
# If you have plenty of RAM, you can raise `limit` to use all the data.
limit = 50000

# Never allocate more rows than there are sequences: surplus rows would stay
# all-zero and be fed to training as meaningless samples.
num_samples = min(limit, len(sentences))

# x[i, t, c] is True when position t of window i holds character index c;
# y[i, c] is True when the character following window i has index c.
x = np.zeros((num_samples, maxlen, len(chars)), dtype=bool)
y = np.zeros((num_samples, len(chars)), dtype=bool)
for i, sentence in enumerate(sentences[:num_samples]):
    for t, char in enumerate(sentence):
        x[i, t, char_indices[char]] = True
    y[i, char_indices[next_chars[i]]] = True

Downloading data from https://s3.amazonaws.com/text-datasets/nietzsche.txt
[1m600901/600901[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step
Teksto ilgis: 600893 simbolių
Unikalių simbolių: 57
Mokymo sekų skaičius: 200285


In [2]:
# --- 2. LSTM MODEL FOR TEXT ---

# Assemble the network layer by layer: one-hot character windows go in,
# a probability for every character in the vocabulary comes out.
model_text = keras.Sequential()
model_text.add(keras.Input(shape=(maxlen, len(chars))))
model_text.add(layers.LSTM(128))  # LSTM layer with 128 units
model_text.add(layers.Dense(len(chars), activation="softmax"))  # Per-character probabilities

model_text.compile(optimizer="adam", loss="categorical_crossentropy")
model_text.summary()

In [3]:
# --- 3. TEXT GENERATION FUNCTION ---

def sample(preds, temperature=1.0):
    """Draw one index from a probability vector after temperature rescaling.

    Every probability is raised to the power ``1 / temperature`` and the
    result is renormalised before sampling, so temperatures below 1 sharpen
    the distribution and temperatures above 1 flatten it. The randomness keeps
    generated text from being monotonous.

    Args:
        preds: Array-like of non-negative probabilities (e.g. a softmax output).
        temperature: Non-negative scaling factor. ``0`` deterministically
            selects the most probable index.

    Returns:
        The sampled index, as produced by ``np.argmax``.

    Raises:
        ValueError: If ``temperature`` is negative or ``preds`` contains no
            positive entry.
    """
    preds = np.asarray(preds, dtype="float64")
    if temperature < 0:
        raise ValueError("temperature must be non-negative")
    if temperature == 0:
        # Limit of the rescaling as temperature -> 0: greedy choice.
        return np.argmax(preds)

    positive = preds > 0
    if not positive.any():
        raise ValueError("preds must contain at least one positive probability")

    # Take the log only of positive entries; zero-probability entries keep a
    # logit of -inf (probability exactly 0) without a divide-by-zero warning.
    logits = np.full(preds.shape, -np.inf)
    logits[positive] = np.log(preds[positive]) / temperature

    # Shift by the max so the largest term is exp(0) = 1; this keeps the sum
    # non-zero even when a small temperature makes the other terms underflow.
    logits -= logits.max()
    exp_preds = np.exp(logits)
    probas = np.random.multinomial(1, exp_preds / exp_preds.sum(), 1)
    return np.argmax(probas)

In [4]:
# --- 4. TRAINING AND GENERATION ---

epochs = 5  # Reduced for demonstration (40 in the original)
batch_size = 128
generation_length = 200  # Characters generated per sample


def generate_text(model, seed, length=200, temperature=1.0):
    """Generate text one character at a time from a seed string.

    The last ``maxlen`` characters of ``seed`` form the initial context
    window. On every step the window is one-hot encoded, the model predicts
    a distribution over the next character, one character is drawn with
    ``sample`` and the window slides forward by one character.

    Relies on the notebook-level ``maxlen``, ``chars``, ``char_indices``,
    ``indices_char`` and ``sample``.

    Args:
        model: Model whose ``predict`` takes a ``(1, maxlen, len(chars))``
            array and returns one row of per-character probabilities.
        seed: Starting text; must contain at least ``maxlen`` characters, all
            of which are present in ``char_indices``.
        length: Number of characters to generate.
        temperature: Sampling temperature forwarded to ``sample``.

    Returns:
        The generated characters as a string (the seed is not included).

    Raises:
        ValueError: If ``seed`` is shorter than ``maxlen``.
    """
    window = seed[-maxlen:]
    if len(window) < maxlen:
        raise ValueError(f"seed must contain at least {maxlen} characters")

    pieces = []
    for _ in range(length):
        # One-hot encode the current context window
        x_pred = np.zeros((1, maxlen, len(chars)))
        for t, char in enumerate(window):
            x_pred[0, t, char_indices[char]] = 1.0

        preds = model.predict(x_pred, verbose=0)[0]
        next_char = indices_char[sample(preds, temperature)]

        # Slide the window: drop the oldest character, append the new one
        window = window[1:] + next_char
        pieces.append(next_char)

    return "".join(pieces)


for epoch in range(epochs):
    print(f"\nEpoch {epoch + 1}/{epochs}")
    # Train one epoch at a time so a text sample can be printed in between
    model_text.fit(x, y, batch_size=batch_size, epochs=1)

    # Generate text after every epoch, seeded with a random window of the corpus
    start_index = random.randint(0, len(text) - maxlen - 1)
    base_sentence = text[start_index : start_index + maxlen]
    print(f'--- Generuojama su pradžia: "{base_sentence}"')

    for diversity in [0.5, 1.0]:  # Different "creativity" levels
        print(f"\n--- Diversity (Temperatūra): {diversity}")
        generated = generate_text(model_text, base_sentence, generation_length, diversity)
        print(generated)


Epoch 1/5
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m45s[0m 112ms/step - loss: 3.1400
--- Generuojama su pradžia: " sometimes two suns which
determine the "

--- Diversity (Temperatūra): 0.5
is ane ans anlsthen inst oos wat on the tn res the tn ils ahessant in pd es ind ine an ae sail  icgalass ancto sts lis when eney antcne ie theute ne winen tae the til cilltthecat aons ghes the t in e 

--- Diversity (Temperatūra): 1.0
aantimy bhd aos arstsessstrp9fsof ffensmmes anslgmtf aog!oougln
thaeimed no bee alm ane  eoll noiotef dfiyitrhn'
ithlherstls toeq aa sengoftnl  thr tico riad pnn qc" 5 irt"monicsleduchosse
phd wns cu


Epoch 2/5
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 108ms/step - loss: 2.6385
--- Generuojama su pradžia: " they may be satisfied, the aristotelian"

--- Diversity (Temperatūra): 0.5
 and in ses ort ol ar s oan, on the thint ped ans of the ritil than in and hicl the the it anl on onlit of re pe thever in the ar ire en eres in t