In [32]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense

In [33]:
with open('okulary.txt', 'r', encoding='utf-8') as f:
    text = f.read().lower()

In [34]:
text

'biega, krzyczy pan hilary:\ngdzie są moje okulary?\nszuka w spodniach i w surducie,\nw prawym bucie, w lewym bucie.\nwszystko w szafach poprzewracał,\nmaca szlafrok, palto maca.\nskandal! – krzyczy - nie do wiary!\nktoś mi ukradł okulary!\npod kanapą, na kanapie,\nwszędzie szuka, parska, sapie!\nszpera w piecu i w kominie,\nw mysiej dziurze i w pianinie.\njuż podłogę chce odrywać,\njuż policję zaczął wzywać.\nnagle - zerknął do lusterka\nnie chce wierzyć znowu zerka.\nznalazł! są! okazało się,\nże je ma na własnym nosie. '

In [35]:
tokenizer=Tokenizer()

In [36]:
tokenizer.fit_on_texts([text])

In [37]:
total_words=len(tokenizer.word_index)+1

In [38]:
total_words

65

In [39]:
tokenizer.word_index

{'w': 1,
 'i': 2,
 'krzyczy': 3,
 'są': 4,
 'okulary': 5,
 'szuka': 6,
 'bucie': 7,
 'maca': 8,
 'nie': 9,
 'do': 10,
 'na': 11,
 'już': 12,
 'chce': 13,
 'biega': 14,
 'pan': 15,
 'hilary': 16,
 'gdzie': 17,
 'moje': 18,
 'spodniach': 19,
 'surducie': 20,
 'prawym': 21,
 'lewym': 22,
 'wszystko': 23,
 'szafach': 24,
 'poprzewracał': 25,
 'szlafrok': 26,
 'palto': 27,
 'skandal': 28,
 '–': 29,
 'wiary': 30,
 'ktoś': 31,
 'mi': 32,
 'ukradł': 33,
 'pod': 34,
 'kanapą': 35,
 'kanapie': 36,
 'wszędzie': 37,
 'parska': 38,
 'sapie': 39,
 'szpera': 40,
 'piecu': 41,
 'kominie': 42,
 'mysiej': 43,
 'dziurze': 44,
 'pianinie': 45,
 'podłogę': 46,
 'odrywać': 47,
 'policję': 48,
 'zaczął': 49,
 'wzywać': 50,
 'nagle': 51,
 'zerknął': 52,
 'lusterka': 53,
 'wierzyć': 54,
 'znowu': 55,
 'zerka': 56,
 'znalazł': 57,
 'okazało': 58,
 'się': 59,
 'że': 60,
 'je': 61,
 'ma': 62,
 'własnym': 63,
 'nosie': 64}

In [40]:
input_sequences=[]

In [41]:
token_list=tokenizer.texts_to_sequences([text])[0]

In [42]:
token_list

[14,
 3,
 15,
 16,
 17,
 4,
 18,
 5,
 6,
 1,
 19,
 2,
 1,
 20,
 1,
 21,
 7,
 1,
 22,
 7,
 23,
 1,
 24,
 25,
 8,
 26,
 27,
 8,
 28,
 29,
 3,
 9,
 10,
 30,
 31,
 32,
 33,
 5,
 34,
 35,
 11,
 36,
 37,
 6,
 38,
 39,
 40,
 1,
 41,
 2,
 1,
 42,
 1,
 43,
 44,
 2,
 1,
 45,
 12,
 46,
 13,
 47,
 12,
 48,
 49,
 50,
 51,
 52,
 10,
 53,
 9,
 13,
 54,
 55,
 56,
 57,
 4,
 58,
 59,
 60,
 61,
 62,
 11,
 63,
 64]

In [43]:
for i in range(4, len(token_list)):
    n_gram_sequences=token_list[i-4:i+1]
    input_sequences.append(n_gram_sequences)

In [44]:
input_sequences

[[14, 3, 15, 16, 17],
 [3, 15, 16, 17, 4],
 [15, 16, 17, 4, 18],
 [16, 17, 4, 18, 5],
 [17, 4, 18, 5, 6],
 [4, 18, 5, 6, 1],
 [18, 5, 6, 1, 19],
 [5, 6, 1, 19, 2],
 [6, 1, 19, 2, 1],
 [1, 19, 2, 1, 20],
 [19, 2, 1, 20, 1],
 [2, 1, 20, 1, 21],
 [1, 20, 1, 21, 7],
 [20, 1, 21, 7, 1],
 [1, 21, 7, 1, 22],
 [21, 7, 1, 22, 7],
 [7, 1, 22, 7, 23],
 [1, 22, 7, 23, 1],
 [22, 7, 23, 1, 24],
 [7, 23, 1, 24, 25],
 [23, 1, 24, 25, 8],
 [1, 24, 25, 8, 26],
 [24, 25, 8, 26, 27],
 [25, 8, 26, 27, 8],
 [8, 26, 27, 8, 28],
 [26, 27, 8, 28, 29],
 [27, 8, 28, 29, 3],
 [8, 28, 29, 3, 9],
 [28, 29, 3, 9, 10],
 [29, 3, 9, 10, 30],
 [3, 9, 10, 30, 31],
 [9, 10, 30, 31, 32],
 [10, 30, 31, 32, 33],
 [30, 31, 32, 33, 5],
 [31, 32, 33, 5, 34],
 [32, 33, 5, 34, 35],
 [33, 5, 34, 35, 11],
 [5, 34, 35, 11, 36],
 [34, 35, 11, 36, 37],
 [35, 11, 36, 37, 6],
 [11, 36, 37, 6, 38],
 [36, 37, 6, 38, 39],
 [37, 6, 38, 39, 40],
 [6, 38, 39, 40, 1],
 [38, 39, 40, 1, 41],
 [39, 40, 1, 41, 2],
 [40, 1, 41, 2, 1],
 [1, 41, 2, 1

In [45]:
input_sequences = np.array(input_sequences)

In [46]:
X = input_sequences[:, :-1]

In [47]:
y = input_sequences[:, -1]

In [48]:
print(input_sequences[34])

[31 32 33  5 34]


In [49]:
print(y[34])

34


In [50]:
y = tf.keras.utils.to_categorical(y, num_classes=total_words)

In [51]:
y[23]

array([0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [52]:
model=Sequential()
model.add(Embedding(input_dim=total_words, output_dim=64, input_length=X.shape[1]))
model.add(LSTM(128))
model.add(Dense(total_words, activation='softmax'))

In [53]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [54]:
model.fit(X,y,epochs=100,verbose=1)

Epoch 1/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 17ms/step - accuracy: 0.0179 - loss: 4.1747
Epoch 2/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step - accuracy: 0.1203 - loss: 4.1660
Epoch 3/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - accuracy: 0.1438 - loss: 4.1575
Epoch 4/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - accuracy: 0.1203 - loss: 4.1495
Epoch 5/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step - accuracy: 0.1298 - loss: 4.1392 
Epoch 6/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - accuracy: 0.1220 - loss: 4.1282
Epoch 7/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - accuracy: 0.1220 - loss: 4.1139
Epoch 8/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - accuracy: 0.1259 - loss: 4.0914
Epoch 9/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[

<keras.src.callbacks.history.History at 0x7ac5d9e11820>

In [55]:
def generate_text(seed_text, next_words=10):
    result=seed_text
    for _ in range(next_words):
        token_list=tokenizer.texts_to_sequences([result])[0]
        token_list=pad_sequences([token_list], maxlen=X.shape[1], padding='pre')
        predicted = np.argmax(model.predict(token_list, verbose=0), axis=-1)[0]
        for word, index in tokenizer.word_index.items():
          if index==predicted:
            result=' '+word
            break
    return result

In [58]:
seed_text = input('Podaj początek tekstu: minimum 4 wyrazy')
print('\n Wygenerowany tekst: \n')
print(generate_text(seed_text,next_words=5))

Podaj początek tekstu: minimum 4 wyrazyszafach

 Wygenerowany tekst: 

 maca
