In [1]:
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dropout, Dense

# Module-level tokenizer shared across the notebook: dataset_preparation() fits it
# on the corpus and generate_text() reads its vocabulary, so this cell must run
# before either of them is called.
tokenizer = Tokenizer()


In [3]:
import keras

# Download the Nietzsche corpus (get_file returns the local path of the file)
# and load it as one lower-cased string.
path = keras.utils.get_file(
    'nietzsche.txt',
    origin='https://s3.amazonaws.com/text-datasets/nietzsche.txt')
# Read inside a context manager so the file handle is closed deterministically,
# and pin the encoding so the result does not depend on the platform default.
with open(path, encoding='utf-8') as corpus_file:
    text = corpus_file.read().lower()
print('Corpus length:', len(text))

Corpus length: 600893


In [4]:
def dataset_preparation(data):
    """Turn a raw text corpus into next-word-prediction training arrays.

    The corpus is lower-cased and split on newlines; the module-level
    ``tokenizer`` is fitted on those lines (side effect). Every line is then
    expanded into all of its prefixes of length >= 2, the prefixes are
    left-padded with zeros to a common length, and the last token of each
    padded row becomes the label for the tokens that precede it.

    Args:
        data: The corpus as a single string, one training line per ``\\n``.

    Returns:
        A tuple ``(predictors, label, max_sequence_len, total_words)`` where
        ``predictors`` is the padded token matrix without its last column,
        ``label`` is the one-hot encoding of that last column (width
        ``total_words``), ``max_sequence_len`` is the length of the longest
        prefix, and ``total_words`` is the vocabulary size plus one for the
        padding index 0.

    Raises:
        ValueError: If no line of ``data`` contains at least two tokens, so
            no (context, next word) pair can be built.
    """
    lines = data.lower().split("\n")
    tokenizer.fit_on_texts(lines)
    # +1 because index 0 is reserved for padding and never assigned to a word.
    total_words = len(tokenizer.word_index) + 1

    # Create input sequences: every prefix of length >= 2 of every line.
    input_sequences = []
    for token_list in tokenizer.texts_to_sequences(lines):
        for i in range(1, len(token_list)):
            input_sequences.append(token_list[:i + 1])

    # Without this guard an empty/too-short corpus fails later with the
    # unhelpful "max() arg is an empty sequence".
    if not input_sequences:
        raise ValueError(
            "dataset_preparation needs at least one line with two or more tokens")

    # Pad sequences on the left so the label is always the last column.
    max_sequence_len = max(len(seq) for seq in input_sequences)
    input_sequences = pad_sequences(
        input_sequences, maxlen=max_sequence_len, padding='pre')

    # Create predictors and label.
    predictors, label = input_sequences[:, :-1], input_sequences[:, -1]
    # NOTE: this is a dense (n_sequences, total_words) one-hot matrix and can
    # be large for a big vocabulary.
    label = to_categorical(label, num_classes=total_words)

    return predictors, label, max_sequence_len, total_words

In [5]:
# Build the training arrays from the corpus loaded above; as a side effect this
# fits the shared module-level tokenizer that generate_text() relies on later.
predictors, label, max_sequence_len, total_words = dataset_preparation(text)

In [6]:
def create_model(predictors, label, max_sequence_len, total_words):
    """Build, compile, and train the next-word LSTM model.

    The network is Embedding(total_words -> 10) -> LSTM(150) -> Dropout(0.1)
    -> Dense(total_words, softmax), trained for 10 epochs with Adam on
    categorical cross-entropy. A summary of the model is printed after
    training.

    Args:
        predictors: Padded token matrix; each row has ``max_sequence_len - 1``
            token ids.
        label: One-hot next-word targets with ``total_words`` columns.
        max_sequence_len: Length of the padded sequences including the label
            token.
        total_words: Vocabulary size including the padding index.

    Returns:
        The fitted ``Sequential`` model.
    """
    # The last token of every padded sequence is the label, so the model
    # input is one token shorter.
    input_len = max_sequence_len - 1
    model = Sequential()
    model.add(Embedding(total_words, 10, input_length=input_len))
    model.add(LSTM(150))
    model.add(Dropout(0.1))
    model.add(Dense(total_words, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam')
    model.fit(predictors, label, epochs=10, verbose=1)
    # summary() prints the table itself and returns None; wrapping it in
    # print() only added a stray "None" line to the cell output.
    model.summary()
    return model

In [7]:
# Train the network on the prepared arrays (create_model fits for 10 epochs and
# prints the model summary); the returned model is used for text generation below.
model = create_model(predictors, label, max_sequence_len, total_words)



Epoch 1/10
[1m2886/2886[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m91s[0m 31ms/step - loss: 6.8532
Epoch 2/10
[1m2886/2886[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m142s[0m 31ms/step - loss: 6.2841
Epoch 3/10
[1m2886/2886[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m88s[0m 31ms/step - loss: 6.0612
Epoch 4/10
[1m2886/2886[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m142s[0m 31ms/step - loss: 5.8661
Epoch 5/10
[1m2886/2886[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m142s[0m 31ms/step - loss: 5.6481
Epoch 6/10
[1m2886/2886[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m103s[0m 36ms/step - loss: 5.4297
Epoch 7/10
[1m2886/2886[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m89s[0m 31ms/step - loss: 5.2601
Epoch 8/10
[1m2886/2886[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m87s[0m 30ms/step - loss: 5.0496
Epoch 9/10
[1m2886/2886[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m88s[0m 30ms/step - loss: 4.8494
Epoch 10/10
[1m2886/2886[0m [32m━━━━━━━━━━━━━━━━

None


In [8]:
def apply_temperature(preds, temperature=1.0):
    """Re-weight a probability vector by a sampling temperature.

    Computes ``softmax(log(preds + 1e-9) / temperature)``. Temperatures below
    1 sharpen the distribution towards its most likely entry, temperatures
    above 1 flatten it, and 1.0 returns (approximately) the input normalised
    to sum to one.

    Args:
        preds: 1-D array-like of non-negative probabilities.
        temperature: Sampling temperature. ``0`` is replaced by ``1e-7``,
            which yields an (effectively) one-hot distribution on the argmax.

    Returns:
        A float64 NumPy array of the same length as ``preds`` that sums to 1.
    """
    if temperature == 0:
        temperature = 1e-7  # Avoid division by zero
    # Work in float64: float32 model outputs underflow in exp() at low
    # temperatures and may not sum to 1 tightly enough for np.random.choice.
    logits = np.log(np.asarray(preds, dtype=np.float64) + 1e-9) / temperature
    if logits.size == 0:
        return logits
    # Shift so the largest logit is 0. This leaves the softmax unchanged but
    # keeps at least one exp() term equal to 1, so the sum can never be zero
    # (the unshifted form produced 0/0 = NaN for small temperatures).
    exp_preds = np.exp(logits - np.max(logits))
    return exp_preds / np.sum(exp_preds)

In [9]:
def generate_text(seed_text, next_words, max_sequence_len, model, temperature=1.0):
    """Extend ``seed_text`` by sampling ``next_words`` words from the model.

    On every step the current text is tokenised with the module-level
    ``tokenizer``, left-padded/truncated to the model's input length, and fed
    to ``model.predict``. The predicted distribution is temperature-scaled
    with ``apply_temperature`` and one word is drawn from it with
    ``np.random.choice`` and appended to the text.

    Args:
        seed_text: Starting text; also the prefix of the returned string.
        next_words: Number of words to append (integer).
        max_sequence_len: Padded sequence length used in training; the model
            input length is ``max_sequence_len - 1``.
        model: Trained model whose ``predict`` returns one probability per
            vocabulary index.
        temperature: Sampling temperature passed to ``apply_temperature``.

    Returns:
        ``seed_text`` followed by the generated words, space separated.
    """
    # Reverse vocabulary built once; the lookup is loop-invariant, so there is
    # no need to scan word_index for every generated word.
    index_to_word = {index: word for word, index in tokenizer.word_index.items()}

    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len - 1, padding='pre')
        preds = model.predict(token_list, verbose=0)[0]

        # Apply temperature scaling (np.array copies, so the helper's result
        # is never mutated in place).
        scaled_preds = np.array(apply_temperature(preds, temperature), dtype=np.float64)

        # Index 0 is the padding slot and maps to no word; sampling it would
        # append an empty "word" (a stray space), so remove it from the draw.
        scaled_preds[0] = 0.0
        # Renormalise in float64 so np.random.choice's sum-to-1 check passes.
        scaled_preds /= scaled_preds.sum()

        # Sample the next word from the scaled probability distribution.
        next_word_index = int(np.random.choice(len(scaled_preds), p=scaled_preds))

        # Unknown indices fall back to an empty word.
        seed_text += " " + index_to_word.get(next_word_index, "")
    return seed_text

In [12]:
# Example usage: continue a seed sentence with the trained model.
# Sampling uses NumPy's global random state and no seed is set in this cell,
# so the generated continuation differs from run to run.
seed_text = "new faculty, and the jubilation reached its climax when kant."
next_words = 50  # The number of words to generate (must be an integer)
temperature = 0.8  # Passed to apply_temperature() to re-weight predictions before sampling

generated_text = generate_text(seed_text, next_words=next_words, max_sequence_len=max_sequence_len, model=model, temperature=temperature)
print(generated_text)


new faculty, and the jubilation reached its climax when kant. is always still so a imaginations of humanity as he could does seem to do a thinker of knowledge as a sudden than of life and extraordinarily we have too among means of the history of sympathy and the eternal word the faith be the takes his cruelty as germans
