In [6]:
import json
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

In [7]:
texts = []  # Processed training texts, one entry per dataset record
with open('output.json', encoding='utf-8') as file:
    # The file is parsed as one JSON object per line.
    # Iterating the handle streams the file instead of materialising every
    # line up front the way readlines() does.
    for line in file:
        if not line.strip():
            # Blank lines are not valid JSON documents; skip them rather than crash.
            continue
        item = json.loads(line)

        user_input = item['request'][1]['text']
        response = item['response']

        # The second line of the user prompt becomes the header of the training text
        # (presumably the genre line — TODO confirm against the dataset format).
        genre_line = user_input.split('\n')[1]
        full_text = f"{genre_line}\n{response}"

        texts.append(full_text)

In [8]:
# Fit the tokenizer on the corpus; index 0 is left for padding, hence the +1
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts)
total_words = 1 + len(tokenizer.word_index)

# Build the training sequences: every prefix of each encoded text that is
# at least two tokens long (context tokens + the token to predict)
encoded_texts = tokenizer.texts_to_sequences(texts)
input_sequences = [
    encoded[:prefix_len]
    for encoded in encoded_texts
    for prefix_len in range(2, len(encoded) + 1)
]

# Left-pad every prefix with zeros up to the length of the longest one
max_sequence_len = max(len(prefix) for prefix in input_sequences)
input_sequences = np.array(
    pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre')
)

In [9]:
# Predictors: every token of a padded sequence except the last one
X = input_sequences[:, :-1]
# Labels: the last token of each padded sequence
labels = input_sequences[:, -1]
# One-hot encode the labels over the whole vocabulary.
# NOTE(review): this dense matrix has len(labels) x total_words entries; the
# ~28.7 GB allocation warnings in the training output presumably come from it.
# Consider integer labels with a sparse categorical loss — TODO confirm.
y = to_categorical(labels, num_classes=total_words)

In [None]:
# Build the model layer by layer:
# embedding -> LSTM (full sequence output) -> dropout -> LSTM -> softmax over the vocabulary
model = Sequential()
model.add(Embedding(total_words, 100, input_length=max_sequence_len-1))
model.add(LSTM(150, return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(100))
model.add(Dense(total_words, activation='softmax'))

model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
model.summary()

In [None]:
from tensorflow.keras.regularizers import l2

# Alternative architecture: smaller LSTM layers with L2 kernel regularisation
# and a higher dropout rate than the model built above
model_new_structure = Sequential()
model_new_structure.add(Embedding(total_words, 100, input_length=max_sequence_len-1))
model_new_structure.add(LSTM(100, return_sequences=True, kernel_regularizer=l2(0.01)))
model_new_structure.add(Dropout(0.3))
model_new_structure.add(LSTM(50, kernel_regularizer=l2(0.01)))
model_new_structure.add(Dense(total_words, activation='softmax'))

model_new_structure.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
model_new_structure.summary()

In [3]:
# Select the model that the training, saving and generation cells operate on.
# NOTE(review): this cell's execution count (In [3]) directly follows the
# load_model cell (In [2]) that appears further down, so in the recorded run
# `model` was presumably the network loaded from 'model.h5', not the one built
# above — confirm, and reorder the cells so the notebook runs top to bottom.
current_model = model

In [11]:
# Train the selected model for 20 epochs on the full (X, y) arrays.
# No batch_size is passed, so the Keras default is used.
# The recorded run below was stopped by hand (KeyboardInterrupt) during epoch 1.
current_model.fit(X, y, epochs=20, verbose=1)

2024-04-12 22:21:24.977319: W tensorflow/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 28683643040 exceeds 10% of free system memory.
2024-04-12 22:21:59.519907: W tensorflow/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 28683643040 exceeds 10% of free system memory.


Epoch 1/20


2024-04-12 22:22:22.317507: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2024-04-12 22:22:22.322273: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2024-04-12 22:22:22.325805: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

  85/7456 [..............................] - ETA: 36:39 - loss: 1.4320 - accuracy: 0.6952

KeyboardInterrupt: 

In [9]:
# Persist the currently selected model to an .h5 file.
# NOTE(review): the file name says "new_structure", but current_model is
# assigned from `model`, not `model_new_structure` — confirm the intended name.
current_model.save('model_new_structure_.h5')

In [None]:
# Persist the fitted tokenizer so generation can reuse the same vocabulary later
import pickle
with open('tokenizer2.pkl', mode='wb') as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file, protocol=pickle.HIGHEST_PROTOCOL)

In [2]:
# Load a previously saved model from 'model.h5'.
# NOTE(review): this rebinds the name `model`, which the model-building cell
# above also assigns; whichever of the two cells ran last decides what
# `current_model = model` picks up.
from tensorflow.keras.models import load_model
model = load_model('model.h5')

2024-04-12 22:16:10.372783: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-04-12 22:16:13.900571: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1635] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 31136 MB memory:  -> device: 0, name: Tesla V100-PCIE-32GB, pci bus id: 0000:8c:00.0, compute capability: 7.0
2024-04-12 22:16:14.983141: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2024-04-12 22:16:14.986416: I tensor

In [4]:
def generate_text(model, tokenizer, start_text, num_words, max_len=None):
    """Greedily extend ``start_text`` with up to ``num_words`` predicted words.

    At every step the current text is tokenized, left-padded/truncated to the
    model's input length, and the most probable next token (argmax) is appended.

    Args:
        model: Trained next-word model; ``model.predict`` must return one row of
            per-token scores for each input sequence.
        tokenizer: Fitted tokenizer exposing ``texts_to_sequences`` and
            ``word_index``.
        start_text: Seed text that generation continues from.
        num_words: Maximum number of words to append.
        max_len: Length the token sequence is padded/truncated to. When omitted,
            the model's declared input length (``model.input_shape[1]``) is used;
            if the model does not declare one, the notebook-level
            ``max_sequence_len - 1`` is used as before.

    Returns:
        The seed text followed by the generated words, separated by single spaces.
    """
    if max_len is None:
        # Prefer the length the model was actually built with: a model loaded
        # from disk may not match the max_sequence_len of the current session.
        input_shape = getattr(model, 'input_shape', None)
        declared_len = None
        if isinstance(input_shape, tuple) and len(input_shape) > 1:
            declared_len = input_shape[1]
        if isinstance(declared_len, int):
            max_len = declared_len
        else:
            max_len = max_sequence_len - 1

    # Build the index -> word mapping once instead of scanning word_index
    # linearly for every generated word.
    index_to_word = {index: word for word, index in tokenizer.word_index.items()}

    for _ in range(num_words):
        sequence = tokenizer.texts_to_sequences([start_text])[0]
        sequence = pad_sequences([sequence], maxlen=max_len, padding='pre')
        predicted = model.predict(sequence, verbose=0)
        predicted_index = int(np.argmax(predicted, axis=-1)[0])

        new_word = index_to_word.get(predicted_index)
        if new_word is None:
            # The predicted index has no word (e.g. the padding index 0).
            # Appending nothing would leave the input unchanged, so every later
            # step would predict the same thing and only add trailing spaces.
            break

        start_text += ' ' + new_word
    return start_text

In [12]:
# Seed prompt: a genre header line followed by the first line of the lyrics
text = "Жанр: Рэп\nК Ашоту залетаем, шашлык дымит нормально"
# Continue the prompt with 30 greedily predicted words and show the result
generated_text = generate_text(
    model=current_model,
    tokenizer=tokenizer,
    start_text=text,
    num_words=30,
)
print(generated_text)

Жанр: Рэп
К Ашоту залетаем, шашлык дымит нормально на мне сидит орёл он написал тебе потому что я наверху и я знаю кто знает кто выжил такие лоот — это близзард и на мне жидкий кристалл — я
