# Задача: определить настроение заданного текста(ов)

## Обучение модели

In [2]:
from tensorflow.keras.layers import SimpleRNN, LSTM, GRU, Bidirectional, Dense, Embedding 
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.datasets import imdb
from tensorflow.keras.models import Sequential
import numpy as np

In [3]:
# Получение отзывов со словами, которые входят в число 5000 наиболее часто встречающихся слов во всем корпусе текстовых данных отзывов.
vocab_size = 5000
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=vocab_size) 
 
print(x_train[0])

[1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65, 458, 4468, 66, 3941, 4, 173, 36, 256, 5, 25, 100, 43, 838, 112, 50, 670, 2, 9, 35, 480, 284, 5, 150, 4, 172, 112, 167, 2, 336, 385, 39, 4, 172, 4536, 1111, 17, 546, 38, 13, 447, 4, 192, 50, 16, 6, 147, 2025, 19, 14, 22, 4, 1920, 4613, 469, 4, 22, 71, 87, 12, 16, 43, 530, 38, 76, 15, 13, 1247, 4, 22, 17, 515, 17, 12, 16, 626, 18, 2, 5, 62, 386, 12, 8, 316, 8, 106, 5, 4, 2223, 2, 16, 480, 66, 3785, 33, 4, 130, 12, 16, 38, 619, 5, 25, 124, 51, 36, 135, 48, 25, 1415, 33, 6, 22, 12, 215, 28, 77, 52, 5, 14, 407, 16, 82, 2, 8, 4, 107, 117, 2, 15, 256, 4, 2, 7, 3766, 5, 723, 36, 71, 43, 530, 476, 26, 400, 317, 46, 7, 4, 2, 1029, 13, 104, 88, 4, 381, 15, 297, 98, 32, 2071, 56, 26, 141, 6, 194, 2, 18, 4, 226, 22, 21, 134, 476, 26, 480, 5, 144, 30, 2, 18, 51, 36, 28, 224, 92, 25, 104, 4, 226, 65, 16, 38, 1334, 88, 12, 16, 283, 5, 16, 4472, 113, 103, 32, 15, 16, 2, 19, 178, 32]


In [4]:
# Получаем все слова из словаря word_index
word_idx = imdb.get_word_index()

# Первоначально порядковый номер значения, а не ключа, поэтому индекс преобразуется в ключ, а слова в значения
word_idx = {i: word for word, i in word_idx.items()}

# Распечатываем ревью
print([word_idx[i] for i in x_train[0]])

['the', 'as', 'you', 'with', 'out', 'themselves', 'powerful', 'lets', 'loves', 'their', 'becomes', 'reaching', 'had', 'journalist', 'of', 'lot', 'from', 'anyone', 'to', 'have', 'after', 'out', 'atmosphere', 'never', 'more', 'room', 'and', 'it', 'so', 'heart', 'shows', 'to', 'years', 'of', 'every', 'never', 'going', 'and', 'help', 'moments', 'or', 'of', 'every', 'chest', 'visual', 'movie', 'except', 'her', 'was', 'several', 'of', 'enough', 'more', 'with', 'is', 'now', 'current', 'film', 'as', 'you', 'of', 'mine', 'potentially', 'unfortunately', 'of', 'you', 'than', 'him', 'that', 'with', 'out', 'themselves', 'her', 'get', 'for', 'was', 'camp', 'of', 'you', 'movie', 'sometimes', 'movie', 'that', 'with', 'scary', 'but', 'and', 'to', 'story', 'wonderful', 'that', 'in', 'seeing', 'in', 'character', 'to', 'of', '70s', 'and', 'with', 'heart', 'had', 'shadows', 'they', 'of', 'here', 'that', 'with', 'her', 'serious', 'to', 'have', 'does', 'when', 'from', 'why', 'what', 'have', 'critics', 'they'

In [5]:
# Получаем минимальную и максимальную длину отзывов
print("Max length of a review:: ", len(max((x_train+x_test), key=len)))
print("Min length of a review:: ", len(min((x_train+x_test), key=len)))

Max length of a review::  2697
Min length of a review::  70


In [6]:
from tensorflow.keras.preprocessing import sequence
 
# Сохранение фиксированной длины всех отзывов (максимум 400 слов).
max_words = 400
 
x_train = sequence.pad_sequences(x_train, maxlen=max_words)
x_test = sequence.pad_sequences(x_test, maxlen=max_words)
 
x_valid, y_valid = x_train[:64], y_train[:64]
x_train_, y_train_ = x_train[64:], y_train[64:]

In [7]:
embd_len = 32
lstm_model = Sequential(name="LSTM_Model")
lstm_model.add(Embedding(vocab_size,
                         embd_len,
                         input_length=max_words))
lstm_model.add(LSTM(128,
                    activation='tanh',
                    return_sequences=False))
lstm_model.add(Dense(1, activation='sigmoid'))
 
print(lstm_model.summary())

lstm_model.compile(
    loss="binary_crossentropy",
    optimizer='adam',
    metrics=['accuracy']
)
 
history3 = lstm_model.fit(x_train_, y_train_,
                          batch_size=64,
                          epochs=5,
                          verbose=2,
                          validation_data=(x_valid, y_valid))
 
print()
print("LSTM model Score---> ", lstm_model.evaluate(x_test, y_test, verbose=0))

Model: "LSTM_Model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 400, 32)           160000    
                                                                 
 lstm (LSTM)                 (None, 128)               82432     
                                                                 
 dense (Dense)               (None, 1)                 129       
                                                                 
Total params: 242561 (947.50 KB)
Trainable params: 242561 (947.50 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
None
Epoch 1/5
390/390 - 186s - loss: 0.4929 - accuracy: 0.7529 - val_loss: 0.3086 - val_accuracy: 0.9062 - 186s/epoch - 477ms/step
Epoch 2/5
390/390 - 203s - loss: 0.3081 - accuracy: 0.8743 - val_loss: 0.2824 - val_accuracy: 0.8594 - 203s/epoch - 520ms/step
Epoch 3/5
390/390 - 2

In [17]:
# Сохранение модели в файл
lstm_model.save("my_model.keras")

## Тестирование работы модели на кокретных текстах

### Подготовка данных

In [34]:
from keras.models import load_model

# Загрузка модели из файла
model = load_model("my_model.keras")

In [55]:
with open('text.txt', 'r', encoding='utf-8') as file:
    text = file.read()

In [56]:
# Разделение текста на отдельные блоки по пустым строкам
text = text.lower()
text_blocks = text.split('\n\n')

text_blocks = [block.strip() for block in text_blocks if block.strip()]

for block in text_blocks:
    print(block)
    print("=" * 50)

but we were expecting something completely different...
using the example of “madagascar” and “how to train your dragon,” we saw that the dreamworks studio had the courage to stop in time, which is why, perhaps, no one expected a continuation of poe’s story. however, here it is in 2024, and the fourth part of “kung fu panda” is coming out, which must correspond, because it will not be able to avoid comparison with the previous ones. the first one is still the strongest for me, but the sequels at one time were able to maintain a sufficient level of quality to preserve warm feelings for the characters, to capture this fabulous, but so living world of martial arts. was the release of the fourth part justified? - of course, the audience managed to miss their favorite characters, eight whole years have passed. but was she able to live up to these expectations? did the studio treat its own franchise with the same awe that the audience had? - good question...
plot. dragon warrior is just a ti

In [57]:
def simple_tokenize(text):

    for punctuation in '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~“”':
        text = text.replace(punctuation, ' ')
    
    # Разделяем текст по пробелам и удаляем пустые строки
    tokens = [token for token in text.split() if token.strip()]
    
    return tokens

In [58]:
tokenized_text_blocks = []
 
for block in text_blocks:
    tokens = simple_tokenize(block)
    tokenized_text_blocks.append(tokens)

# Вывод токенизированных блоков
for tokens in tokenized_text_blocks:
    print(tokens)
    print("=" * 50)

['but', 'we', 'were', 'expecting', 'something', 'completely', 'different', 'using', 'the', 'example', 'of', 'madagascar', 'and', 'how', 'to', 'train', 'your', 'dragon', 'we', 'saw', 'that', 'the', 'dreamworks', 'studio', 'had', 'the', 'courage', 'to', 'stop', 'in', 'time', 'which', 'is', 'why', 'perhaps', 'no', 'one', 'expected', 'a', 'continuation', 'of', 'poe’s', 'story', 'however', 'here', 'it', 'is', 'in', '2024', 'and', 'the', 'fourth', 'part', 'of', 'kung', 'fu', 'panda', 'is', 'coming', 'out', 'which', 'must', 'correspond', 'because', 'it', 'will', 'not', 'be', 'able', 'to', 'avoid', 'comparison', 'with', 'the', 'previous', 'ones', 'the', 'first', 'one', 'is', 'still', 'the', 'strongest', 'for', 'me', 'but', 'the', 'sequels', 'at', 'one', 'time', 'were', 'able', 'to', 'maintain', 'a', 'sufficient', 'level', 'of', 'quality', 'to', 'preserve', 'warm', 'feelings', 'for', 'the', 'characters', 'to', 'capture', 'this', 'fabulous', 'but', 'so', 'living', 'world', 'of', 'martial', 'arts

In [59]:
def search_indexes(tokens):
    find_flag = False
    indexes = []
    for word in tokens:
        for index, words in word_idx.items():
            if word == words:
                if index < 5000:
                    indexes.append(index)
                else:
                    indexes.append(1)
                find_flag = True
                break
        if not find_flag:
            indexes.append(1)
        
    return indexes

for tokens in tokenized_text_blocks:
    print(search_indexes(tokens))
    print("=" * 50)

[18, 72, 68, 1014, 139, 337, 272, 769, 1, 460, 4, 1, 2, 86, 5, 1371, 126, 2782, 72, 216, 12, 1, 1, 1179, 66, 1, 3155, 5, 567, 8, 55, 60, 6, 135, 379, 54, 28, 870, 3, 1, 4, 62, 187, 130, 9, 6, 8, 2, 1, 2767, 170, 4, 2132, 1876, 1, 6, 579, 43, 60, 212, 1, 85, 9, 77, 21, 27, 499, 5, 795, 2093, 16, 1, 957, 660, 1, 83, 28, 6, 128, 1, 1, 15, 69, 18, 1, 2286, 30, 28, 55, 68, 499, 5, 4557, 3, 1, 648, 4, 486, 5, 1, 2269, 1414, 15, 1, 102, 5, 1836, 11, 2723, 18, 35, 578, 179, 4, 1644, 1730, 13, 1, 763, 4, 1, 2767, 170, 1, 4, 262, 1, 308, 1316, 5, 714, 65, 511, 102, 2307, 223, 150, 25, 2113, 18, 13, 56, 499, 5, 409, 53, 5, 131, 1395, 119, 1, 1179, 1691, 91, 202, 3132, 16, 1, 169, 4296, 12, 1, 308, 66, 49, 885, 111, 2782, 3742, 6, 40, 3, 422, 2113, 36, 28, 4022, 5, 157, 1614, 1297, 1, 1, 1, 5, 1, 4753, 4, 1, 1, 1, 5, 3, 1514, 1, 134, 653, 1, 214, 4, 3, 3577, 1, 187, 1, 1440, 1, 1, 1, 60, 6, 135, 26, 1, 11, 558, 953, 122, 20, 24, 233, 2787, 5, 1, 1, 442, 1, 1, 33, 1, 57, 213, 53, 16, 3, 400, 2247, 

In [60]:
indexes_list = []

for tokens in tokenized_text_blocks:
    indexes = search_indexes(tokens)
    indexes_list.append(indexes)

print(indexes_list)

[[18, 72, 68, 1014, 139, 337, 272, 769, 1, 460, 4, 1, 2, 86, 5, 1371, 126, 2782, 72, 216, 12, 1, 1, 1179, 66, 1, 3155, 5, 567, 8, 55, 60, 6, 135, 379, 54, 28, 870, 3, 1, 4, 62, 187, 130, 9, 6, 8, 2, 1, 2767, 170, 4, 2132, 1876, 1, 6, 579, 43, 60, 212, 1, 85, 9, 77, 21, 27, 499, 5, 795, 2093, 16, 1, 957, 660, 1, 83, 28, 6, 128, 1, 1, 15, 69, 18, 1, 2286, 30, 28, 55, 68, 499, 5, 4557, 3, 1, 648, 4, 486, 5, 1, 2269, 1414, 15, 1, 102, 5, 1836, 11, 2723, 18, 35, 578, 179, 4, 1644, 1730, 13, 1, 763, 4, 1, 2767, 170, 1, 4, 262, 1, 308, 1316, 5, 714, 65, 511, 102, 2307, 223, 150, 25, 2113, 18, 13, 56, 499, 5, 409, 53, 5, 131, 1395, 119, 1, 1179, 1691, 91, 202, 3132, 16, 1, 169, 4296, 12, 1, 308, 66, 49, 885, 111, 2782, 3742, 6, 40, 3, 422, 2113, 36, 28, 4022, 5, 157, 1614, 1297, 1, 1, 1, 5, 1, 4753, 4, 1, 1, 1, 5, 3, 1514, 1, 134, 653, 1, 214, 4, 3, 3577, 1, 187, 1, 1440, 1, 1, 1, 60, 6, 135, 26, 1, 11, 558, 953, 122, 20, 24, 233, 2787, 5, 1, 1, 442, 1, 1, 33, 1, 57, 213, 53, 16, 3, 400, 2247,

### ТЕСТ

1 - положительная рецензия, 0 - негативная

In [61]:
padded_sequences = pad_sequences(indexes_list, maxlen=max_words)

# Предсказание настроения текста
prediction = model.predict(padded_sequences)

print("Предсказанное настроение:", prediction)

Предсказанное настроение: [[0.00913457]
 [0.09803171]]
