In [4]:
# Reccurent Neural Network, which can make sentimental analysis of some phrases
# this task also could be done by usual dense neural network, but this example is
# intended to show that the solution to such a problem can be carried out in different ways

import numpy as np
import re

# here we're importing LSTM and GRU blocks from keras to compare them as methods for constructing RNN
from tensorflow.keras.layers import Dense, SimpleRNN, LSTM, Embedding, GRU
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
# here we're importing tokenizator for text files
from tensorflow.keras.preprocessing.text import Tokenizer, text_to_word_sequence

from tensorflow.keras.preprocessing.sequence import pad_sequences

In [14]:
# here we opening files with positive and negative phrases as train data
# you may find those files (positive.txt and negative.txt) in same folder
with open('positive.txt', 'r', encoding='utf-8') as f:
    text_pos = f.readlines()
    text_pos[0] = text_pos[0].replace('\ufeff', '')


with open('negative.txt', 'r', encoding='utf-8') as f:
    text_neg = f.readlines()
    text_neg[0] = text_neg[0].replace('\ufeff', '')

In [16]:
# we combine positive and negative phrases into one collection to simplify their feeding to the neural network
phrases = text_pos + text_neg
count_pos = len(text_pos)
count_neg = len(text_neg)
total_lines = count_pos + count_neg
print(count_pos, count_neg , total_lines)

653 401 1054


In [29]:

# We break the texts into individual words (the maximum number of individual words "maxWordsCount" is 5000).
# Depending on your needs, you can change this parameter
maxWordsCount = 5000

# we should delate all extra symbols as '!–"—#$%&amp;()*+,-./:;<=>?@[\\]^_`{|}~\t\n\r«»'
tokenizer = Tokenizer(num_words=maxWordsCount, filters='!–"—#$%&;()*+,-./:;<=>?@[\\]^_`{|}~«»', lower=True, split=' ', char_level=False)
tokenizer.fit_on_texts(phrases)

In [30]:

# create a “dictionary” object that will contain the word and the number of its occurrences in the text
dictionary = list(tokenizer.word_counts.items())


# here we display the first 10 words from the first phrase (its serial number is zero) and the number of these words
# in the text
print(dictionary[:10])
print(phrases[0][:100])

[('i', 481), ('love', 86), ('spending', 2), ('time', 8), ('with', 52), ('my', 266), ('family', 9), ('\n', 1053), ('the', 252), ('sun', 1)]
I love spending time with my family.



In [31]:
# set the length for each numeric vector into which all words will be converted
max_text_len = 10

# convert the “phrases” into a “data” object, which will be a vector where the words from the “phrases”
# have been converted into numeric values ​​in accordance with the “dictionary”
data = tokenizer.texts_to_sequences(phrases)

# data_pad - a set of vectors that were reduced to length in accordance with the "max_text_len" parameter
data_pad = pad_sequences(data, maxlen=max_text_len)

print(data_pad)


[[  0   0   2 ...   6 133   1]
 [  7 359  13 ...  13 264   1]
 [  0   0   2 ... 265  45   1]
 ...
 [  0   0   9 ...   3  86   1]
 [  0   0   0 ...   0   0   1]
 [  0   0   0 ...   0   0   1]]


In [32]:
# training set
X = data_pad

# testing set (one-hot codding matrix where [1, 0] means positve results and [0, 1] means negative)
Y = np.array([[1, 0]]*count_pos + [[0, 1]]*count_neg)
print(X.shape, Y.shape)

(1054, 10) (1054, 2)


In [34]:
# Data shuffling
# When we are working with machine learning data, it is important to randomly shuffle the data to avoid possible training biases.
# This helps the model generalize better to new data.
indeces = np.random.choice(X.shape[0], size=X.shape[0], replace=False)
X = X[indeces]
Y = Y[indeces]

In [35]:
# RNN model creating using LSTM method
model = Sequential()

# also using Embedding method at first NN layer
model.add(Embedding(maxWordsCount, 128, input_length = max_text_len))
#  return_sequences = True - this means that the connection of one recurrent layer to another is expected
model.add(LSTM(128, return_sequences = True))
model.add(LSTM(64))
# we have 2 neuros in the last NN layer because we literaly have binary classification task
model.add(Dense(2, activation='softmax'))

# all parametrs output
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 10, 128)           640000    
                                                                 
 lstm (LSTM)                 (None, 10, 128)           131584    
                                                                 
 lstm_1 (LSTM)               (None, 64)                49408     
                                                                 
 dense (Dense)               (None, 2)                 130       
                                                                 
Total params: 821122 (3.13 MB)
Trainable params: 821122 (3.13 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [60]:
# RNN model creating using GRU method
# it works faster than LSTM, but NN accuracy could be lower

model = Sequential()
model.add(Embedding(maxWordsCount, 128, input_length = max_text_len))
model.add(GRU(128, return_sequences = True))
model.add(GRU(64))
model.add(Dense(2, activation='softmax'))

model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 10, 128)           640000    
                                                                 
 gru (GRU)                   (None, 10, 128)           99072     
                                                                 
 gru_1 (GRU)                 (None, 64)                37248     
                                                                 
 dense_1 (Dense)             (None, 2)                 130       
                                                                 
Total params: 776450 (2.96 MB)
Trainable params: 776450 (2.96 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [61]:
model.compile(loss='categorical_crossentropy', metrics = ['accuracy'], optimizer=Adam(0.0001))

In [62]:
modeling = model.fit(X, Y, batch_size=32, epochs=50)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [65]:
# this line creates a dictionary, which converts indices back to words
index_to_word = dict(map(reversed, tokenizer.word_index.items()))

# this function converts indices back to words using "index_to_word" dictionary
def sequence_to_text(index_list):
    words = [index_to_word.get(letter) for letter in index_list]
    return(words)

# we taking any example from our training data (positive.txt and negative.txt)
t = "The beauty of life is in the small, everyday moments.".lower()
# we putting our example phrase into index form
data = tokenizer.texts_to_sequences([t])

# here we checking checking lenght of test phrase and changing it if necessary
data_pad = pad_sequences(data, maxlen=max_text_len)

# we may check how our RNN handles given phrase
# print( sequence_to_text(data[0]) )

# RNN prediction based on data_pad
res = model.predict(data_pad)

# if RNN returns vector [0.1], the phrase is positive, if returned vector is closer to 1.0], phrase is negative
# this string could output values of vector, where np.argmax(res) is index of each element in "res" object
# print(res, np.argmax(res), sep='\n')


if np.argmax(res) == 1:
  print("Phrase is pessimistic")
elif np.argmax(res) == 0:
  print("Phrase is optimistic")

Phrase is optimistic
