<a href="https://colab.research.google.com/github/Kondwani7/IMDB_reviews-with-LSTMs/blob/main/IMDB_movies_Sentimental_Analysis_with_LSTM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#bag of words encoding, based on word frequency
vocab = {} # mapping a word to an integer encoding
word_encoding = 1
def bag_of_words(text):
  global word_encoding

  words = text.lower().split(' ')#ensuring text is all in lower case
  bag = {}# store encodings and their frequency
  for word in words:
    if word in vocab:
      encoding = vocab[word] # get a specific encoding from the vocab
    else:
      vocab[word] = word_encoding
      encoding = word_encoding
      word_encoding += 1

    if encoding in bag:
      bag[encoding] += 1 
    else:
      bag[encoding] = 1
  return bag

# Quick check: 'it' appears twice, so its id should have a count of 2
text = 'testing bag to see it works in frequency yep, basically thats it'
bag = bag_of_words(text)
# Show the encoded bag first, then the vocabulary built so far
print(bag, vocab, sep='\n')


{1: 1, 2: 1, 3: 1, 4: 1, 5: 2, 6: 1, 7: 1, 8: 1, 9: 1, 10: 1, 11: 1}
{'testing': 1, 'bag': 2, 'to': 3, 'see': 4, 'it': 5, 'works': 6, 'in': 7, 'frequency': 8, 'yep,': 9, 'basically': 10, 'thats': 11}


In [None]:
# Sentiment-analysis check of the bag-of-words encoder.
# Each review's text matches the sentiment in its variable name and printed
# label: the "good and creative" review is the positive one and the
# "bad and pretty irriating" review is the negative one.
postive_review_1 = 'I thought that the new coca cola advert was good and creative'
negative_review = 'I thought that new coca cola ad was bad and pretty irriating'
postive_review_2 = 'That coca cola ad was crazy lit'

# Encode each review as {word id: count}; ids are shared through the global vocab
pos_bag_1 = bag_of_words(postive_review_1)
neg_bag = bag_of_words(negative_review)
pos_bag_2 = bag_of_words(postive_review_2)


print("Positive review:", pos_bag_1)
print("Negative review:", neg_bag)
print('Positive review', pos_bag_2)

Positive review: {12: 1, 13: 1, 14: 1, 15: 1, 16: 1, 17: 1, 18: 1, 19: 1, 20: 1, 21: 1, 22: 1, 23: 1}
Negative review: {12: 1, 13: 1, 14: 1, 24: 1, 15: 1, 16: 1, 17: 1, 25: 1, 19: 1, 26: 1, 21: 1, 27: 1}
Positive review {14: 1, 16: 1, 17: 1, 18: 1, 19: 1, 28: 1, 29: 1}


In [None]:
#word embeddings: the order of the words in the sentence is kept, and similar words are grouped together
#models used: LSTMs & RNNs
#working with the IMDB movie reviews dataset for sentiment analysis
from keras.datasets import imdb
from keras.preprocessing import sequence
import keras
import tensorflow as tf
import os
import numpy as np
VOCAB_SIZE = 88584  # passed to imdb.load_data as num_words
MAXLEN = 250        # review length, in tokens
BATCH_SIZE = 64     # NOTE(review): not referenced by the cells visible here - confirm whether it should be used

# Load the IMDB reviews as integer sequences, already split into train and test sets
(train_data, train_labels), (test_data, test_labels) = imdb.load_data(
    num_words=VOCAB_SIZE,
)


Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz


  x_train, y_train = np.array(xs[:idx]), np.array(labels[:idx])
  x_test, y_test = np.array(xs[idx:]), np.array(labels[idx:])


In [None]:
'''
To keep every review at a consistent length of 250 words, reviews with more than 250 words are trimmed
and reviews with fewer than 250 words are padded with zeros.
'''
# Bring every review to exactly MAXLEN tokens (pad_sequences pads with 0 by default)
train_data = sequence.pad_sequences(train_data, maxlen=MAXLEN)
test_data = sequence.pad_sequences(test_data, maxlen=MAXLEN)


In [None]:
# Build the model one layer at a time: embedding -> LSTM -> sigmoid output
model = tf.keras.Sequential()
# Map each word id to a 32-dimensional vector
model.add(tf.keras.layers.Embedding(VOCAB_SIZE, 32))
# Read the embedded sequence with a 32-unit LSTM
model.add(tf.keras.layers.LSTM(32))
# Single sigmoid unit: one value between 0 and 1 per review
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

model.summary()

Model: "sequential_6"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_6 (Embedding)      (None, None, 32)          2834688   
_________________________________________________________________
lstm_9 (LSTM)                (None, 32)                8320      
_________________________________________________________________
dense_6 (Dense)              (None, 1)                 33        
Total params: 2,843,041
Trainable params: 2,843,041
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Configure training: binary cross-entropy loss, RMSprop optimizer, accuracy metric
model.compile(
    optimizer='rmsprop',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
# Train for 10 epochs, holding out 25% of the training data for validation
history = model.fit(
    x=train_data,
    y=train_labels,
    epochs=10,
    validation_split=0.25,
)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
# Score the trained model on the test split; with the loss and the single
# 'accuracy' metric compiled above, the result is [loss, accuracy]
results = model.evaluate(x=test_data, y=test_labels)
print(results)

[0.5270854234695435, 0.8525999784469604]


In [None]:
#plot training/validation loss and accuracy per epoch
import matplotlib.pyplot as plt
history_dict = history.history
print(history_dict.keys())

acc = history_dict['accuracy']
val_acc = history_dict['val_accuracy']
loss = history_dict['loss']
val_loss = history_dict['val_loss']

epochs = range(1, len(acc) + 1)
# Create the figure and both stacked panels up front so each plot call
# targets an explicit axes object
fig, (loss_ax, acc_ax) = plt.subplots(2, 1, figsize=(10, 6))

# Top panel: loss. 'r' = solid red line (training), 'b' = solid blue line (validation)
loss_ax.plot(epochs, loss, 'r', label='Training loss')
loss_ax.plot(epochs, val_loss, 'b', label='Validation loss')
loss_ax.set_title('Training and validation loss')
loss_ax.set_ylabel('Loss')
loss_ax.legend()

# Bottom panel: accuracy, same colour convention
acc_ax.plot(epochs, acc, 'r', label='Training acc')
acc_ax.plot(epochs, val_acc, 'b', label='Validation acc')
acc_ax.set_title('Training and validation accuracy')
acc_ax.set_xlabel('Epochs')
acc_ax.set_ylabel('Accuracy')
acc_ax.legend(loc='lower right')

# tight_layout only adjusts axes that already exist, so it has to run after
# the panels are drawn; this keeps the lower title clear of the upper panel
fig.tight_layout()

In [None]:
# Encode one of our own movie reviews so it can be fed to the model
word_index = imdb.get_word_index()  # word -> integer id

def encode_text(text):
  """Convert a raw review string into a MAXLEN-long array of word ids.

  The text is tokenized with Keras' text_to_word_sequence, each token is
  looked up in ``word_index`` (tokens not found become 0), and the result is
  padded/truncated to MAXLEN with pad_sequences.

  NOTE(review): imdb.load_data offsets word ids by ``index_from`` (default 3),
  while get_word_index returns the un-offset ids - confirm whether the ids
  produced here need shifting to match the training data.
  """
  words = keras.preprocessing.text.text_to_word_sequence(text)
  ids = [word_index.get(word, 0) for word in words]
  # pad_sequences expects a batch, so wrap the single review and unwrap the result
  padded = sequence.pad_sequences([ids], MAXLEN)
  return padded[0]

text = 'Bro that end game was crazy amazing. Probably the best avengers game I\'ve ever watched'
encoded = encode_text(text)
print(encoded)


Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb_word_index.json
[    0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0   

In [None]:
# Inverse lookup (integer id -> word) used to decode an individual review
reverse_word_index = {
    index: word
    for word, index in word_index.items()
}
def decode_review(integers):
  """Turn a sequence of word ids back into a space-separated string.

  Ids equal to PAD (0) are skipped; every other id is looked up in the
  module-level ``reverse_word_index``.

  Args:
    integers: iterable of integer word ids.

  Returns:
    The decoded words joined by single spaces ('' if nothing but padding).

  Raises:
    KeyError: if a non-PAD id is missing from ``reverse_word_index``.
  """
  PAD = 0
  # Build the complete word list before returning, so every id is decoded
  # rather than only the first element of the sequence
  words = [reverse_word_index[num] for num in integers if num != PAD]
  return " ".join(words)
# Decode the encoded review back to text using the decode_review function defined above
print(decode_review(encoded))
  


bro that end game was crazy amazing probably the best avengers game i've ever watched
