In [0]:
# nlp with recurrent neural networks
# autocheck word complete grammer check translation chatbot
# sentiment analysis / character generation

In [0]:
# bag of words implementation


def bag_of_words(text):

  # find the words
  words = text.lower().split(' ')
  bag = {}
  vocab = {}
  word_encoding = 1

  for word in words:
    if word in vocab:
      encoding = vocab[word]
    else:
      vocab[word] = word_encoding
      encoding = word_encoding
      word_encoding += 1

    if encoding in bag:
      bag[encoding] += 1
    else:
      bag[encoding] = 1

  return bag, vocab

text = 'TF is Awesome TF'
bag, vocab = bag_of_words(text)
print(bag)
print(vocab)


{1: 2, 2: 1, 3: 1}
{'tf': 1, 'is': 2, 'awesome': 3}


In [0]:
# embedding turns words into vectors.
# the vector has a defined length and shows similarity to other words.
# embedding layer tries to learn relations between words
# so that more similar words have closer vectors (with lower angles).
# pretrained word embeddings are availble also.

In [0]:
# imdb reviews
# alreadly numberized by community

In [0]:
from keras.datasets import imdb
from keras.preprocessing import sequence
import tensorflow as tf
import os
import numpy as np

VOCAB_SIZE = 88584

MAXLEN = 250
BATCH_SIZE = 64

(train_data, train_labels), (test_data, test_labels) = imdb.load_data(num_words = VOCAB_SIZE)


Downloading data from https://s3.amazonaws.com/text-datasets/imdb.npz


In [0]:
train_data[0]

[1,
 14,
 22,
 16,
 43,
 530,
 973,
 1622,
 1385,
 65,
 458,
 4468,
 66,
 3941,
 4,
 173,
 36,
 256,
 5,
 25,
 100,
 43,
 838,
 112,
 50,
 670,
 22665,
 9,
 35,
 480,
 284,
 5,
 150,
 4,
 172,
 112,
 167,
 21631,
 336,
 385,
 39,
 4,
 172,
 4536,
 1111,
 17,
 546,
 38,
 13,
 447,
 4,
 192,
 50,
 16,
 6,
 147,
 2025,
 19,
 14,
 22,
 4,
 1920,
 4613,
 469,
 4,
 22,
 71,
 87,
 12,
 16,
 43,
 530,
 38,
 76,
 15,
 13,
 1247,
 4,
 22,
 17,
 515,
 17,
 12,
 16,
 626,
 18,
 19193,
 5,
 62,
 386,
 12,
 8,
 316,
 8,
 106,
 5,
 4,
 2223,
 5244,
 16,
 480,
 66,
 3785,
 33,
 4,
 130,
 12,
 16,
 38,
 619,
 5,
 25,
 124,
 51,
 36,
 135,
 48,
 25,
 1415,
 33,
 6,
 22,
 12,
 215,
 28,
 77,
 52,
 5,
 14,
 407,
 16,
 82,
 10311,
 8,
 4,
 107,
 117,
 5952,
 15,
 256,
 4,
 31050,
 7,
 3766,
 5,
 723,
 36,
 71,
 43,
 530,
 476,
 26,
 400,
 317,
 46,
 7,
 4,
 12118,
 1029,
 13,
 104,
 88,
 4,
 381,
 15,
 297,
 98,
 32,
 2071,
 56,
 26,
 141,
 6,
 194,
 7486,
 18,
 4,
 226,
 22,
 21,
 134,
 476,
 26,
 480,
 5

In [0]:
len(train_data[0]), len(train_data[1])

(218, 189)

In [0]:
# padding all sequences to the length of 250. adds zeros to front
train_data = sequence.pad_sequences(train_data, MAXLEN)
test_data = sequence.pad_sequences(test_data, MAXLEN)

In [0]:
len(train_data[0]), len(train_data[1])

(250, 250)

In [0]:
# model
inputs = tf.keras.Input(shape=(250,))
embed = tf.keras.layers.Embedding(VOCAB_SIZE, 32)(inputs)
lstm1 = tf.keras.layers.LSTM(32)(embed)
outputs = tf.keras.layers.Dense(1, activation='sigmoid')(lstm1)
model = tf.keras.models.Model(inputs=inputs, outputs=outputs)

In [0]:
model.summary()

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_4 (InputLayer)         [(None, 250)]             0         
_________________________________________________________________
embedding_3 (Embedding)      (None, 250, 32)           2834688   
_________________________________________________________________
lstm_3 (LSTM)                (None, 32)                8320      
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 33        
Total params: 2,843,041
Trainable params: 2,843,041
Non-trainable params: 0
_________________________________________________________________


In [0]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
history = model.fit(train_data, train_labels, epochs=10, validation_split=0.2)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7fb2d4ad8dd8>

In [0]:
results = model.evaluate(test_data, test_labels)
results



[0.7219229936599731, 0.8379600048065186]

In [0]:
# making predictions

word_index = imdb.get_word_index()

def encode_text(text):
  tokens = tf.keras.preprocessing.text.text_to_word_sequence(text)
  print(tokens)
  tokens = [word_index[word] if word in word_index else 0 for word in tokens]
  # works on list of lists, so the first one is only required.
  return sequence.pad_sequences([tokens], MAXLEN)[0]


text = 'that movie was just amazing, so amazing'
encoded = encode_text(text)
print(encoded)

['that', 'movie', 'was', 'just', 'amazing', 'so', 'amazing']
[  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   

In [0]:
import numpy as np
def predict(text):
  encoded_text = encode_text(text)
  pred = np.zeros((1, 250))
  pred[0] = encoded_text
  # list of lists
  result = model.predict(pred)
  print(result[0])


positive_review = 'That movie was so awesome! I really loved it and would watch it again because it was amazingly great'
predict(positive_review)
negative_review = '''That movie sucked. I hated it and wouldn't watch it again. Was one of the worst things I've ever watched'''
predict(negative_review) 

['that', 'movie', 'was', 'so', 'awesome', 'i', 'really', 'loved', 'it', 'and', 'would', 'watch', 'it', 'again', 'because', 'it', 'was', 'amazingly', 'great']
[0.986652]
['that', 'movie', 'sucked', 'i', 'hated', 'it', 'and', "wouldn't", 'watch', 'it', 'again', 'was', 'one', 'of', 'the', 'worst', 'things', "i've", 'ever', 'watched']
[0.0868957]
