<a href="https://colab.research.google.com/github/GauraoM/Tensorflow/blob/main/Natural_Language_Processing_with_RNNs.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#### Bag of words
A bag-of-words encoding does not preserve the order of the words, but it does keep track of how often each word occurs.

In [1]:
vocab = {}  # maps each word seen so far to the integer that encodes it
word_encoding = 1  # next unused integer encoding; grows as new words are seen


def bag_of_words(text):
  """Encode `text` as a bag of words.

  Word order is discarded; only how often each word occurs is kept. Words
  are lower-cased and separated on any run of whitespace. Words not seen
  before are added to the module-level `vocab` with the next free integer,
  so encodings stay consistent across calls.

  Args:
    text: the string to encode.

  Returns:
    A dict mapping each word's integer encoding to the number of times the
    word occurs in `text`. Empty or whitespace-only text gives `{}`.
  """
  global word_encoding

  # split() with no argument splits on any run of whitespace, so repeated
  # spaces, tabs, newlines or an empty string never produce an empty ""
  # token (split(" ") would register "" as a word in those cases).
  words = text.lower().split()
  bag = {}  # encoding -> frequency

  for word in words:
    if word not in vocab:
      vocab[word] = word_encoding  # assign the next free encoding to a new word
      word_encoding += 1
    encoding = vocab[word]
    bag[encoding] = bag.get(encoding, 0) + 1

  return bag

# Demo: encode a sample sentence, then show its bag and the vocabulary built so far
text = "this is a test to see if this test will work is is test a a"
bag = bag_of_words(text)
print(bag, vocab, sep="\n")

{1: 2, 2: 3, 3: 3, 4: 3, 5: 1, 6: 1, 7: 1, 8: 1, 9: 1}
{'this': 1, 'is': 2, 'a': 3, 'test': 4, 'to': 5, 'see': 6, 'if': 7, 'will': 8, 'work': 9}


#### IMDB Movie Review Dataset
This dataset contains 25,000 reviews from IMDB where each one is already preprocessed and has a label as either positive or negative.

In [2]:
from keras.datasets import imdb
from keras.preprocessing import sequence
import keras
import tensorflow as tf
import os
import numpy as np

VOCAB_SIZE = 88584  # passed to load_data as num_words and used as the first argument of the Embedding layer

MAXLEN = 250  # target length handed to pad_sequences for every review
BATCH_SIZE = 64  # NOTE(review): never passed to model.fit in this section, so it has no effect here

# Load the train/test reviews and their labels; num_words caps the vocabulary that is kept
(train_data, train_labels), (test_data, test_labels) = imdb.load_data(num_words = VOCAB_SIZE)

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz


In [3]:
# Check one review: it is stored as a list of integer word indices
train_data[1]

[1,
 194,
 1153,
 194,
 8255,
 78,
 228,
 5,
 6,
 1463,
 4369,
 5012,
 134,
 26,
 4,
 715,
 8,
 118,
 1634,
 14,
 394,
 20,
 13,
 119,
 954,
 189,
 102,
 5,
 207,
 110,
 3103,
 21,
 14,
 69,
 188,
 8,
 30,
 23,
 7,
 4,
 249,
 126,
 93,
 4,
 114,
 9,
 2300,
 1523,
 5,
 647,
 4,
 116,
 9,
 35,
 8163,
 4,
 229,
 9,
 340,
 1322,
 4,
 118,
 9,
 4,
 130,
 4901,
 19,
 4,
 1002,
 5,
 89,
 29,
 952,
 46,
 37,
 4,
 455,
 9,
 45,
 43,
 38,
 1543,
 1905,
 398,
 4,
 1649,
 26,
 6853,
 5,
 163,
 11,
 3215,
 10156,
 4,
 1153,
 9,
 194,
 775,
 7,
 8255,
 11596,
 349,
 2637,
 148,
 605,
 15358,
 8003,
 15,
 123,
 125,
 68,
 23141,
 6853,
 15,
 349,
 165,
 4362,
 98,
 5,
 4,
 228,
 9,
 43,
 36893,
 1157,
 15,
 299,
 120,
 5,
 120,
 174,
 11,
 220,
 175,
 136,
 50,
 9,
 4373,
 228,
 8255,
 5,
 25249,
 656,
 245,
 2350,
 5,
 4,
 9837,
 131,
 152,
 491,
 18,
 46151,
 32,
 7464,
 1212,
 14,
 9,
 6,
 371,
 78,
 22,
 625,
 64,
 1382,
 9,
 8,
 168,
 145,
 23,
 4,
 1690,
 15,
 16,
 4,
 1355,
 5,
 28,
 6,
 52,
 

#### Preprocessing
Because the reviews have different lengths, we cannot feed them to the model as they are, so we have to pad each review to the same length (`MAXLEN`).

In [4]:
# Bring every review to the same length (MAXLEN) so they can be fed to the model as one array
train_data = sequence.pad_sequences(train_data, maxlen=MAXLEN)
test_data = sequence.pad_sequences(test_data, maxlen=MAXLEN)

#### Creating the model

In [5]:
# Sentiment classifier: embedding -> LSTM -> one sigmoid output unit
model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(VOCAB_SIZE, 32))  # 32 is the dimension of the vector produced for each word index
model.add(tf.keras.layers.LSTM(32))
model.add(tf.keras.layers.Dense(1, activation="sigmoid"))  # single value in (0, 1)

In [6]:
# Show each layer's output shape and parameter count
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 32)          2834688   
                                                                 
 lstm (LSTM)                 (None, 32)                8320      
                                                                 
 dense (Dense)               (None, 1)                 33        
                                                                 
Total params: 2,843,041
Trainable params: 2,843,041
Non-trainable params: 0
_________________________________________________________________


#### Train the model

In [7]:
# Compile the model: binary cross-entropy loss, RMSprop optimizer, accuracy reported as 'acc'
model.compile(loss="binary_crossentropy",optimizer="rmsprop",metrics=['acc'])
# Train for 10 epochs with a validation split of 20% taken from the training data
history = model.fit(train_data, train_labels, epochs=10, validation_split=0.2)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


#### Evaluate the performance

In [8]:
# Evaluate on the test split; results holds the loss followed by the 'acc' metric requested in compile
results = model.evaluate(test_data, test_labels)
print(results)

[0.47738516330718994, 0.8573600053787231]


#### Making predictions
Since the reviews are encoded as integers, we need to convert any review that we write into the same form so the network can understand it.
To do that, we'll load the encodings from the dataset and use them to encode our own data.

In [9]:
# Load the encodings: a dict mapping each word to its integer index
word_index= imdb.get_word_index()

# Function to encode the text
def encode_text(text):
  """Turn a raw review string into a padded vector of word indices.

  The text is tokenized with Keras' text_to_word_sequence, each token is
  looked up in `word_index` (tokens that are not in it become 0), and the
  resulting sequence is passed through pad_sequences with MAXLEN.

  NOTE(review): imdb.load_data shifts word indices by `index_from`
  (default 3), while get_word_index returns the unshifted indices, so these
  encodings may not line up with the training data. Confirm against the
  Keras IMDB documentation before relying on the predictions.

  Args:
    text: the review to encode.

  Returns:
    The single padded row produced by pad_sequences for this review.
  """
  words = keras.preprocessing.text.text_to_word_sequence(text)
  indices = [word_index.get(word, 0) for word in words]  # 0 for unknown words
  padded = sequence.pad_sequences([indices], MAXLEN)
  return padded[0]


# Encode a hand-written review and show the padded integer vector
text = "that movie was just amazing, so amazing"
encoded = encode_text(text)
print(encoded)

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb_word_index.json
[  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0 

In [13]:
# Decoding the text

# Invert word_index so that an integer index maps back to its word
reverse_word_index = {value: key for (key,value) in word_index.items()}

def decode_integers(integers):
  """Turn a sequence of word indices back into a space-separated string.

  Entries equal to 0 (the padding value) are skipped; every other entry is
  looked up in `reverse_word_index`.

  Args:
    integers: iterable of integer word indices.

  Returns:
    The decoded words joined by single spaces ("" if nothing but padding).

  Raises:
    KeyError: if a non-zero index is not in `reverse_word_index`.
  """
  PAD = 0
  words = [reverse_word_index[num] for num in integers if num != PAD]
  return " ".join(words)

# Round-trip check: decode the review that was encoded earlier
print(decode_integers(encoded))


that movie was just amazing so amazing


In [14]:
# now time to make a prediction

def predict(text):
  """Print the model's output for one raw review string.

  The review is encoded with encode_text, wrapped in a batch of one, and
  passed to the module-level `model`. Note that `model` is rebound later in
  this notebook, so this must be run while `model` is still the review
  classifier.

  Args:
    text: the review to score.

  Returns:
    None; the prediction for the single review is printed.
  """
  encoded_text = encode_text(text)  # Encode the text
  # Size the batch from the encoded review itself instead of a hard-coded
  # 250, so this keeps working if MAXLEN is changed.
  pred = np.zeros((1, encoded_text.shape[0]))
  pred[0] = encoded_text  # Insert the encoded text as the only row
  result = model.predict(pred)
  print(result[0])

# Score one hand-written positive review and one negative review
positive_review = "That movie was! really loved it and would great watch it again because it was amazingly great"
predict(positive_review)

negative_review = "that movie really sucked. I hated it and wouldn't watch it again. Was one of the worst things I've ever watched"
predict(negative_review)

[0.960262]
[0.6202364]


### RNN play generator
Here we will simply show the RNN an example of something we want it to recreate, and it will learn how to write a version of it on its own.

In [43]:
from keras.preprocessing import sequence
import keras
import tensorflow as tf
import os
import numpy as np

In [44]:
# Get the dataset: get_file is given a file name and a URL, and its result is used below as the path to open
path_to_file = tf.keras.utils.get_file('shakespeare.txt', 'https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt')

In [45]:
# Read the content of file

# Read the raw bytes, then decode them as UTF-8. The context manager closes
# the file handle as soon as the read is done instead of leaving it open.
with open(path_to_file, 'rb') as f:
  text = f.read().decode(encoding='utf-8')
# length of text is the number of characters in it
print ('Length of text: {} characters'.format(len(text)))

Length of text: 1115394 characters


In [46]:
# Print the first 250 characters of the text as a quick look at its format
print(text[:250])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.



In [48]:
# NOTE: this rebinds `vocab`, which the bag-of-words cell at the top uses as a dict
vocab = sorted(set(text)) # sorted list of the unique characters in the text

# Mapping the unique characters to indices
char2idx = {u:i for i,u in enumerate(vocab)} # character -> its position in vocab
print(char2idx)
idx2char = np.array(vocab) # array form, so an index (or array of indices) looks up its character
print(idx2char)

def text_to_int(text):
  """Encode a string as a NumPy array of character indices.

  Args:
    text: the string to encode; each character is looked up in `char2idx`.

  Returns:
    A NumPy array with one index per character of `text`.

  Raises:
    KeyError: if a character is not a key of `char2idx`.
  """
  indices = [char2idx[char] for char in text]
  return np.array(indices)

# Encode the whole text once; later cells slice this array and build the dataset from it
text_as_int = text_to_int(text)

{'\n': 0, ' ': 1, '!': 2, '$': 3, '&': 4, "'": 5, ',': 6, '-': 7, '.': 8, '3': 9, ':': 10, ';': 11, '?': 12, 'A': 13, 'B': 14, 'C': 15, 'D': 16, 'E': 17, 'F': 18, 'G': 19, 'H': 20, 'I': 21, 'J': 22, 'K': 23, 'L': 24, 'M': 25, 'N': 26, 'O': 27, 'P': 28, 'Q': 29, 'R': 30, 'S': 31, 'T': 32, 'U': 33, 'V': 34, 'W': 35, 'X': 36, 'Y': 37, 'Z': 38, 'a': 39, 'b': 40, 'c': 41, 'd': 42, 'e': 43, 'f': 44, 'g': 45, 'h': 46, 'i': 47, 'j': 48, 'k': 49, 'l': 50, 'm': 51, 'n': 52, 'o': 53, 'p': 54, 'q': 55, 'r': 56, 's': 57, 't': 58, 'u': 59, 'v': 60, 'w': 61, 'x': 62, 'y': 63, 'z': 64}
['\n' ' ' '!' '$' '&' "'" ',' '-' '.' '3' ':' ';' '?' 'A' 'B' 'C' 'D' 'E'
 'F' 'G' 'H' 'I' 'J' 'K' 'L' 'M' 'N' 'O' 'P' 'Q' 'R' 'S' 'T' 'U' 'V' 'W'
 'X' 'Y' 'Z' 'a' 'b' 'c' 'd' 'e' 'f' 'g' 'h' 'i' 'j' 'k' 'l' 'm' 'n' 'o'
 'p' 'q' 'r' 's' 't' 'u' 'v' 'w' 'x' 'y' 'z']


In [49]:
# Compare the first 13 characters with their integer encoding
print("Text:", text[:13])
print("Encoded:", text_to_int(text[:13]))

Text: First Citizen
Encoded: [18 47 56 57 58  1 15 47 58 47 64 43 52]


In [50]:
def int_to_text(ints):
  """Decode character indices back into a string.

  Args:
    ints: indices into `idx2char`. If the object has a `.numpy()` method it
      is converted with it first; anything else is used as given.

  Returns:
    The characters selected from `idx2char`, joined into one string.
  """
  try:
    ints = ints.numpy()
  except AttributeError:
    # No .numpy() method: use the value as it is. Only this case is
    # ignored, so any other error still surfaces instead of being swallowed
    # by a bare except.
    pass
  return ''.join(idx2char[ints])

# Decode the first 13 encoded characters back to text
print(int_to_text(text_as_int[:13]))

First Citizen


#### Creating training example

In [51]:
seq_length = 100  # length of sequence for a training example
examples_per_epoch = len(text)//(seq_length+1)  # number of whole (seq_length+1)-character chunks that fit in the text

# Create training examples / targets
# Build a dataset from the encoded text (one element per character index)
char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)

In [52]:
# Group the stream of characters into chunks of seq_length+1 (one extra character so
# input and target can be offset by one); an incomplete final chunk is dropped
sequences = char_dataset.batch(seq_length+1, drop_remainder=True)

In [54]:
def split_input_target(chunk):
    """Split a chunk into an (input, target) pair offset by one position.

    The input is the chunk without its last element and the target is the
    chunk without its first element, e.g. "hello" -> ("hell", "ello").

    Args:
        chunk: any sliceable sequence.

    Returns:
        A tuple (input_text, target_text).
    """
    return chunk[:-1], chunk[1:]

dataset = sequences.map(split_input_target) # turn every chunk into an (input, target) pair

#### Make Training batches 

In [55]:
# NOTE: BATCH_SIZE and VOCAB_SIZE rebind the constants of the same name from the IMDB section
BATCH_SIZE = 64
VOCAB_SIZE = len(vocab)  # number of unique characters
EMBEDDING_DIM = 256
RNN_UNITS = 1024

BUFFER_SIZE = 10000  # passed to dataset.shuffle below

# Shuffle the (input, target) pairs and group them into batches of BATCH_SIZE,
# dropping a final incomplete batch
data = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)

#### Build the model

In [68]:
def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
  """Build the character-level model: Embedding -> stateful LSTM -> Dense.

  Args:
    vocab_size: size of the embedding vocabulary and of the output layer.
    embedding_dim: size of each embedding vector.
    rnn_units: number of LSTM units.
    batch_size: fixed batch size baked into the input shape
      ([batch_size, None]); the LSTM is stateful.

  Returns:
    An uncompiled tf.keras.Sequential model. The Dense output layer has no
    activation, so it emits raw scores for each of the vocab_size classes
    at every time step.
  """
  embedding = tf.keras.layers.Embedding(
      vocab_size, embedding_dim, batch_input_shape=[batch_size, None])
  lstm = tf.keras.layers.LSTM(
      rnn_units,
      return_sequences=True,  # one output per time step, not just the last
      stateful=True,
      recurrent_initializer='glorot_uniform')
  scores = tf.keras.layers.Dense(vocab_size)
  return tf.keras.Sequential([embedding, lstm, scores])

# Build the training model with the training batch size.
# NOTE: this rebinds `model`, which previously held the IMDB review classifier.
model = build_model(VOCAB_SIZE,EMBEDDING_DIM, RNN_UNITS, BATCH_SIZE)
model.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     (64, None, 256)           16640     
                                                                 
 lstm_3 (LSTM)               (64, None, 1024)          5246976   
                                                                 
 dense_3 (Dense)             (64, None, 65)            66625     
                                                                 
Total params: 5,330,241
Trainable params: 5,330,241
Non-trainable params: 0
_________________________________________________________________


#### Creating Loss Function

In [57]:
# Looking at sample input and output for the untrained model

# take(1) yields a single batch; the loop variables stay defined for the cells below
for input_example_batch, target_example_batch in data.take(1):
  example_batch_predictions = model(input_example_batch)  # ask model for a prediction on our first batch of training data (64 entries)
  print(example_batch_predictions.shape, ": (batch_size, sequence_length, vocab_size)") 

(64, 100, 65) : (batch_size, sequence_length, vocab_size)


In [58]:
# Now we can see that the prediction is an array of 64 arrays, one for each entry in the batch
print(len(example_batch_predictions))
print(example_batch_predictions)

64
tf.Tensor(
[[[-3.83939850e-03  3.58762394e-04 -1.93279632e-03 ... -2.76192511e-03
   -2.69576744e-03 -8.59825930e-04]
  [-6.21288968e-03 -4.17579402e-04 -4.10450110e-03 ... -4.35518287e-03
   -4.66823531e-03 -2.61309184e-03]
  [ 2.43651331e-03 -2.34970776e-03 -2.72694830e-04 ...  5.72698307e-04
   -3.08301486e-03 -3.37857124e-03]
  ...
  [-4.54373937e-03 -1.20143257e-02  4.02230391e-04 ... -1.09200692e-03
   -1.37980573e-03  1.17104976e-02]
  [-9.58475284e-04 -3.45858093e-03 -1.21976202e-03 ...  2.44349777e-03
   -3.06397630e-03  1.20440405e-02]
  [-5.10210264e-03 -2.17005261e-03 -5.84709598e-03 ...  9.00705904e-03
    4.59635118e-03  8.84665363e-03]]

 [[-1.47111085e-03 -2.14975653e-03 -5.36827696e-03 ...  2.00513913e-03
    1.27182459e-03  5.73710725e-03]
  [ 2.56006746e-03 -3.26226000e-03 -4.34269151e-03 ...  2.21574563e-03
    6.30671624e-03  1.36583834e-03]
  [ 6.32131193e-03  6.04782312e-04 -7.64064584e-03 ...  4.14719619e-03
    6.66456670e-03 -5.53358486e-03]
  ...
  [ 2.348

In [59]:
# Let's examine one prediction: the first entry of the batch
pred = example_batch_predictions[0]
print(len(pred))
# this is a 2d array of length 100, where each interior array is the prediction for the next character at each time step
print(pred)

100
tf.Tensor(
[[-0.0038394   0.00035876 -0.0019328  ... -0.00276193 -0.00269577
  -0.00085983]
 [-0.00621289 -0.00041758 -0.0041045  ... -0.00435518 -0.00466824
  -0.00261309]
 [ 0.00243651 -0.00234971 -0.00027269 ...  0.0005727  -0.00308301
  -0.00337857]
 ...
 [-0.00454374 -0.01201433  0.00040223 ... -0.00109201 -0.00137981
   0.0117105 ]
 [-0.00095848 -0.00345858 -0.00121976 ...  0.0024435  -0.00306398
   0.01204404]
 [-0.0051021  -0.00217005 -0.0058471  ...  0.00900706  0.00459635
   0.00884665]], shape=(100, 65), dtype=float32)


In [60]:
# finally we look at a prediction at the first timestep
time_pred = pred[0]
print(len(time_pred))
print(time_pred) # 65 raw scores (logits), one per character in the vocabulary, for the next character; these are not probabilities because the output Dense layer has no activation

65
tf.Tensor(
[-0.0038394   0.00035876 -0.0019328   0.00399611 -0.00434391 -0.00574383
 -0.0024333   0.00377613 -0.00123772 -0.00090148  0.00364577  0.00380444
 -0.00220031 -0.00024531 -0.00037837  0.00301075  0.00375164 -0.00179855
  0.00521792 -0.00226365 -0.0043562  -0.00146387  0.00093601  0.00300207
 -0.00033476 -0.00375416  0.00023324  0.00144241  0.00677537 -0.00552182
  0.00246178 -0.00086393 -0.00212277 -0.00114848 -0.00132068  0.00303585
 -0.00155842  0.00346775 -0.00164634  0.00100648 -0.00131003 -0.00079564
  0.00129408 -0.00457165  0.00111241  0.00406929  0.00298475 -0.00017149
 -0.00220009 -0.00133352 -0.00031319 -0.0003282   0.00836229  0.00095998
  0.00649619 -0.00046698 -0.00149029 -0.00011604  0.00488313 -0.00173706
  0.00288528 -0.00386057 -0.00276193 -0.00269577 -0.00085983], shape=(65,), dtype=float32)


In [61]:
def loss(labels, logits):
  """Sparse categorical cross-entropy between integer labels and raw model scores.

  Args:
    labels: the target class indices.
    logits: the model's unnormalized output scores (from_logits=True is
      passed, so they are not expected to be probabilities).

  Returns:
    The value returned by tf.keras.losses.sparse_categorical_crossentropy.
  """
  return tf.keras.losses.sparse_categorical_crossentropy(
      y_true=labels, y_pred=logits, from_logits=True)

In [62]:
# Compile the model with the Adam optimizer and the custom `loss` function
model.compile(optimizer='adam', loss=loss)

In [63]:
# Directory where the checkpoints will be saved
checkpoint_dir = './training_checkpoints'
# Name of the checkpoint files; presumably "{epoch}" is filled in by the callback with the epoch number (confirm in the ModelCheckpoint docs)
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

# Callback handed to model.fit; save_weights_only=True stores only the weights, not the full model
checkpoint_callback=tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True)

In [64]:
# Train for 20 epochs with the checkpoint callback attached.
# NOTE: this rebinds `history` from the IMDB section.
history = model.fit(data, epochs=20, callbacks=[checkpoint_callback])

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


#### Loading the model
Rebuild the model from a checkpoint using a batch_size of 1 so that we can feed one piece of text to the model and have it make a prediction.

In [69]:
# Same architecture as the training model, but built with a batch size of 1 for text generation
model = build_model(VOCAB_SIZE, EMBEDDING_DIM, RNN_UNITS, batch_size=1)

In [70]:
# Load the weights from the most recent checkpoint in checkpoint_dir into the batch-size-1 model
model.load_weights(tf.train.latest_checkpoint(checkpoint_dir))
model.build(tf.TensorShape([1, None]))

In [74]:
def generate_text(model, start_string, num_generate=800, temperature=1.0):
  """Generate text by sampling one character at a time from `model`.

  The seed is encoded with `char2idx` and fed to the model as a batch of
  one. At each step the scores for the last position are divided by
  `temperature`, a character index is sampled from them with
  tf.random.categorical, and that single character becomes the next input
  (the model is stateful, so it carries its state between calls).

  Args:
    model: the model to sample from; it is called on a batch of size 1.
    start_string: non-empty seed text; every character must be a key of
      `char2idx`.
    num_generate: number of characters to generate (default 800).
    temperature: positive number the scores are divided by before sampling.
      Low temperatures result in more predictable text, higher temperatures
      in more surprising text (default 1.0).

  Returns:
    `start_string` followed by the generated characters.

  Raises:
    ValueError: if `start_string` is empty or `temperature` is not positive.
    KeyError: if `start_string` contains a character missing from `char2idx`.
  """
  if not start_string:
    raise ValueError("start_string must contain at least one character")
  if temperature <= 0:
    raise ValueError("temperature must be positive")

  # Converting our start string to numbers (vectorizing)
  input_eval = [char2idx[s] for s in start_string]
  input_eval = tf.expand_dims(input_eval, 0)  # add the batch dimension: [] -> [[]]

  # Generated characters, joined into one string at the end
  text_generated = []

  # Here batch size == 1
  model.reset_states()
  for _ in range(num_generate):
    predictions = model(input_eval)

    # remove the batch dimension: [[]] -> []
    predictions = tf.squeeze(predictions, 0)

    # sample the next character from the scores of the last position
    predictions = predictions / temperature
    predicted_id = tf.random.categorical(predictions, num_samples=1)[-1,0].numpy()

    # We pass the predicted character as the next input to the model
    # along with the previous hidden state
    input_eval = tf.expand_dims([predicted_id], 0)

    text_generated.append(idx2char[predicted_id])

  return start_string + ''.join(text_generated)

In [75]:
# Ask for a seed string and print the generated text.
# Every character typed must be a key of char2idx, otherwise generate_text raises KeyError.
inp = input("Type a starting string: ")
print(generate_text(model, inp))

Type a starting string: romeo
romeother honour,
Say you that some warrant with this kneel
Scoppary help there. -cause me not too much;
If you take order,
That he will raise her toman and twenty steed
Through disting'd the noon-thou for your countrymand.

YORK:
Shall we to Choid upon your housey--and foothus to
your command. What is the bed by the botton jury out,
And still their fearful arms in Death.

KING RICHARD II:
O Bulourelord gops himself into grief:
Now I must resolve me to thee! aras calm, come both.

CATESBY:
I'st it good, more:
They shall yet dead a
Florious shoulder mine: I did se out
And throws what bad, what, and kiss you in mine ears,
His glassy counsel, lords, with whom and clear
Makes me not pitizen:
And, for the morning pewders did exel him.
First Warwick, this is upon her maid-paleaple.

PETRUCHIO:
What?

