# NLP
### 1. Bag of Words

In [1]:
vocab = {}  # maps word to integer representing it
word_encoding = 1
def bag_of_words(text):
  global word_encoding

  words = text.lower().split(" ")  # create a list of all of the words in the text, well assume there is no grammar in our text for this example
  bag = {}  # stores all of the encodings and their frequency

  for word in words:
    if word in vocab:
      encoding = vocab[word]  # get encoding from vocab
    else:
      vocab[word] = word_encoding
      encoding = word_encoding
      word_encoding += 1
    
    if encoding in bag:
      bag[encoding] += 1
    else:
      bag[encoding] = 1
  
  return bag

text = "this is a test to see if this test will work is is test a a"
bag = bag_of_words(text)
print(bag)
print(vocab)

{1: 2, 2: 3, 3: 3, 4: 3, 5: 1, 6: 1, 7: 1, 8: 1, 9: 1}
{'this': 1, 'is': 2, 'a': 3, 'test': 4, 'to': 5, 'see': 6, 'if': 7, 'will': 8, 'work': 9}


Bag of Words loses the order in which words appear. When this happens, 2 sentences (as below) that have opposite meanings are taken to be the same.

In [2]:
positive_review = "I thought the movie was going to be bad but it was actually amazing"
negative_review = "I thought the movie was going to be amazing but it was actually bad"

pos_bag = bag_of_words(positive_review)
neg_bag = bag_of_words(negative_review)

print("Positive:", pos_bag)
print("Negative:", neg_bag)

Positive: {10: 1, 11: 1, 12: 1, 13: 1, 14: 2, 15: 1, 5: 1, 16: 1, 17: 1, 18: 1, 19: 1, 20: 1, 21: 1}
Negative: {10: 1, 11: 1, 12: 1, 13: 1, 14: 2, 15: 1, 5: 1, 16: 1, 21: 1, 18: 1, 19: 1, 20: 1, 17: 1}


### 2. LSTM
Sentiment analysis of moview reviews

In [3]:
from keras.datasets import imdb
from keras.preprocessing import sequence
import tensorflow as tf
import os
import numpy as np

##### Load dataset
We'll use the IMDB movie review dataset from keras - containing 25,000 reviews where each is already preprocessed and has a label as either 'positive' or 'negative'. <br>
Each review is encoded by integers that represents how common a word is in the entire dataset. For example, a word encoded by the integer 3 means that it is the 3rd most common word in the dataset.

In [4]:
VOCAB_SIZE = 88584 # no. of unique words
MAXLEN = 250 # length of longest review (no. of words)
BATCH_SIZE = 64

(train_data, train_labels), (test_data, test_labels) = imdb.load_data(num_words = VOCAB_SIZE)

# Let's look at 1 review
train_data[0]

[1,
 14,
 22,
 16,
 43,
 530,
 973,
 1622,
 1385,
 65,
 458,
 4468,
 66,
 3941,
 4,
 173,
 36,
 256,
 5,
 25,
 100,
 43,
 838,
 112,
 50,
 670,
 22665,
 9,
 35,
 480,
 284,
 5,
 150,
 4,
 172,
 112,
 167,
 21631,
 336,
 385,
 39,
 4,
 172,
 4536,
 1111,
 17,
 546,
 38,
 13,
 447,
 4,
 192,
 50,
 16,
 6,
 147,
 2025,
 19,
 14,
 22,
 4,
 1920,
 4613,
 469,
 4,
 22,
 71,
 87,
 12,
 16,
 43,
 530,
 38,
 76,
 15,
 13,
 1247,
 4,
 22,
 17,
 515,
 17,
 12,
 16,
 626,
 18,
 19193,
 5,
 62,
 386,
 12,
 8,
 316,
 8,
 106,
 5,
 4,
 2223,
 5244,
 16,
 480,
 66,
 3785,
 33,
 4,
 130,
 12,
 16,
 38,
 619,
 5,
 25,
 124,
 51,
 36,
 135,
 48,
 25,
 1415,
 33,
 6,
 22,
 12,
 215,
 28,
 77,
 52,
 5,
 14,
 407,
 16,
 82,
 10311,
 8,
 4,
 107,
 117,
 5952,
 15,
 256,
 4,
 31050,
 7,
 3766,
 5,
 723,
 36,
 71,
 43,
 530,
 476,
 26,
 400,
 317,
 46,
 7,
 4,
 12118,
 1029,
 13,
 104,
 88,
 4,
 381,
 15,
 297,
 98,
 32,
 2071,
 56,
 26,
 141,
 6,
 194,
 7486,
 18,
 4,
 226,
 22,
 21,
 134,
 476,
 26,
 480,
 5

##### Data preprocessing
We have to make all our reviews be of equal lengths  before we pass them into neural networks.<br>
So we follow the procedure below:
- if review > 250 words, trim off extra words
- if review < 250 words, pad remaining spaces with 0s

In [7]:
train_data = sequence.pad_sequences(train_data, MAXLEN)
test_data = sequence.pad_sequences(test_data, MAXLEN)

##### Build the model

In [8]:
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(VOCAB_SIZE, 32), # 32 stands for vector dimensions for each word (but this can be changed)
    tf.keras.layers.LSTM(32), 
    tf.keras.layers.Dense(1, activation='sigmoid')
    # 'sigmoid' cuz result has to be between 0 & 1
    # >0.5 is +ve
    # <0.5 is -ve
])

In [9]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 32)          2834688   
                                                                 
 lstm (LSTM)                 (None, 32)                8320      
                                                                 
 dense (Dense)               (None, 1)                 33        
                                                                 
Total params: 2,843,041
Trainable params: 2,843,041
Non-trainable params: 0
_________________________________________________________________


##### Train the model

In [12]:
model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['acc'])
history = model.fit(
    train_data, train_labels, epochs=10, validation_split=0.2
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


##### Test the model

In [11]:
#model.save("4a) LSTM.h5")
new_model = tf.keras.models.load_model('4a) LSTM.h5')

results = new_model.evaluate(test_data, test_labels)
print(results)

[0.4178619980812073, 0.8602399826049805]


##### Make predictions

In [12]:
# Get lookup table - word-to-int mapping
word_index = imdb.get_word_index()

def encode_text(text):
    # tokens are the individual words themselves
    tokens = tf.keras.preprocessing.text.text_to_word_sequence(text)
    # create a for loop: if word is in mapping/vocabulary, replace with integer that represents it, otherwise put 0 (ie. we don't know what that word means)
    tokens = [word_index[word] if word in word_index else 0 for word in tokens]
    return sequence.pad_sequences([tokens], MAXLEN)[0]

# in case we have any movie reviews in code format
reverse_word_index = {value: key for (key, value) in word_index.items()}
def decode_ints(integers):
    PAD = 0
    text = ""
    for num in integers:
        if num != PAD:
            text += reverse_word_index[num] + " "
    return text[:-1] # return everything except the last space

In [13]:
#text = input("Please enter a movie review (string):")
text = "that movie was just amazing, so amazing"
encoded = encode_text(text)
print(encoded)
print(decode_ints(encoded))

[  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0  12  17  13  4

If you see a large bunch of zeroes, they might just be padding.

In [14]:
def predict(text):
    encoded_text = encode_text(text)
    pred = np.zeros((1,250))
    pred[0] = encoded_text
    result = new_model.predict(pred)
    print(result[0])

positive_review = "That movie was so awesome! I really loved it and would watch it again because it was amazingly great!"
predict(positive_review)

negative_review = "That movie sucked. I hated it and wouldn't watch it again. Was one of the worst things I've ever watched."
predict(negative_review)

[0.91993374]
[0.3243059]


In [17]:
predict("and bad bad bad bad bad.")

[0.292518]


### 3. RNN Play Generator
Predict the next character in a play's script

In [18]:
from keras.preprocessing import sequence
import keras
import tensorflow as tf
import os
import numpy as np

##### Load dataset
To train our neural network, we'll use an extract from a Shakespeare play.

In [20]:
path_to_file = tf.keras.utils.get_file('shakespeare.txt', 'https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt')

If you want to load your own data instead, you can run the code below. <br>
It allows you t upload a text file from a dialog box.

In [None]:
from google.colab import files
path_to_file = list(files.upload().keys())[0]

In [23]:
# Read and decode file
text = open(path_to_file, 'rb').read().decode(encoding='utf-8')
# Length of text = no. of characters in it
print(f"Length of text: {len(text)}\n")
print(text[:250])

Length of text: 1115394

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.



##### Data preprocessing

In [28]:
# Get unique characters in text
vocab = sorted(set(text))

# Create a mapping from unqiue characters to indices
char2idx = {u:i for i,u in enumerate(vocab)}
# Reverse mapping
idx2char = np.array(vocab)

# convert our string into an array of indices
def text_to_int(text):
    return np.array([char2idx[c] for c in text])

text_as_int = text_to_int(text)

# Let's look at how our text is encoded
print(f"Text: {text[:13]}")
print(f"Encoded: {text_to_int(text[:13])}")

Text: First Citizen
Encoded: [18 47 56 57 58  1 15 47 58 47 64 43 52]


In [29]:
# While we're at it, here's a function that converts numeric values to text
def int_to_text(ints):
    try:
        ints = ints.numpy()
    except:
        pass
    return ''.join(idx2char[ints])

print(int_to_text(text_as_int[:13]))

First Citizen


##### Create Training Examples
Now we'll splitour text data from above to manu shorter sequences that we can pass to the model as traning examples. <br>

Sample I/O:<br>
```input: 'Hell' | output: 'ello'```

We first create a stream of chars from our text data.

In [31]:
seq_length = 100 # no. of chars per sequence
examples_per_epoch = len(text)//(seq_length)

# Create traininhg examples
char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)

Now we convert the stream of chars into batches of 101 chars.

In [32]:
sequences = char_dataset.batch(seq_length+1, drop_remainder=True)

Now we split these sequences of 101 chars into input and output.

In [33]:
def split_input_target(chunk):
    input_text = chunk[:-1] # 'hell'
    target_text = chunk[1:] # 'ello'
    return input_text, target_text

# we use map to apply the above function to every entry
dataset = sequences.map(split_input_target)

for x,y in dataset.take(2):
    print("\n\nEXAMPLE\n")
    print("INPUT")
    print(int_to_text(x))
    print("\nOUTPUT")
    print(int_to_text(y))

Finally, we make training batches

In [34]:
BATCH_SIZE = 64 # 64 training examples
VOCAB_SIZE = len(vocab) # vocab = no. of unique chars in each example
EMBEDDING_DIM = 256
RNN_UNITS = 1024

BUFFER_SIZE = 10000

data = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)

##### Build the model

In [37]:
def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
    model = tf.keras.Sequential([
        tf.keras.layers.Embedding(vocab_size, embedding_dim, batch_input_shape=[batch_size, None]), # None = we don't know how many sequences are there
        tf.keras.layers.LSTM(rnn_units, return_sequences=True, stateful=True, recurrent_initializer='glorot_uniform'),
        tf.keras.layers.Dense(vocab_size)
        # last layer produces a probability distribution of all characters, showing likelihood of each appering next
    ])
    return model

model = build_model(VOCAB_SIZE, EMBEDDING_DIM, RNN_UNITS, BATCH_SIZE)
model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (64, None, 256)           16640     
                                                                 
 lstm_2 (LSTM)               (64, None, 1024)          5246976   
                                                                 
 dense_2 (Dense)             (64, None, 65)            66625     
                                                                 
Total params: 5,330,241
Trainable params: 5,330,241
Non-trainable params: 0
_________________________________________________________________


##### Create a loss function
But before that, let's look at a sample input & output from our untrained model.

In [40]:
for input_example_batch, target_example_batch in data.take(1):
    # ask our model for a prediction on 1st batch/training example
    example_batch_predictions = model(input_example_batch)
    print(example_batch_predictions.shape, "# (batch_size, sequence_length, vocab_size)")
    # where vocab_size of 65 is for probability distribution of all 65 characters

(64, 100, 65) # (batch_size, sequence_length, vocab_size)


In [41]:
# prediction is an array of 64 arrays, each array represents 1 batch/training example
print(len(example_batch_predictions)) # batch_size
print(example_batch_predictions)

64
tf.Tensor(
[[[ 1.53217535e-03  1.30362343e-03  3.74420267e-03 ... -7.37007894e-03
    2.88063148e-03 -9.65256430e-03]
  [ 6.19959552e-03  5.22169191e-03  9.13008116e-03 ... -1.58855598e-03
    6.22765953e-03 -1.18222255e-02]
  [ 4.37529664e-03 -4.17531293e-04  3.70234950e-03 ... -8.30525532e-05
    3.70584428e-03 -2.41887709e-03]
  ...
  [ 8.00169911e-03 -2.11814279e-03  1.04231182e-02 ... -8.20770394e-04
    1.85711146e-03 -4.35768720e-03]
  [ 2.28129956e-03  1.18407386e-03  1.23468544e-02 ... -2.38441117e-03
   -3.18082469e-03 -9.13249049e-03]
  [-1.39471353e-03  9.52456379e-04  6.47942629e-03 ... -3.01547442e-03
   -1.65773788e-03 -6.94557093e-03]]

 [[ 4.80364403e-03 -5.42866299e-03 -2.92617385e-03 ... -3.25497705e-03
    7.76823610e-04 -3.98815051e-03]
  [ 2.10861699e-03 -9.13864002e-03 -3.13550793e-03 ... -3.75876203e-03
   -7.67454039e-04 -1.78679032e-03]
  [-3.27400514e-03  7.58928712e-04 -3.78953945e-03 ... -3.90358455e-03
    3.71874310e-03  2.44273013e-03]
  ...
  [-3.340

In [44]:
# Let's examine 1 prediction
pred = example_batch_predictions[0]
print(len(pred)) # sequence_length
print(pred)

100
tf.Tensor(
[[ 1.53217535e-03  1.30362343e-03  3.74420267e-03 ... -7.37007894e-03
   2.88063148e-03 -9.65256430e-03]
 [ 6.19959552e-03  5.22169191e-03  9.13008116e-03 ... -1.58855598e-03
   6.22765953e-03 -1.18222255e-02]
 [ 4.37529664e-03 -4.17531293e-04  3.70234950e-03 ... -8.30525532e-05
   3.70584428e-03 -2.41887709e-03]
 ...
 [ 8.00169911e-03 -2.11814279e-03  1.04231182e-02 ... -8.20770394e-04
   1.85711146e-03 -4.35768720e-03]
 [ 2.28129956e-03  1.18407386e-03  1.23468544e-02 ... -2.38441117e-03
  -3.18082469e-03 -9.13249049e-03]
 [-1.39471353e-03  9.52456379e-04  6.47942629e-03 ... -3.01547442e-03
  -1.65773788e-03 -6.94557093e-03]], shape=(100, 65), dtype=float32)


In [45]:
# prediction at 1st time step
time_pred = pred[0]
print(len(time_pred)) # vocab_size
print(time_pred)

65
tf.Tensor(
[ 0.00153218  0.00130362  0.0037442   0.00089489  0.00871048  0.0034148
 -0.00145947  0.00774516 -0.0098937   0.00969511  0.00694717  0.00508652
 -0.00445506 -0.00489236 -0.00845601  0.00955643 -0.00515079 -0.01234208
  0.00113197  0.00020985  0.00670161 -0.00108051  0.00766491 -0.00369447
 -0.00352706  0.01216662  0.00113487 -0.00566349  0.00261889 -0.00186958
  0.01317033 -0.00246003 -0.0068601   0.00423525 -0.00493313 -0.00580219
 -0.00299917 -0.00647317  0.00350202  0.00036329 -0.0055357   0.00610654
  0.0058257   0.00508437  0.0028355   0.01812125  0.00082312 -0.01246747
 -0.000671   -0.00804754 -0.00428832  0.00793601  0.01125428 -0.00628819
  0.00182609  0.01236995 -0.00641539 -0.00023204  0.00413185  0.00211452
  0.00127123 -0.00070246 -0.00737008  0.00288063 -0.00965256], shape=(65,), dtype=float32)


In [47]:
# To get predicted character, we sample the output distribution (pick a char based on probability)
sampled_indices = tf.random.categorical(pred, num_samples=1)

# Reshape array & convery all ints to numbers to see actual chars
sampled_indices = np.reshape(sampled_indices, (1,-1))[0]
predicted_chars = int_to_text(sampled_indices)

print(predicted_chars)

,vP-cr
h-sHCQcw&,yQiFneZnfSav3ovXoUP,m!kuRGzrM;t$!pjlxlqld?HtI'bEa $
 Qw!VTETSuy?$PciWLIM?NEFTeuKN?D


Since model is untrained, what you see above is stream of predicted chars based on random weights & biases.

In [49]:
def loss(labels, logits):
    return tf.keras.losses.sparse_categorical_crossentropy(labels, logits, from_logits=True)

##### Compile the model

In [50]:
model.compile(optimizer='adam', loss=loss)

##### Create checkpoints
Setup and configure model to save checkpoint as it trains. This allows loading model from a checkpoint & continue training it.

In [53]:
# Directory where checkpoints will be saved
checkpoint_dir = './4b) training_checkpoints'
# Name of checkpoint files
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch")

checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath = checkpoint_prefix,
    save_weights_only = True
)

##### Train the model

In [54]:
history = model.fit(data, epochs=40, callbacks=[checkpoint_callback])

Epoch 1/40
 40/172 [=====>........................] - ETA: 16:22 - loss: 3.3271

KeyboardInterrupt: 

##### Load the model
We'll rebuild the model from a checkpoint using a batch_size of 1 so that we can feed one peice of text to the model and have it make a prediction.

In [None]:
model = build_model(VOCAB_SIZE, EMBEDDING_DIM, RNN_UNITS, batch_size=1)

Once the model finishes training, we can find the latest checkpoint that stores the model's weights.

In [None]:
model.load_weights(tf.train.latest_checkpoint(checkpoint_dir))
model.build(tf.TensorShape([1,None]))