# Week 9: Text embeddings

### Rasika Bhalerao

### Agenda:
- Intro to [assignment 9](https://github.com/MIDS-W207/coursework_2022/blob/main/Homework/09%20Embeddings%20for%20Text.ipynb)

In [20]:
### FROM HW9 ###


# Import the libraries we'll use below.
import numpy as np
from matplotlib import pyplot as plt
import pandas as pd
import seaborn as sns  # for nicer plots
sns.set(style="darkgrid")  # default style
import plotly.graph_objs as plotly  # for interactive plots

import tensorflow as tf
from tensorflow.keras.datasets import imdb

In [21]:
# Fetch the IMDB reviews. Every keyword is spelled out so the id layout is
# visible: 1 marks the start of a review, 2 replaces out-of-vocabulary
# tokens, and index_from=3 sets the lowest ids aside for special symbols.
(X_train, Y_train), (X_test, Y_test) = imdb.load_data(
    path="imdb.npz",
    num_words=None,
    skip_top=0,
    maxlen=None,
    seed=113,
    start_char=1,
    oov_char=2,
    index_from=3,
)

# Word -> id lookup shipped with the dataset (words are ordered by how
# frequently they occur). The raw ids are shifted by 3 so they line up with
# the ids produced by load_data(index_from=3) above.
index = {
    word: word_id + 3
    for word, word_id in imdb.get_word_index().items()
}

# Id -> word lookup, extended with the special symbols.
reverse_index = {word_id: word for word, word_id in index.items()}
reverse_index.update({
    1: '<START>',   # start of input
    2: '#',         # out-of-vocabulary (OOV)
    3: '<UNUSED>',
})

# Largest token id that has an entry in the reverse lookup.
max_id = max(reverse_index)

def decode(token_ids):
  """Turn a sequence of token ids back into a space-separated string.

  Ids without an entry in `reverse_index` are rendered as '#', the same
  symbol used for out-of-vocabulary tokens.
  """
  words = []
  for token_id in token_ids:
    words.append(reverse_index.get(token_id, "#"))
  return ' '.join(words)


def pad_data(sequences, max_length):
  """Pad or truncate every sequence to exactly `max_length` token ids.

  Args:
    sequences: iterable of integer id sequences, possibly of different lengths.
    max_length: number of ids per row in the result.

  Returns:
    A 2-D numpy array with one row per input sequence. Sequences shorter than
    `max_length` are filled with 0 at the end (padding='post').
  """
  # pad_sequences already returns one rectangular numpy array, so the result
  # is passed through as-is instead of being rebuilt from a list of rows
  # (which copied the data and lost the second axis for empty input).
  # NOTE(review): the truncation side is left at the Keras default -- confirm
  # that is the side intended for reviews longer than max_length.
  return np.asarray(
      tf.keras.preprocessing.sequence.pad_sequences(
          sequences, maxlen=max_length, padding='post', value=0))

# Give every training review exactly 300 token ids.
# NOTE(review): X_train_padded is assigned again later in this cell with
# max_length=20, which replaces this 300-token version.
X_train_padded = pad_data(
    sequences=X_train,
    max_length=300,
)


def limit_vocab(sequences, max_token_id, oov_id=2):
  """Return a copy of `sequences` with out-of-range token ids replaced.

  Every id that is >= max_token_id is rewritten to `oov_id`; smaller ids are
  kept as they are. The input itself is not modified.
  """
  capped = np.array(sequences, copy=True)
  out_of_range = capped >= max_token_id
  capped[out_of_range] = oov_id
  return capped

# Shorten the reviews to 20 tokens and reduce the vocabulary to ids below
# 1000 (anything larger becomes the OOV id). Only this 20-token version is
# assigned to X_train_reduced: reducing a longer padding first would be
# overwritten here before anything reads it.
X_train_padded = pad_data(X_train, max_length=20)
X_train_reduced = limit_vocab(X_train_padded, max_token_id=1000)

# Keras has a util to create one-hot encodings. It adds a trailing class axis,
# so the result is far larger than the id matrix it is built from.
X_train_one_hot = tf.keras.utils.to_categorical(X_train_reduced)



def build_embeddings_model(average_over_positions=False,
                           vocab_size=1000,
                           sequence_length=20,
                           embedding_dim=2):
  """Build and compile a binary classifier on top of a token-embedding layer.

  Args:
    average_over_positions: if True, the per-position embeddings are averaged
      into a single vector (GlobalAveragePooling1D); otherwise they are
      concatenated into one long vector (Flatten).
    vocab_size: number of distinct token ids the embedding layer can look up.
    sequence_length: number of token ids in each input example.
    embedding_dim: length of the vector learned for each token id.

  Returns:
    A compiled tf.keras.Sequential model whose first layer (model.layers[0])
    is the Embedding layer and whose output is a single sigmoid unit.
  """
  # Clear session and remove randomness.
  tf.keras.backend.clear_session()
  tf.random.set_seed(0)

  model = tf.keras.Sequential()
  # Declare the input shape explicitly so the model is built right away and
  # its weights exist before any training, whether or not the installed Keras
  # still honours the Embedding `input_length` argument.
  model.add(tf.keras.Input(shape=(sequence_length,)))
  model.add(tf.keras.layers.Embedding(
      input_dim=vocab_size,
      output_dim=embedding_dim)
  )

  if average_over_positions:
    # Average over the sequence-position axis: one embedding_dim-sized vector
    # per example.
    model.add(tf.keras.layers.GlobalAveragePooling1D())
  else:
    # Concatenate the per-position embeddings into one vector per example.
    model.add(tf.keras.layers.Flatten())
  model.add(tf.keras.layers.Dense(
      units=1,                     # output dim (for binary classification)
      activation='sigmoid'         # apply the sigmoid function!
  ))

  model.compile(loss='binary_crossentropy',
                optimizer='adam',
                metrics=['accuracy'])

  return model

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb_word_index.json


In [54]:
# Dimensions of the embedding table used in the cells below.
vocab_size = 100000    # number of token ids the layer can look up
embedding_size = 50    # length of each token's vector

model = build_embeddings_model(
    average_over_positions=True,
    vocab_size=vocab_size,
    sequence_length=20,
    embedding_dim=embedding_size,
)

# Take the first weight array of the model's first layer (the Embedding added
# in build_embeddings_model). No training happens in this cell, so these are
# the layer's initial weights, not learned embeddings.
embeddings = model.layers[0].get_weights()[0]

In [62]:
# Tokens for ids 1 through 99 (ids 1-3 are the special symbols, not words).
tokens = [reverse_index[token_id] for token_id in range(1, 100)]
print(tokens[10:20])
# Map the same ten tokens back to their ids to confirm the round trip.
print([index[token] for token in tokens[10:20]])

['in', 'it', 'i', 'this', 'that', 'was', 'as', 'for', 'with', 'movie']
[11, 12, 13, 14, 15, 16, 17, 18, 19, 20]


In [64]:
# Token id 5 is not "the fifth word": ids were shifted by 3 to make room for
# the special symbols, so id 5 corresponds to raw word-index value 2 (5 - 3).
print(reverse_index[5]) # the token stored under id 5
embeddings[5] # row 5 of the embedding matrix, i.e. the vector for token id 5

and


array([-0.02587608,  0.0494658 ,  0.02971274,  0.03185178,  0.01259113,
       -0.00535594,  0.03925725,  0.03887851,  0.04810688,  0.0481171 ,
        0.03332932, -0.0170391 , -0.04294174, -0.00588974, -0.01037066,
        0.036463  , -0.02343742,  0.02986176, -0.01945804, -0.00665743,
        0.02225571, -0.01584575, -0.01891565, -0.02592481, -0.0365484 ,
        0.02132643,  0.00816028,  0.02198832,  0.01219358,  0.0487073 ,
       -0.01504239, -0.01183355, -0.0479692 , -0.03535795, -0.03271524,
       -0.04155707, -0.0199154 , -0.03582253, -0.01198497, -0.04797846,
       -0.00686462, -0.02199868,  0.00723056,  0.02404376,  0.03393928,
        0.01577142, -0.01272587, -0.02415756,  0.0410876 ,  0.02247739],
      dtype=float32)

In [66]:
# Length of a single token's vector (should match embedding_size).
embeddings[5].shape[0]

50

In [67]:
from scipy import spatial

In [69]:
# Compare two tokens by the cosine similarity of their embedding rows.
# scipy reports cosine *distance*, so similarity = 1 - distance.
# NOTE(review): `embeddings` was read from the model right after it was built,
# so unless the model has been fit and the weights re-read, this compares
# initial weights and the value carries no semantic meaning.
word1, word2 = 'number', 'data'

embedding1, embedding2 = (embeddings[index[word]] for word in (word1, word2))
print(f"Cosine similarity: {1 - spatial.distance.cosine(embedding1, embedding2)}")

Cosine similarity: -0.04787294566631317
