# Embeddings

An embedding maps discrete, categorical values to a continous space. Major advances in NLP applications have come from these continuous representations of words.


# 1)- Importing key Modules

In [0]:
#support both Python 2 and Python 3 with minimal overhead.
from __future__ import absolute_import, division, print_function

# I am an engineer. I care only about error not warning. So, let's be maverick and ignore warnings.
import warnings
warnings.filterwarnings('ignore')

In [0]:
# For data processing and maths
import numpy as np
import pandas as pd
import time
import tqdm
#For Visuals
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
from matplotlib import rcParams
rcParams['figure.figsize'] = 11, 8
%config InlineBackend.figure_format = 'svg'
%matplotlib inline

In [0]:
# For pyTorch
import torch
from torch.nn.functional import one_hot
import torch.autograd as autograd
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

In [4]:
! pip install version_information



In [5]:
# first install: pip install version_information
%reload_ext version_information
%version_information pandas,torch,numpy,seaborn, matplotlib

Software,Version
Python,3.6.8 64bit [GCC 8.0.1 20180414 (experimental) [trunk revision 259383]
IPython,5.5.0
OS,Linux 4.14.137+ x86_64 with Ubuntu 18.04 bionic
pandas,0.24.2
torch,1.3.0+cu100
numpy,1.16.5
seaborn,0.9.0
matplotlib,3.0.3
Mon Oct 21 10:59:51 2019 UTC,Mon Oct 21 10:59:51 2019 UTC


# 2)- One-Hot Encoding

example 1

In [6]:
sentence = "the quick brown fox jumped over the lazy dog"
words = sentence.split(' ')
print(words)

['the', 'quick', 'brown', 'fox', 'jumped', 'over', 'the', 'lazy', 'dog']


In [7]:
vocab1 = list(set(words))
print(vocab1)

['lazy', 'over', 'brown', 'fox', 'jumped', 'the', 'quick', 'dog']


In [8]:
# Number of words in our vocabulary
len(vocab1)

8

In [9]:
# Convert words to indexes
word_to_ix1 = {word: i for i, word in enumerate(vocab1)}
print(word_to_ix1)

{'lazy': 0, 'over': 1, 'brown': 2, 'fox': 3, 'jumped': 4, 'the': 5, 'quick': 6, 'dog': 7}


In [10]:
type(word_to_ix1)

dict

In [11]:
word_to_ix1['over']

1

In [12]:
word_to_ix1['dog']

7

In [13]:
from torch.nn.functional import one_hot

words = torch.tensor([word_to_ix1[w] for w in vocab1], dtype=torch.long)

one_hot_encoding = one_hot(words)
print(vocab1)
print(one_hot_encoding)

['lazy', 'over', 'brown', 'fox', 'jumped', 'the', 'quick', 'dog']
tensor([[1, 0, 0, 0, 0, 0, 0, 0],
        [0, 1, 0, 0, 0, 0, 0, 0],
        [0, 0, 1, 0, 0, 0, 0, 0],
        [0, 0, 0, 1, 0, 0, 0, 0],
        [0, 0, 0, 0, 1, 0, 0, 0],
        [0, 0, 0, 0, 0, 1, 0, 0],
        [0, 0, 0, 0, 0, 0, 1, 0],
        [0, 0, 0, 0, 0, 0, 0, 1]])


There are alot of zeros and if we have 10000 words then our voacb will be high. Suppose 1000 and then we will have 1000 vectors for one-hot encoding. It is not a wise approach. So, we do vectorize our word columns using word embedding

# 3)-Word Embedding

Here we will predict next word having first two words in sequence

- Predict the probability of a word based on the words around it

In [0]:
# Context is the number of words we are using as a context for the next word we want to predict
CONTEXT_SIZE = 2

# Embedding dimension is the size of the embedding vector
EMBEDDING_DIM = 10

# Size of the hidden layer
HIDDEN_DIM = 256

In [0]:
test_sentence = """EV battery pack with battery cooling assembly and method:
An electric vehicle battery pack includes an array of battery cells each cell having an upper cell surface and a lower cell surface, the lower cell surface having a positive and a negative terminal; 
and a thermal assembly in thermally-conductive contact with the upper cell surfaces of the array. A battery pack cooling method is also disclosed.
"""

In [0]:
test_sentence=test_sentence.lower().split()

In [17]:
test_sentence[:5]

['ev', 'battery', 'pack', 'with', 'battery']

In [18]:
# Build a list of tuples.  Each tuple is ([ word_i-2, word_i-1 ], target word)
trigrams = [([test_sentence[i], test_sentence[i + 1]], test_sentence[i + 2])
            for i in range(len(test_sentence) - 2)]
# print the first 3, just so you can see what they look like
print(trigrams[:5])

[(['ev', 'battery'], 'pack'), (['battery', 'pack'], 'with'), (['pack', 'with'], 'battery'), (['with', 'battery'], 'cooling'), (['battery', 'cooling'], 'assembly')]


In [0]:
vocab2 = list(set(test_sentence))
word_to_ix2 = {word: i for i, word in enumerate(vocab2)}

In [20]:
len(vocab2)

37

In [21]:
word_to_ix2

{'a': 11,
 'also': 13,
 'an': 27,
 'and': 25,
 'array': 35,
 'array.': 6,
 'assembly': 14,
 'battery': 23,
 'cell': 18,
 'cells': 34,
 'contact': 15,
 'cooling': 30,
 'disclosed.': 1,
 'each': 29,
 'electric': 8,
 'ev': 24,
 'having': 7,
 'in': 10,
 'includes': 2,
 'is': 33,
 'lower': 12,
 'method': 28,
 'method:': 9,
 'negative': 31,
 'of': 21,
 'pack': 36,
 'positive': 32,
 'surface': 26,
 'surface,': 0,
 'surfaces': 16,
 'terminal;': 17,
 'the': 19,
 'thermal': 4,
 'thermally-conductive': 20,
 'upper': 22,
 'vehicle': 5,
 'with': 3}

In [22]:
word_to_ix2['method']

28

# 4)-Training

In [0]:
class NGramLanguageModeler(nn.Module):

    def __init__(self, vocab_size, embedding_dim, context_size):
        super(NGramLanguageModeler, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.linear1 = nn.Linear(context_size * embedding_dim, HIDDEN_DIM)
        self.linear2 = nn.Linear(HIDDEN_DIM, vocab_size)

    def forward(self, inputs):
        embeds = self.embeddings(inputs).view((1, -1))
        out = F.relu(self.linear1(embeds))
        out = self.linear2(out)
        log_probs = F.log_softmax(out, dim=1)
        return log_probs

In [0]:
learning_rate = 0.001
losses = []
loss_function = nn.NLLLoss()  # negative log likelihood
model = NGramLanguageModeler(len(vocab2), EMBEDDING_DIM, CONTEXT_SIZE)
optimizer = optim.SGD(model.parameters(), lr=learning_rate)

**process with steps**

https://github.com/PythonWorkshop/intro-to-nlp-with-pytorch/blob/master/Word%20Embeddings/Word%20Embeddings.ipynb

In [25]:
from tqdm import tqdm

for epoch in range(25):
    total_loss = 0

    iterator = tqdm(trigrams)
    for context, target in iterator:
        # (['When', 'forty'], 'winters')
        # Step 1. Prepare the inputs to be passed to the model (i.e, turn the words
        # into integer indices and wrap them in tensors)
        context_idxs = torch.tensor([word_to_ix2[w] for w in context], dtype=torch.long)

        # Step 2. Recall that torch *accumulates* gradients. Before passing in a
        # new instance, you need to zero out the gradients from the old
        # instance
        model.zero_grad()

        # Step 3. Run the forward pass, getting log probabilities over next
        # words
        log_probs = model(context_idxs)

        # Step 4. Compute your loss function. (Again, Torch wants the target
        # word wrapped in a tensor)
        loss = loss_function(log_probs, torch.tensor([word_to_ix2[target]], dtype=torch.long))

        # Step 5. Do the backward pass and update the gradient
        loss.backward()
        optimizer.step()

        # Get the Python number from a 1-element Tensor by calling tensor.item()
        total_loss += loss.item()
        iterator.set_postfix(loss=float(loss))
    losses.append(total_loss)
    # add progress bar with epochs

100%|██████████| 64/64 [00:00<00:00, 282.90it/s, loss=4.07]
100%|██████████| 64/64 [00:00<00:00, 286.89it/s, loss=4.03]
100%|██████████| 64/64 [00:00<00:00, 317.07it/s, loss=4]
100%|██████████| 64/64 [00:00<00:00, 317.24it/s, loss=3.96]
100%|██████████| 64/64 [00:00<00:00, 309.69it/s, loss=3.92]
100%|██████████| 64/64 [00:00<00:00, 316.05it/s, loss=3.88]
100%|██████████| 64/64 [00:00<00:00, 331.95it/s, loss=3.84]
100%|██████████| 64/64 [00:00<00:00, 320.33it/s, loss=3.81]
100%|██████████| 64/64 [00:00<00:00, 317.03it/s, loss=3.77]
100%|██████████| 64/64 [00:00<00:00, 330.28it/s, loss=3.73]
100%|██████████| 64/64 [00:00<00:00, 316.57it/s, loss=3.69]
100%|██████████| 64/64 [00:00<00:00, 287.85it/s, loss=3.66]
100%|██████████| 64/64 [00:00<00:00, 307.46it/s, loss=3.62]
100%|██████████| 64/64 [00:00<00:00, 315.98it/s, loss=3.58]
100%|██████████| 64/64 [00:00<00:00, 324.03it/s, loss=3.54]
100%|██████████| 64/64 [00:00<00:00, 323.50it/s, loss=3.51]
100%|██████████| 64/64 [00:00<00:00, 334.45

**Check the structure of our model**

In [26]:
model.eval()

NGramLanguageModeler(
  (embeddings): Embedding(37, 10)
  (linear1): Linear(in_features=20, out_features=256, bias=True)
  (linear2): Linear(in_features=256, out_features=37, bias=True)
)

**Finally checking output**

In [27]:
import numpy

with torch.no_grad():
    context = ['ev', 'battery']
    context_idxs = torch.tensor([word_to_ix2[w] for w in context], dtype=torch.long)
    pred = model(context_idxs)
    print(pred)
    index_of_prediction = numpy.argmax(pred)
    print(vocab2[index_of_prediction])

tensor([[-3.8903, -3.8992, -4.0499, -3.5640, -4.1671, -3.5255, -3.6053, -3.6632,
         -3.9569, -4.0884, -3.8815, -3.1810, -3.8464, -4.0280, -3.6784, -3.7553,
         -3.5279, -4.3367, -3.3683, -3.3198, -3.8685, -3.3507, -3.6641, -3.3333,
         -4.3641, -2.7671, -3.6448, -3.7213, -4.0266, -3.7237, -3.2755, -4.0852,
         -3.9153, -3.7027, -3.9384, -3.7946, -2.4018]])
pack


In [28]:
with torch.no_grad():
    context = ['battery', 'cooling']
    context_idxs = torch.tensor([word_to_ix2[w] for w in context], dtype=torch.long)
    pred = model(context_idxs)
    print(pred)
    index_of_prediction = numpy.argmax(pred)
    print(vocab2[index_of_prediction])

tensor([[-3.6538, -4.0866, -3.4565, -3.1560, -3.8800, -3.9365, -3.7660, -4.1569,
         -3.9080, -4.2016, -4.0053, -3.0585, -3.6159, -3.9280, -3.3109, -3.7641,
         -3.9187, -4.0829, -2.6884, -3.3106, -4.1795, -3.6176, -3.6277, -2.7057,
         -4.5562, -3.1354, -3.5569, -3.3007, -3.6466, -3.8102, -3.4049, -4.2331,
         -4.0830, -4.1373, -4.0375, -3.6279, -3.5538]])
cell
