# Deep Learning NLP With PyTorch

In [2]:
# Import 
import numpy as np 
import torch 
import torch.nn as nn 
import torch.nn.functional as F 

In [3]:
# Versions: print the versions of the imported packages via the `watermark` IPython extension
%reload_ext watermark
%watermark -a "Data'snow" --iversions

autopep8 1.4.4
numpy    1.18.1
torch    1.5.1
json     2.0.9
Data'snow


## What are Sentences and Tokens

In [4]:
# Example: tokenize a sentence by splitting it on whitespace

sentence = "a veloz raposa marrom saltou sobre o cachorro preguiçoso".split()
sentence

['a',
 'veloz',
 'raposa',
 'marrom',
 'saltou',
 'sobre',
 'o',
 'cachorro',
 'preguiçoso']

In [9]:
# Build the vocabulary: map every distinct token of the sentence to an integer id

vocab = {}

# Ids are assigned in order of first appearance; a repeated token keeps its first id
for token in sentence:
    vocab.setdefault(token, len(vocab))

vocab

{'a': 0,
 'veloz': 1,
 'raposa': 2,
 'marrom': 3,
 'saltou': 4,
 'sobre': 5,
 'o': 6,
 'cachorro': 7,
 'preguiçoso': 8}

In [12]:
# Encode the sentence as the sequence of vocabulary ids of its tokens
sentence_transformed = [vocab.get(token) for token in sentence]
sentence_transformed

[0, 1, 2, 3, 4, 5, 6, 7, 8]

## One Hot Encoding

In [20]:
# Vocabulary size (number of distinct tokens)
len(vocab)

9

In [21]:
# Creating One Hot Encoding: row i of the identity matrix is the one-hot vector of token id i.
# The builtin `int` is used as dtype because the `np.int` alias was deprecated in
# NumPy 1.20 and removed in NumPy 1.24.
one_hot = np.eye(len(vocab), dtype=int)
one_hot

array([[1, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 1, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 1, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 1, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 1, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 1, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 1, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 1, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 1]])

In [23]:
# Look up the ids assigned to two tokens
print(vocab['a'], vocab['raposa'])

0 2


In [26]:
# One-hot encoding of the token "a": its vocabulary id selects a row of `one_hot`
token_index = vocab['a']
print("\nO Indice do Token a no vocabulario:", token_index)
print("O Vetor One-Hot Encoding:", one_hot[token_index])


O Indice do Token a no vocabulario: 0
O Vetor One-Hot Encoding: [1 0 0 0 0 0 0 0 0]


In [27]:
# One-hot encoding of the token "raposa": its vocabulary id selects a row of `one_hot`
token_index = vocab['raposa']
print("\nO Indice do Token raposa no vocabulario:", token_index)
print("O Vetor One-Hot Encoding:", one_hot[token_index])


O Indice do Token raposa no vocabulario: 2
O Vetor One-Hot Encoding: [0 0 1 0 0 0 0 0 0]


According to the Statistical Semantics Hypothesis, we can turn the meaning of a word into a numerical vector. Words with similar vectors probably have similar meanings, which means that the distance between their two vectors will be small.

We can measure how close two vectors are using cosine similarity.

In [28]:
# Function to measure the similarity between two vectors
def cos_similarity(x, y):
    """Return the cosine similarity of two vectors, rounded to 5 decimals.

    Parameters
    ----------
    x, y : array_like
        Vectors of the same length.

    Returns
    -------
    numpy.float64
        dot(x, y) / (||x|| * ||y||), rounded to 5 decimal places.
        If either vector has zero norm the ratio is undefined (0/0); in that
        case 0.0 is returned instead of propagating NaN with a RuntimeWarning.
    """
    norm_product = np.linalg.norm(x) * np.linalg.norm(y)
    if norm_product == 0:
        # A zero vector has no direction, so treat it as "no similarity"
        return np.float64(0.0)
    return (np.dot(x, y) / norm_product).round(5)

In [30]:
# Tokens of the sentence, in order
print(sentence)

['a', 'veloz', 'raposa', 'marrom', 'saltou', 'sobre', 'o', 'cachorro', 'preguiçoso']


In [31]:
# One-hot matrix: row i is the one-hot vector of the token whose vocabulary id is i
print(one_hot)

[[1 0 0 0 0 0 0 0 0]
 [0 1 0 0 0 0 0 0 0]
 [0 0 1 0 0 0 0 0 0]
 [0 0 0 1 0 0 0 0 0]
 [0 0 0 0 1 0 0 0 0]
 [0 0 0 0 0 1 0 0 0]
 [0 0 0 0 0 0 1 0 0]
 [0 0 0 0 0 0 0 1 0]
 [0 0 0 0 0 0 0 0 1]]


In [33]:
# Select the one-hot rows of the two words to compare

palavra1, palavra2 = one_hot[vocab['raposa']], one_hot[vocab['cachorro']]

print(palavra1, palavra2)

[0 0 1 0 0 0 0 0 0] [0 0 0 0 0 0 0 1 0]


In [34]:
# Cosine similarity of the two one-hot vectors
cos_similarity(palavra1, palavra2)

0.0

As expected, the similarity is zero. Because the vectors used are sparse one-hot vectors, we need to transform them into dense vectors, using:

$$w_i = t_i \cdot W , \quad where\ t_i = [0, \cdots, \underset{i\text{-}th\ index}{1}, \cdots, 0],\ len(t_i)=\vert V \vert$$

In [35]:
# Example

# Dimensionality of the dense vector space (length of each token vector)
dim = 5

# Weight matrix W of shape (len(vocab), dim): one row of random initial values per token.
# A seeded generator is used so the values (and the similarity computed from them)
# are reproducible on every Restart & Run All.
rng = np.random.default_rng(42)
W = rng.random((len(vocab), dim)).round(3)

In [36]:
# Show the initial weight matrix: one row per token, `dim` columns
print("\nVector Space to all tokens, initial values of weights")
print(W)


Vector Space to all tokens, initial values of weights
[[0.173 0.565 0.755 0.56  0.572]
 [0.037 0.409 0.967 0.994 0.685]
 [0.068 0.875 0.22  0.391 0.498]
 [0.597 0.844 0.717 0.868 0.934]
 [0.524 0.909 0.417 0.332 0.331]
 [0.572 0.008 0.341 0.587 0.677]
 [0.517 0.348 0.029 0.391 0.292]
 [0.6   0.389 0.342 0.834 0.439]
 [0.693 0.024 0.439 0.471 0.467]]


In [38]:
# Turn the one-hot vectors into dense vectors (tokens: raposa and cachorro).
# Multiplying a one-hot vector by W selects the matching row of W.

resultado1 = palavra1 @ W
print(resultado1)

resultado2 = palavra2 @ W
print(resultado2)

[0.068 0.875 0.22  0.391 0.498]
[0.6   0.389 0.342 0.834 0.439]


In [40]:
# Cosine similarity between the dense vectors of the two tokens

similarity1 = cos_similarity(resultado1, resultado2)
print("\nThe similarity between raposa and cachorro:", similarity1)


The similarity between raposa and cachorro: 0.73615


## Using PyTorch to Calculate Similarity: Embedding

In [41]:
# Get the vocabulary indices of the words we want to compare: the position of the
# single 1 in each one-hot vector. Cast to plain `int` so the list prints as [2, 7]
# on every NumPy version (NumPy 2 shows scalars as `np.int64(2)`).
idxes = [int(np.argmax(v)) for v in [palavra1, palavra2]]
print('\nÍndices das palavras "raposa" e "cachorro" é {}'.format(idxes))


Índices das palavras "raposa" e "cachorro" é [2, 7]


In [45]:
# Build an embedding layer initialised with W through the public `from_pretrained`
# constructor rather than the private `_weight` keyword. freeze=False keeps the
# weights trainable (requires_grad=True), as they are with the plain constructor.
embed_layer = nn.Embedding.from_pretrained(
    torch.tensor(W, dtype=torch.float32), freeze=False
)
# Look up the embedding rows of the selected token indices
idxes_tensor = torch.tensor(idxes, dtype=torch.long)
embeded = embed_layer(idxes_tensor)

In [46]:
# Weight values of the embedding layer (initialised from W)
print(embed_layer.weight)

Parameter containing:
tensor([[0.1730, 0.5650, 0.7550, 0.5600, 0.5720],
        [0.0370, 0.4090, 0.9670, 0.9940, 0.6850],
        [0.0680, 0.8750, 0.2200, 0.3910, 0.4980],
        [0.5970, 0.8440, 0.7170, 0.8680, 0.9340],
        [0.5240, 0.9090, 0.4170, 0.3320, 0.3310],
        [0.5720, 0.0080, 0.3410, 0.5870, 0.6770],
        [0.5170, 0.3480, 0.0290, 0.3910, 0.2920],
        [0.6000, 0.3890, 0.3420, 0.8340, 0.4390],
        [0.6930, 0.0240, 0.4390, 0.4710, 0.4670]], requires_grad=True)


In [47]:
# Embedded vectors for 'raposa' and 'cachorro' (the layer output for `idxes_tensor`)
print(embeded)

tensor([[0.0680, 0.8750, 0.2200, 0.3910, 0.4980],
        [0.6000, 0.3890, 0.3420, 0.8340, 0.4390]], grad_fn=<EmbeddingBackward>)


In [49]:
# Fetch the dense vectors of 'raposa' and 'cachorro' as rows of the embedding weights
palavra1, palavra2 = (
    embed_layer.weight[vocab.get(word)] for word in ('raposa', 'cachorro')
)

In [52]:
# Calculate the cosine similarity with PyTorch; dim=0 because both inputs are 1-D vectors
similarity2 = F.cosine_similarity(palavra1, palavra2, dim=0)
print(similarity2)

tensor(0.7361, grad_fn=<DivBackward0>)


## Comparing Both Methods

In [53]:
# Print the NumPy result next to the PyTorch result
print(similarity1, similarity2)

0.73615 tensor(0.7361, grad_fn=<DivBackward0>)
