In [None]:
!pip3 install transformers

In [74]:
!pip3 install faiss-cpu

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Collecting faiss-cpu
  Downloading faiss_cpu-1.7.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.0 MB)
[K     |████████████████████████████████| 17.0 MB 7.7 MB/s eta 0:00:01
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.7.3


In [56]:
!pip3 install -U scikit-learn scipy matplotlib

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Collecting scikit-learn
  Downloading scikit_learn-1.2.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (9.8 MB)
[K     |████████████████████████████████| 9.8 MB 5.1 MB/s eta 0:00:01
[?25hCollecting scipy
  Downloading scipy-1.10.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (34.5 MB)
[K     |████████████████████████████████| 34.5 MB 476 kB/s eta 0:00:01
[?25hCollecting matplotlib
  Downloading matplotlib-3.6.3-cp38-cp38-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (9.4 MB)
[K     |████████████████████████████████| 9.4 MB 4.7 MB/s eta 0:00:01
[?25hCollecting threadpoolctl>=2.0.0
  Downloading threadpoolctl-3.1.0-py3-none-any.whl (14 kB)
Collecting joblib>=1.1.1
  Downloading jo

In [1]:
!pip3 install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cpu

Looking in indexes: https://pypi.org/simple, https://download.pytorch.org/whl/cpu


In [1]:
from transformers import AutoTokenizer, AutoModel
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Model id shared by the tokenizer and the encoder -- change the model here, in one place,
# so the two can never get out of sync.
MODEL_NAME = 'sentence-transformers/all-distilroberta-v1'

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)

In [3]:
# Toy corpus: the four sentences that are embedded below and later added to the FAISS index.
sentences = [
    "Three years later, the coffin was still full of Jello.",
    "The fish dreamed of escaping the fishbowl and into the toilet where he saw his friend go.",
    "The person box was packed with jelly many dozens of months later.",
    "He found a leprechaun in his walnut shell."
]

In [4]:
# Accumulator for the encodings: each key maps to a list holding one 1-D tensor per sentence.
tokens = {key: [] for key in ('input_ids', 'attention_mask')}

for sentence in sentences:
    # Truncate/pad every sentence to exactly 512 positions so the rows can be stacked afterwards.
    new_tokens = tokenizer.encode_plus(
        sentence,
        max_length=512,
        truncation=True,
        padding='max_length',
        return_tensors='pt',
    )
    for key in tokens:
        # [0] drops the leading batch dimension added by return_tensors='pt'
        tokens[key].append(new_tokens[key][0])

In [5]:
# Reformat each list of per-sentence tensors into a single stacked tensor.
# Entries that are already tensors are skipped, so running this cell a second
# time is a no-op instead of an error from torch.stack (which only accepts a
# sequence of tensors).
for key in ('input_ids', 'attention_mask'):
    if not torch.is_tensor(tokens[key]):
        tokens[key] = torch.stack(tokens[key])

In [8]:
# Inference only: run the encoder with gradient tracking switched off.
with torch.no_grad():
    outputs = model(
        input_ids=tokens['input_ids'],
        attention_mask=tokens['attention_mask'],
    )
outputs.keys()

odict_keys(['last_hidden_state', 'pooler_output'])

In [9]:
# Per-token encoder outputs; the shape displayed below is (sentences, token positions, hidden size).
embeddings = outputs.last_hidden_state
embeddings.shape

torch.Size([4, 512, 768])

In [10]:
# Display the raw per-token embeddings (one matrix per sentence).
embeddings

tensor([[[-0.1716, -0.5348,  0.1297,  ..., -0.2978, -0.8623,  0.2598],
         [-0.8746,  0.4587, -0.1806,  ..., -0.4621, -0.9410,  0.3315],
         [-0.7475,  0.2020, -0.1778,  ..., -0.2076, -0.7047,  0.1703],
         ...,
         [ 0.2735, -0.1972,  0.0317,  ..., -0.1275, -0.5828,  0.0128],
         [ 0.2735, -0.1972,  0.0317,  ..., -0.1275, -0.5828,  0.0128],
         [ 0.2735, -0.1972,  0.0317,  ..., -0.1275, -0.5828,  0.0128]],

        [[ 0.3531, -1.0245,  0.1890,  ..., -1.0193,  0.1151,  0.1824],
         [ 0.3335, -0.9922, -0.2278,  ..., -1.1686,  0.0174,  0.3293],
         [ 0.9329, -0.3477,  0.6729,  ..., -1.2724,  0.1017,  0.6909],
         ...,
         [ 0.3406, -0.8891,  0.1748,  ..., -1.0470,  0.0745,  0.1729],
         [ 0.3406, -0.8891,  0.1748,  ..., -1.0470,  0.0745,  0.1729],
         [ 0.3406, -0.8891,  0.1748,  ..., -1.0470,  0.0745,  0.1729]],

        [[-0.0592, -0.3647,  0.0537,  ..., -0.6969, -1.0904,  0.3171],
         [-0.2030, -0.5321,  0.1347,  ..., -0

After producing the dense per-token embeddings, we need to perform a mean pooling operation to create a single vector encoding (the sentence embedding). To do this, we multiply each value in our embeddings tensor by its respective attention_mask value, so that non-real (padding) tokens are ignored.

In [11]:
# resize our attention_mask tensor:
attention_mask = tokens['attention_mask']
attention_mask.shape

torch.Size([4, 512])

In [12]:
# Add a trailing axis to the mask and broadcast it to the shape of `embeddings`,
# casting to float so it can be multiplied with the embeddings element-wise.
mask = attention_mask[..., None].expand_as(embeddings).float()
mask.shape

torch.Size([4, 512, 768])

Each vector above is a single token's attention mask value repeated 768 times, so each token now has a vector of size 768 representing its attention_mask status. Then we multiply the two tensors to apply the attention mask:

In [13]:
# Element-wise product: positions whose mask value is 0 contribute nothing from here on.
masked_embeddings = torch.mul(embeddings, mask)
masked_embeddings.shape

torch.Size([4, 512, 768])

"Mean Pooling" starts

In [14]:
# Sum the masked embeddings over dim 1 (the 512 token positions) so that each sentence
# is reduced to a single vector.
summed = masked_embeddings.sum(dim=1)
summed.shape

torch.Size([4, 768])

We count only the positions that should receive attention,
then divide the summed embeddings by that count to get the mean.

In [15]:
# Count the attended positions per sentence (the count is repeated across the hidden dimension).
# clamp() floors the count at a tiny positive value so an all-zero mask cannot cause a
# division by zero in the next step.
summed_mask = mask.sum(dim=1).clamp(min=1e-9)
summed_mask.shape

torch.Size([4, 768])

Finally, we calculate the mean as the sum of the embedding activations (`summed`) divided by the number of values that should be given attention in each position (`summed_mask`):

In [16]:
# Mean over attended tokens = masked sum / number of attended tokens.
mean_pooled = torch.div(summed, summed_mask)

`mean_pooled` is the final "dense representation" of the sentences. Note that `mean_pooled` holds the representations of all sentences together, one row per sentence.

In [17]:
# Display the sentence embeddings: one row per sentence.
mean_pooled

tensor([[-0.2456, -0.3451,  0.1488,  ..., -0.2959, -0.8097,  0.2824],
        [ 0.2976, -0.9414,  0.0986,  ..., -1.0383,  0.0247,  0.2000],
        [-0.1320, -0.3896,  0.1188,  ..., -0.6403, -0.8874,  0.2990],
        [-0.3522, -0.9329, -0.0272,  ...,  0.1638, -0.9243,  0.3892]])

## Cosine Similarity

In [19]:
def convert_to_embedding(query):
    """Embed text with the notebook-level `tokenizer` and `model` via masked mean pooling.

    Args:
        query: A single sentence (str), or a non-empty sequence of sentences.

    Returns:
        For a str, a 1-D tensor holding that sentence's embedding. For a
        sequence of n sentences, a 2-D tensor with one embedding per row.

    Raises:
        ValueError: If an empty sequence is passed.
    """
    is_single = isinstance(query, str)
    batch = [query] if is_single else list(query)
    if not batch:
        raise ValueError("query must contain at least one sentence")

    # Truncate/pad every sentence to 512 positions; the tokenizer already returns
    # batched (n, 512) tensors, so no manual list-and-stack step is needed.
    encoded = tokenizer(batch, max_length=512,
                        truncation=True, padding='max_length',
                        return_tensors='pt')
    input_ids = encoded['input_ids']
    attention_mask = encoded['attention_mask']

    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
    token_embeddings = outputs.last_hidden_state

    # Zero out the masked positions, then average over the attended tokens only.
    mask = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    summed = torch.sum(token_embeddings * mask, 1)
    # clamp guards against a division by zero when a row has no attended tokens
    token_counts = torch.clamp(mask.sum(1), min=1e-9)
    mean_pooled = summed / token_counts

    # Keep the original contract: a lone sentence yields a 1-D vector.
    return mean_pooled[0] if is_single else mean_pooled
    

In [20]:
from sklearn.metrics.pairwise import cosine_similarity

In [21]:
# Embed a short test query; convert_to_embedding returns a single 1-D vector for it.
query = "Nemo is a fish"
query_embedding = convert_to_embedding(query)

In [25]:
# Corpus embeddings: one row per sentence.
mean_pooled.shape

torch.Size([4, 768])

In [26]:
# Cosine similarity between the query vector and every corpus embedding
# (dim=1 is the module's default, spelled out here for clarity).
cos = torch.nn.CosineSimilarity(dim=1)
sim = cos(query_embedding, mean_pooled)
sim

tensor([0.1189, 0.2486, 0.1139, 0.1814])

# FAISS

In [27]:
import faiss                   # make faiss available

# Exact inner-product index, sized from the embeddings rather than a hard-coded 768.
index = faiss.IndexFlatIP(mean_pooled.shape[1])   # build the index
print(index.is_trained)
# FAISS's Python API is documented for float32 NumPy arrays, so convert the tensor explicitly.
# Note: the vectors are not L2-normalised, so the scores are raw inner products,
# not cosine similarities.
index.add(mean_pooled.detach().cpu().numpy())                  # add vectors to the index
print(index.ntotal)

True
4


In [31]:
# Shape of the vectors that were added to the index.
mean_pooled.shape

torch.Size([4, 768])

In [32]:
# The query is a single 1-D vector; it is given a leading axis before searching.
query_embedding.shape

torch.Size([768])

In [35]:
# The search call takes a 2-D batch of queries: add a leading axis for our single query
# and hand FAISS a NumPy array instead of a torch tensor.
D, I = index.search(query_embedding[None, :].detach().cpu().numpy(), 1)  # top-1 match among the 4 indexed sentences

In [36]:
# Inner-product score of the best match.
D

array([[35.04268]], dtype=float32)

In [37]:
# Position (row in the index) of the best match.
I

array([[1]])

In [41]:
# Persist the index to a file in the current working directory.
faiss.write_index(index,"sample_code.index")

In [42]:
# Load the index back from disk to check the round trip.
index_loaded = faiss.read_index("sample_code.index")

In [43]:
# Query the reloaded index for all 4 sentences; pass a 2-D NumPy array rather than a torch tensor.
D, I = index_loaded.search(query_embedding[None, :].detach().cpu().numpy(), 4)

In [44]:
# Inner-product scores for the 4 results, best first.
D

array([[35.04268 , 26.346306, 17.326878, 14.138208]], dtype=float32)

In [45]:
# Index positions of the sentences, in the same order as the scores in D.
I

array([[1, 3, 0, 2]])