In [1]:
import pandas as pd
import numpy as np
import torch
from datasets import Dataset
import faiss

from transformers import AutoTokenizer, AutoModel

model_ckpt = "sentence-transformers/multi-qa-mpnet-base-dot-v1"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
model = AutoModel.from_pretrained(model_ckpt)

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")


Loading the tokenizer from the `special_tokens_map.json` and the `added_tokens.json` will be removed in `transformers 5`,  it is kept for forward compatibility, but it is recommended to update your `tokenizer_config.json` by uploading it again. You will see the new `added_tokens_decoder` attribute that will store the relevant information.


In [38]:
class Embeddings:
    #CLS is a special classification token and the last hidden state of BERT Embedding
    def cls_pooling(self, model_output):
        return model_output.last_hidden_state[:, 0]

    #BERT tokenizer of input text
    def get_embeddings(self, text_list):
        encoded_input = tokenizer(
            text_list, padding=True, truncation=True, return_tensors="pt"
        )
        encoded_input = {k: v.to(device) for k, v in encoded_input.items()}
        model_output = model(**encoded_input)
        return self.cls_pooling(model_output).cpu().detach().numpy()
    
    
    #convert dataset into embeddings dataset to run FAISS
    def makeEmbeddings(self,dataset):
        embeddings = []
        for data in dataset:
            embeddings.append(self.get_embeddings(data)[0])
        return np.array(embeddings)
    
    def getQueryEmbedding(self, query):
        return self.get_embeddings([query])
    
class Faiss:
    def __init__(self):
        pass

    def faiss(self,xb):
        d = xb[0].size
        M = 32
        index = faiss.IndexHNSWFlat(d, M)            
        index.hnsw.efConstruction = 40         # Setting the value for efConstruction.
        index.hnsw.efSearch = 16               # Setting the value for efSearch.
        index.add(xb)
        return index
    
    def query(self,index,xq,k=3):
        D, I = index.search(xq, k)   
        return D, I

In [56]:
values = ["julia is super nice","julia loves pie","isabelle is happy"]
embeddings_dataset = Embeddings().makeEmbeddings(values)
xb = embeddings_dataset
xq = Embeddings().getQueryEmbedding("isa is happy")
index = Faiss().faiss(xb)
D,I = Faiss().query(index,xq)
I

array([[2, 0, 1]])

array([[ 0.05169334, -0.26670387, -0.39721736, ..., -0.05143736,
         0.14770423, -0.25271532],
       [-0.1921795 , -0.22556634, -0.46003044, ..., -0.12975267,
         0.05429002, -0.42701638],
       [ 0.10342234, -0.39489412, -0.43086833, ...,  0.19120945,
         0.15889496, -0.33146405]], dtype=float32)

In [22]:
xb = np.array([[ 0.2858461 ,  2.291011  ,  1.1027378 , -1.3654897 ,  0.50570506],
       [-0.43984768,  1.707121  , -0.33323628, -1.3266757 ,  0.0403869 ],
       [ 1.2801098 , -0.38431922, -0.89034826,  0.26452443,  0.01639184],
       [ 0.787185  ,  0.47547337, -0.2487884 ,  0.93552494,  0.02594745]])
index = Faiss().faiss(xb)
xq = np.array([[ 1.5905408 ,  0.75759834,  1.3138535 , -0.5175146 ,  0.2957387 ]])
D,I = Faiss().query(index,xq)
I


array([[0, 3, 2]])

In [34]:
from openai import OpenAI
from openai import OpenAI
import pandas as pd
import numpy as np

key = "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"
# Initialize OpenAI client (replace '...' with your API key)
client = OpenAI(api_key=key)


class OpenAIEmbeddings:
    def __init__(self,api_key):
        self.client = OpenAI(api_key=api_key)
        
    
    def get_embeddings(self, text_list):
        data = self.client.embeddings.create(input=text_list, model='text-embedding-ada-002').data
        embeddings = [embedding.embedding for embedding in data]
        return embeddings
    
    
    #convert dataset into embeddings dataset to run FAISS
    def makeEmbeddings(self,dataset):
        embeddings = self.get_embeddings(dataset)
        return np.array(embeddings)
    
    def getQueryEmbedding(self, query):
        return self.get_embeddings([query])
    
e = OpenAIEmbeddings(key)
e.makeEmbeddings(["hi"])

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


array([[-0.03503197, -0.02060164, -0.01537573, ..., -0.01162699,
        -0.00087646,  0.00465802]])

In [52]:
values = ["julia is super nice","julia loves pie","isabelle is happy", "hey baby"]
embeddings_dataset = OpenAIEmbeddings(key).makeEmbeddings(values)
xb = embeddings_dataset
xq = OpenAIEmbeddings(key).getQueryEmbedding("hey babe")
index = Faiss().faiss(xb)
D,I = Faiss().query(index,np.array(xq))
I

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


array([[3, 0, 2]])

array([[2, 0, 1]])

In [42]:
D,I = Faiss().query(index,xq)

AssertionError: 

In [43]:
xq.shape

(1, 768)