In [2]:
# pip install -U sentence-transformers

In [17]:
from transformers import AutoTokenizer, AutoModel
import torch

# Hugging Face checkpoint that BERTEmbeddings loads its tokenizer and weights from.
model_ckpt = "sentence-transformers/multi-qa-mpnet-base-dot-v1"

# Use the first CUDA device when torch reports one as available; otherwise use the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

class BERTEmbeddings:
    """Sentence embeddings taken from the first-token ([CLS]) hidden state.

    The tokenizer and model are loaded from the module-level ``model_ckpt``
    checkpoint, and the model is placed on the module-level ``device``.
    """

    def __init__(self):
        """Load the tokenizer and the model, and put the model in inference mode."""
        self.tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
        self.device = device
        # getBERTEmbeddings moves every input tensor to this device, so the
        # weights must live there too; otherwise the forward pass fails with a
        # device-mismatch error whenever CUDA is available.
        self.model = AutoModel.from_pretrained(model_ckpt).to(self.device)
        # Inference only: switch off dropout and other train-time behaviour.
        self.model.eval()

    def cls_pooling(self, model_output):
        """Return the last hidden state at sequence position 0 for every input.

        Args:
            model_output: Object exposing a ``last_hidden_state`` tensor that is
                indexable as ``[batch, position]``.

        Returns:
            The slice ``last_hidden_state[:, 0]``: one vector per batch row.
        """
        return model_output.last_hidden_state[:, 0]

    def getBERTEmbeddings(self, text_list):
        """Tokenize ``text_list`` and return one embedding per text.

        Args:
            text_list: Text(s) handed to the tokenizer; padded and truncated
                by the tokenizer so they can be batched.

        Returns:
            numpy array holding the CLS-pooled embeddings, copied to the CPU.
        """
        encoded_input = self.tokenizer(
            text_list, padding=True, truncation=True, return_tensors="pt"
        )
        encoded_input = {k: v.to(self.device) for k, v in encoded_input.items()}
        # No gradients are needed for embedding extraction; skipping autograd
        # avoids building a graph and keeps memory usage down.
        with torch.no_grad():
            model_output = self.model(**encoded_input)
        return self.cls_pooling(model_output).detach().cpu().numpy()
    
# Smoke-test the encoder on a literal sample. The corpus variable `data` is only
# defined in a later cell, so embedding it here raised NameError on a fresh
# kernel (Restart & Run All). `xb` is rebuilt from the real corpus further down.
train = BERTEmbeddings()
xb = train.getBERTEmbeddings(["How can I be a good geologist?"])

In [26]:
import pandas as pd
from sentence_transformers import SentenceTransformer
import numpy as np
from dotenv import load_dotenv
import os
from openai import OpenAI


class Embeddings(BERTEmbeddings):
    """Adds SentenceTransformer and OpenAI embedding backends to BERTEmbeddings."""

    def getSentenceTransformerEmbeddings(self, data, model_name='all-MiniLM-L6-v2'):
        """Encode ``data`` with a SentenceTransformer model.

        Args:
            data: Sentence(s) passed straight to ``SentenceTransformer.encode``.
            model_name: Name of the SentenceTransformer model to load.

        Returns:
            Whatever ``SentenceTransformer.encode`` returns for ``data``.
        """
        # Loading the model is expensive and this method is called once for the
        # corpus and again for the queries, so keep one loaded model per name
        # on the instance. The cache is created lazily because this class has no
        # __init__ of its own.
        cache = self.__dict__.setdefault("_sentence_models", {})
        if model_name not in cache:
            cache[model_name] = SentenceTransformer(model_name)
        return cache[model_name].encode(data)

    def getOpenAIEmbeddings(self, data, api_key, model='text-embedding-ada-002'):
        """Embed ``data`` through the OpenAI embeddings endpoint.

        Args:
            data: Input text(s) sent as the ``input`` of the embeddings request.
            api_key: OpenAI API key used to build the client.
            model: Name of the OpenAI embedding model.

        Returns:
            float32 numpy array with one row per item in the response.
        """
        # The client is stored on the instance, as before, so callers that
        # inspect `self.client` afterwards keep working.
        self.client = OpenAI(api_key=api_key)
        values = self.client.embeddings.create(input=data, model=model).data
        # The API yields Python floats, which numpy would store as float64.
        # float32 matches the other backends and is the dtype FAISS works with.
        return np.array([item.embedding for item in values], dtype=np.float32)
    



In [39]:
import faiss

class Faiss:
    """Small helper for building and querying a FAISS HNSW index."""

    def __init__(self):
        pass

    def faiss(self, xb, M=32, efConstruction=40, efSearch=16):
        """Build an ``IndexHNSWFlat`` over the row vectors in ``xb``.

        Args:
            xb: 2-D array-like of vectors, one row per item to index.
            M: Second argument to ``faiss.IndexHNSWFlat`` (HNSW connectivity).
            efConstruction: Value assigned to ``index.hnsw.efConstruction``.
            efSearch: Value assigned to ``index.hnsw.efSearch``.

        Returns:
            The populated index.

        Raises:
            ValueError: If ``xb`` is not 2-D or contains no rows.
        """
        # FAISS operates on C-contiguous float32 data; coercing here lets
        # float64 inputs (e.g. arrays built from Python floats) be indexed.
        xb = np.ascontiguousarray(xb, dtype=np.float32)
        if xb.ndim != 2 or xb.shape[0] == 0:
            raise ValueError(
                "xb must be a non-empty 2-D array of shape (n_vectors, dim); "
                f"got shape {xb.shape}"
            )
        d = xb.shape[1]
        index = faiss.IndexHNSWFlat(d, M)
        index.hnsw.efConstruction = efConstruction
        index.hnsw.efSearch = efSearch
        index.add(xb)
        return index

    def query(self, index, xq, k=3):
        """Search ``index`` for the ``k`` nearest neighbours of each query.

        Args:
            index: Object exposing ``search(xq, k)``.
            xq: Query vector(s); a single 1-D vector is treated as one query.
            k: Number of neighbours requested per query.

        Returns:
            The ``(D, I)`` pair returned by ``index.search``.
        """
        # Same dtype/layout coercion as for the corpus; atleast_2d promotes a
        # lone vector to a batch of one.
        xq = np.atleast_2d(np.ascontiguousarray(xq, dtype=np.float32))
        D, I = index.search(xq, k)
        return D, I
    
def similaritySearch(index, xq, k=2, corpus=None):
    """Return the corpus entries nearest to each query vector.

    Args:
        index: Index to search, passed through to ``Faiss().query``.
        xq: Query vector(s).
        k: Number of neighbours requested per query.
        corpus: Sequence that the returned labels index into. Defaults to the
            module-level ``data`` list, which was the original behaviour.

    Returns:
        One list per query containing the matched corpus entries, in the
        order the index returned them.
    """
    if corpus is None:
        corpus = data
    D, I = Faiss().query(index, xq, k=k)
    # FAISS fills unused result slots with the label -1 when it finds fewer
    # than k neighbours. Indexing a Python list with -1 would silently return
    # its last element, so negative labels are dropped instead.
    return [[corpus[hit] for hit in row if hit >= 0] for row in I]


In [40]:
# Load the Quora question pairs and keep only the rows with is_duplicate == 1.
quoraDataPath = "../../inputs/quora/train.csv"
quoraData = pd.read_csv(quoraDataPath)
quoraData = quoraData[quoraData['is_duplicate'] == 1]
data = list(quoraData.question1[:100])   # corpus that gets indexed
q = list(quoraData.question2[1:3])       # queries, taken from the question2 column

# Embedding backend for this run: "sentence-transformers", "openai" or "bert".
# This replaces the earlier `if True:` / `if False:` toggles, so the choice is
# named and an invalid value fails loudly.
EMBEDDING_BACKEND = "sentence-transformers"

model = Embeddings()
if EMBEDDING_BACKEND == "sentence-transformers":
    xb = model.getSentenceTransformerEmbeddings(data)
    xq = model.getSentenceTransformerEmbeddings(q)
    print("here")
elif EMBEDDING_BACKEND == "openai":
    load_dotenv()
    key = os.environ.get("OPENAI_KEY")
    if not key:
        # Fail here with a clear message rather than handing None to the client.
        raise RuntimeError("OPENAI_KEY is not set in the environment or .env file")
    xb = model.getOpenAIEmbeddings(data,key)
    xq = model.getOpenAIEmbeddings(q,key)
elif EMBEDDING_BACKEND == "bert":
    xb = model.getBERTEmbeddings(data)
    xq = model.getBERTEmbeddings(q)
else:
    raise ValueError(f"Unknown EMBEDDING_BACKEND: {EMBEDDING_BACKEND!r}")

# Index the corpus vectors, then display the nearest corpus questions per query.
index = Faiss().faiss(xb)
similaritySearch(index,xq)

here


[['How can I be a good geologist?',
  'What are some of the high salary income jobs in the field of biotechnology?'],
 ['How do I read and find my YouTube comments?',
  'How do you get deleted Instagram chats?']]

In [41]:
# Display the query questions so they can be read next to the neighbours returned above.
q

['What should I do to be a great geologist?',
 'How can I see all my Youtube comments?']