In [1]:
import pandas as pd
import numpy as np
import torch
from datasets import Dataset

from transformers import AutoTokenizer, AutoModel

model_ckpt = "sentence-transformers/multi-qa-mpnet-base-dot-v1"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
model = AutoModel.from_pretrained(model_ckpt)

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")


In [2]:
class Embedding:
    #CLS is a special classification token and the last hidden state of BERT Embedding
    def cls_pooling(self, model_output):
        return model_output.last_hidden_state[:, 0]

    #BERT tokenizer of input text
    def get_embeddings(self, text_list):
        encoded_input = tokenizer(
            text_list, padding=True, truncation=True, return_tensors="pt"
        )
        encoded_input = {k: v.to(device) for k, v in encoded_input.items()}
        model_output = model(**encoded_input)
        return self.cls_pooling(model_output).cpu().detach().numpy()

In [13]:


class Faiss:
    def __init__(self):
        pass
    
    #convert dataset into embeddings dataset to run FAISS
    def makeEmbeddings(self,dataset):
        embeddings_dataset = pd.DataFrame(
            {
                "embeddings":Embedding().get_embeddings(dataset).tolist(),
                "values":dataset
            })
        embeddings_dataset = Dataset.from_pandas(embeddings_dataset)
        return embeddings_dataset
    
    # run faiss model on dataset
    def faiss(self,embeddings_dataset):
        embeddings_dataset.add_faiss_index(column="embeddings")
    
    #get query embedding
    def getQueryEmbedding(self, query):
        return Embedding().get_embeddings([query])
        
    #predict 4 nearest neighbors
    def predict(self,query,embeddings_dataset,k=4):
        query_embedding = self.getQueryEmbedding(query)
        scores, samples = embeddings_dataset.get_nearest_examples("embeddings", query_embedding, k=5)
        samples = pd.DataFrame(samples)
        samples["scores"] = scores
        return samples
        

In [25]:
values = ["julia","vedha","isabelle"]
f = Faiss()
embeddings_dataset = f.makeEmbeddings(values)
f.faiss(embeddings_dataset)
f.predict("vedha",embeddings_dataset)

  0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,embeddings,values,scores
0,"[-0.15024018287658691, -0.44466474652290344, -...",vedha,5.551478e-11
1,"[0.046705227345228195, -0.22696268558502197, -...",julia,59.55457
2,"[0.15655478835105896, -0.3943764567375183, -0....",isabelle,64.21315


In [24]:
df = pd.read_csv("SEC-CompanyTicker.csv",index_col=0)
df = df.head(100)
dataset = df.companyName.tolist()
f = Faiss()
embeddings_dataset = f.makeEmbeddings(dataset)
f.faiss(embeddings_dataset)
f.predict("goldman",embeddings_dataset)

  0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,embeddings,values,scores
0,"[0.0548088364303112, 0.07416345179080963, -0.2...",Morgan Stanley,42.217228
1,"[0.17106111347675323, -0.053230009973049164, -...",Jpmorgan Chase & Co,44.135571
2,"[-0.14033766090869904, -0.257710337638855, -0....",Mastercard Inc,47.031788
3,"[0.1506667286157608, -0.266143798828125, -0.48...","Salesforce, Inc.",47.926239
4,"[-0.007676966488361359, -0.17204351723194122, ...",Wells Fargo & Company/Mn,48.350143
