In [26]:
import numpy as np
# Synthetic benchmark data for the FAISS experiments below.
d = 64          # dimensionality of every vector
nb = 100_000    # number of database vectors
nq = 10_000     # number of query vectors

# Seed the legacy global RNG so the sampled values are identical on every run.
np.random.seed(1234)

# Database vectors: uniform samples stored as float32.
xb = np.random.random(size=(nb, d)).astype(np.float32)
# Add a ramp (row index / 1000) to the first coordinate of each row.
xb[:, 0] += np.arange(nb) / 1000.0

# Query vectors: built the same way, drawn after the database vectors.
xq = np.random.random(size=(nq, d)).astype(np.float32)
xq[:, 0] += np.arange(nq) / 1000.0

In [27]:
xb

array([[1.91519454e-01, 6.22108757e-01, 4.37727749e-01, ...,
        6.24916732e-01, 4.78093803e-01, 1.95675179e-01],
       [3.83317441e-01, 5.38736843e-02, 4.51648414e-01, ...,
        1.51395261e-01, 3.35174650e-01, 6.57551765e-01],
       [7.53425434e-02, 5.50063960e-02, 3.23194802e-01, ...,
        3.44416976e-01, 6.40880406e-01, 1.26205325e-01],
       ...,
       [1.00811470e+02, 5.90245306e-01, 7.98893511e-01, ...,
        3.39859009e-01, 3.01949501e-01, 8.53854537e-01],
       [1.00669464e+02, 9.16068792e-01, 9.55078781e-01, ...,
        5.95364332e-01, 3.84918079e-02, 1.05637990e-01],
       [1.00855637e+02, 5.91134131e-01, 6.78907931e-01, ...,
        2.18976989e-01, 6.53015897e-02, 2.17538327e-01]], dtype=float32)

In [28]:
d

64

In [29]:
import faiss  

# Exact (brute-force) L2 nearest-neighbour search using a flat FAISS index.
# NOTE: IndexFlatL2 applies no product quantization and no HNSW coarse quantizer.
class Faiss:
    """Thin wrapper around ``faiss.IndexFlatL2``.

    The index is built and filled in ``__init__``; ``search`` then returns the
    nearest indexed vectors for a batch of queries.
    """

    def __init__(self, vectors=None):
        """Build the index and add the database vectors to it.

        Args:
            vectors: 2-D array of vectors to index (FAISS expects float32).
                Defaults to the notebook-level ``xb`` array.
        """
        if vectors is None:
            vectors = xb
        # Take the dimension from the data itself rather than from the global
        # `d`, which a later cell rebinds to a different value.
        self.index = faiss.IndexFlatL2(vectors.shape[1])   # build the index
        print(self.index.is_trained)
        self.index.add(vectors)                             # add vectors to the index
        print(self.index.ntotal)

    def search(self, queries, k=4):
        """Return the ``k`` nearest indexed vectors for each query.

        Args:
            queries: 2-D array of query vectors with the index's dimension.
            k: number of neighbours to return per query.

        Returns:
            Tuple ``(I, D)``: the neighbour ids and the matching distances, in
            that order, exactly as produced by ``self.index.search``.
        """
        # Search with the caller's queries (previously a fixed slice of the
        # database was searched and `queries` was ignored).
        D, I = self.index.search(queries, k)
        return I, D



    
# Build the index, then query it with the first five database vectors as a
# sanity check. In the output recorded below, each of those vectors finds
# itself first (ids 0..4 in the first column, distance 0).
f = Faiss()
f.search(xb[:5])

True
100000


(array([[  0, 393, 363,  78],
        [  1, 555, 277, 364],
        [  2, 304, 101,  13],
        [  3, 173,  18, 182],
        [  4, 288, 370, 531]]),
 array([[0.       , 7.1751733, 7.2076297, 7.2511625],
        [0.       , 6.323565 , 6.684581 , 6.799946 ],
        [0.       , 5.7964087, 6.3917365, 7.2815123],
        [0.       , 7.2779055, 7.527987 , 7.6628466],
        [0.       , 6.7638035, 7.295121 , 7.368815 ]], dtype=float32))

In [30]:
k = 4                          # we want to see 4 nearest neighbors
# No earlier cell defines `index`, so this cell raised NameError on a fresh
# kernel. Reuse the flat L2 index that `f` already built over xb.
index = f.index
D, I = index.search(xb[:5], k) # sanity check
print(I)
print(D)
D, I = index.search(xq, k)     # actual search
print(I[:5])                   # neighbors of the 5 first queries
print(I[-5:])                  # neighbors of the 5 last queries

[[  0 393 363  78]
 [  1 555 277 364]
 [  2 304 101  13]
 [  3 173  18 182]
 [  4 288 370 531]]
[[0.        7.1751733 7.2076297 7.2511625]
 [0.        6.323565  6.684581  6.799946 ]
 [0.        5.7964087 6.3917365 7.2815123]
 [0.        7.2779055 7.527987  7.6628466]
 [0.        6.7638035 7.295121  7.368815 ]]
[[ 381  207  210  477]
 [ 526  911  142   72]
 [ 838  527 1290  425]
 [ 196  184  164  359]
 [ 526  377  120  425]]
[[ 9900 10500  9309  9831]
 [11055 10895 10812 11321]
 [11353 11103 10164  9787]
 [10571 10664 10632  9638]
 [ 9628  9554 10036  9582]]


In [31]:
# Product-quantizer round trip: train, encode, decode, measure the error.
d = 32  # vector dimensionality -- NOTE: rebinds the notebook-level `d` (64 above)
cs = 4  # bytes per code; also passed as the number of sub-quantizers

# Training vectors
nt = 10000
xt = np.random.rand(nt, d).astype(np.float32)

# Vectors to compress (could equally be the training set)
n = 20000
x = np.random.rand(n, d).astype(np.float32)

# cs sub-quantizers of 8 bits each over d-dimensional input
pq = faiss.ProductQuantizer(d, cs, 8)
pq.train(xt)

codes = pq.compute_codes(x)   # encode
x2 = pq.decode(codes)         # decode

# Total squared reconstruction error relative to the total squared input norm
avg_relative_error = np.square(x - x2).sum() / np.square(x).sum()

codes

array([[231, 211,  27,  19],
       [  5, 229, 242,   2],
       [177, 187,  52, 246],
       ...,
       [109, 104,  33,  32],
       [202, 115,  58, 236],
       [209, 158, 225, 115]], dtype=uint8)

In [40]:
import json
import pandas as pd
import numpy as np


def loadData(file_name="company_tickers.json", csv_path="SEC-CompanyTicker.csv"):
    """Load the company-ticker JSON into a DataFrame and export it as CSV.

    Args:
        file_name: Path of the JSON file to read. Its top level maps record
            keys to per-company objects that include a ``title`` field.
        csv_path: Where to write the resulting table as CSV. Pass ``None`` to
            skip the export.

    Returns:
        A DataFrame with one row per JSON record. The ``title`` column is
        title-cased and renamed to ``companyName``.
    """
    # JSON is UTF-8 by specification; do not rely on the platform's default encoding.
    with open(file_name, "r", encoding="utf-8") as json_file:
        loaded_json_data = json.load(json_file)

    # The JSON's top-level keys become columns, so transpose to get one row per record.
    df = pd.DataFrame(loaded_json_data).transpose()
    df["title"] = df["title"].str.title()
    df = df.rename(columns={"title": "companyName"})
    # TODO(review): tickers containing '-' and repeated company names are not filtered out.
    if csv_path is not None:
        df.to_csv(csv_path)
    return df
df = loadData()
df

Unnamed: 0,cik_str,ticker,companyName
0,320193,AAPL,Apple Inc.
1,789019,MSFT,Microsoft Corp
2,1652044,GOOGL,Alphabet Inc.
3,1018724,AMZN,Amazon Com Inc
4,1045810,NVDA,Nvidia Corp
...,...,...,...
10893,1945711,LVROW,Lavoro Ltd
10894,1898795,LVWR-WT,"Livewire Group, Inc."
10895,1837344,MBTCR,Nocturne Acquisition Corp
10896,1837344,MBTCU,Nocturne Acquisition Corp


In [33]:
from transformers import AutoTokenizer, AutoModel

# One checkpoint name is passed to both from_pretrained calls, so the tokenizer
# and the model are loaded from the same source.
model_ckpt = "sentence-transformers/multi-qa-mpnet-base-dot-v1"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
model = AutoModel.from_pretrained(model_ckpt)

In [36]:
import torch

# Prefer the first CUDA GPU when one is available, otherwise stay on the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# get_embeddings moves its inputs to `device`; the model has to live there too,
# otherwise the forward pass fails with a device-mismatch error on a CUDA machine.
model.to(device)

def cls_pooling(model_output):
    """Select index 0 along axis 1 of ``model_output.last_hidden_state``.

    Presumably the tensor is laid out as (batch, sequence, hidden), in which
    case this picks the first token's vector for every input -- confirm against
    the model in use.
    """
    hidden_states = model_output.last_hidden_state
    first_position = hidden_states[:, 0]
    return first_position


def get_embeddings(text_list):
    """Embed text with the notebook-level tokenizer and model.

    Args:
        text_list: Text to embed, in any form the tokenizer accepts (the
            notebook passes both a list of strings and a single string).

    Returns:
        The tensor produced by ``cls_pooling`` on the model output, located on
        ``device``. It carries no autograd history.
    """
    encoded_input = tokenizer(
        text_list, padding=True, truncation=True, return_tensors="pt"
    )
    # Move every tokenizer output tensor to the device used for the forward pass.
    encoded_input = {k: v.to(device) for k, v in encoded_input.items()}
    # Inference only: skip autograd bookkeeping so no graph is retained per call.
    with torch.no_grad():
        model_output = model(**encoded_input)
    return cls_pooling(model_output)

In [39]:
embedding = get_embeddings(["julia","hi"])
embedding.shape

torch.Size([2, 768])

In [85]:
df = df.head(100)

In [86]:

# `Dataset` was used without ever being imported, so this cell raised NameError
# on a fresh kernel.
from datasets import Dataset

dataset = Dataset.from_pandas(df)

# map() calls the lambda once per row, so get_embeddings receives a single
# company name; [0] takes the first (only) row of its output as a NumPy vector.
embeddings_dataset = dataset.map(
    lambda x: {"embeddings": get_embeddings(x["companyName"]).detach().cpu().numpy()[0]}
)

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

In [87]:
embeddings_dataset.add_faiss_index(column="embeddings")

  0%|          | 0/1 [00:00<?, ?it/s]

Dataset({
    features: ['cik_str', 'ticker', 'companyName', '__index_level_0__', 'embeddings'],
    num_rows: 100
})

In [103]:
# Embed the query with the same get_embeddings helper used for the company
# names. Wrapping the string in a list yields a (1, 768) array (see the output).
question = "Coca"
question_embedding = get_embeddings([question]).cpu().detach().numpy()
question_embedding.shape

(1, 768)

In [104]:
# Look up the 5 entries nearest to the query in the FAISS index attached to the
# "embeddings" column; returns their scores and the matching rows.
scores, samples = embeddings_dataset.get_nearest_examples(
    "embeddings", question_embedding, k=5
)

In [105]:
import pandas as pd

# Turn the retrieved rows into a DataFrame and attach each row's score.
samples_df = pd.DataFrame.from_dict(samples).assign(scores=scores)
# NOTE(review): the scores look like distances (an exact match shows 0.0 in the
# recorded output), so this descending sort puts the closest match last -- confirm
# that this ordering is intended.
samples_df = samples_df.sort_values("scores", ascending=False)

In [106]:
# Print every retrieved company name in samples_df's current row order, each
# followed by a blank line.
for company_name in samples_df["companyName"]:
    print(f"COMMENT: {company_name}")
    print()

COMMENT: Caterpillar Inc

COMMENT: Alphabet Inc.

COMMENT: Walt Disney Co

COMMENT: Pepsico Inc

COMMENT: Coca Cola Co



In [97]:
samples_df

Unnamed: 0,cik_str,ticker,companyName,__index_level_0__,embeddings,scores
4,50863,INTC,Intel Corp,54,"[0.06933523714542389, -0.23722538352012634, -0...",18.959095
3,896878,INTU,Intuit Inc.,59,"[-0.07939024269580841, -0.45380669832229614, -...",18.394699
2,1108524,CRM,"Salesforce, Inc.",37,"[0.15066705644130707, -0.26614394783973694, -0...",17.476484
1,1730168,AVGO,Broadcom Inc.,21,"[0.024646926671266556, -0.6132085919380188, -0...",16.711088
0,320193,AAPL,Apple Inc.,0,"[0.04409850016236305, -0.3003808557987213, -0....",0.0


In [98]:
df

Unnamed: 0,cik_str,ticker,companyName
0,320193,AAPL,Apple Inc.
1,789019,MSFT,Microsoft Corp
2,1652044,GOOGL,Alphabet Inc.
3,1018724,AMZN,Amazon Com Inc
4,1045810,NVDA,Nvidia Corp
...,...,...,...
95,1075531,BKNG,Booking Holdings Inc.
96,829224,SBUX,Starbucks Corp
97,1668717,BUD,Anheuser-Busch Inbev Sa/Nv
98,947263,TD,Toronto Dominion Bank


In [None]:
class Faiss:
    """Skeleton for an embeddings-based nearest-neighbour search pipeline.

    Every method below is an unimplemented placeholder and raises
    ``NotImplementedError`` when called.

    NOTE(review): this reuses the name of the ``Faiss`` class defined in an
    earlier cell and replaces it once this cell runs.
    """

    def __init__(self):
        pass

    def makeEmbeddings(self, dataset):
        """Convert ``dataset`` into an embeddings dataset to run FAISS on."""
        raise NotImplementedError("Faiss.makeEmbeddings is not implemented yet")

    def faiss(self, embeddings_dataset):
        """Run the FAISS model on ``embeddings_dataset``."""
        raise NotImplementedError("Faiss.faiss is not implemented yet")

    def getQueryEmbedding(self, query):
        """Get the embedding of ``query``."""
        raise NotImplementedError("Faiss.getQueryEmbedding is not implemented yet")

    def predict(self, query, k=4):
        """Predict the ``k`` nearest neighbours of ``query`` (4 by default)."""
        raise NotImplementedError("Faiss.predict is not implemented yet")