In [1]:
import numpy as np
import pickle, os
import faiss

from dotenv import load_dotenv
from langchain_community.vectorstores import FAISS # intergrates Langchain vectorstore with Faiss
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_google_genai import GoogleGenerativeAIEmbeddings

In [2]:
chunk_data_path = 'data/chunks/'
file_name = 'chunks_rec_text_split_cs300_co50_defaultsep.json'

with open(os.path.join(chunk_data_path, file_name), "rb") as fp:
    chunks = pickle.load(fp)

In [3]:
load_dotenv()

True

In [4]:
emb_model = GoogleGenerativeAIEmbeddings(
            model="models/embedding-001",
            google_api_key=os.getenv("GOOGLE_API_KEY")
        ) 

In [5]:
a = emb_model.embed_documents(["d", '3'])

In [11]:
len(chunks)

600

In [18]:
%%time
vectors = [emb_model.embed_query(doc.page_content) for doc in chunks]

CPU times: user 1.85 s, sys: 728 ms, total: 2.57 s
Wall time: 4min 10s


In [19]:
%%time
embedding_mat = np.array(vectors).astype("float16")

CPU times: user 39.1 ms, sys: 13 ms, total: 52.1 ms
Wall time: 52.1 ms


In [21]:
embedding_mat.shape

(600, 768)

In [22]:
embedding_mat

array([[ 0.04578 ,  0.02234 , -0.04556 , ...,  0.02011 , -0.02872 ,
         0.01984 ],
       [ 0.03488 , -0.0024  , -0.0512  , ...,  0.0226  , -0.0179  ,
         0.01403 ],
       [ 0.04944 ,  0.01929 , -0.061   , ...,  0.02817 , -0.04602 ,
         0.01826 ],
       ...,
       [ 0.05283 ,  0.04312 , -0.04785 , ...,  0.011375, -0.03928 ,
         0.01743 ],
       [ 0.0293  ,  0.02461 , -0.05212 , ...,  0.03604 , -0.03802 ,
        -0.00715 ],
       [ 0.0188  ,  0.005432, -0.0709  , ...,  0.02985 , -0.03766 ,
        -0.007412]], shape=(600, 768), dtype=float16)

In [48]:
emb_dim = len(emb_model.embed_documents(["test_query"])[0])
k = 3
n_cluster = 10 # ??? What if rule-of-thumb and why?

flat_index = faiss.IndexFlatL2(emb_dim)
ivf_index = faiss.IndexIVFFlat(faiss.IndexFlatL2(emb_dim), emb_dim, n_cluster)
ivf_index.train(embedding_mat)

flat_vector_store = FAISS(
    embedding_function=emb_model,
    index=flat_index,
    docstore=InMemoryDocstore(),
    index_to_docstore_id ={}
)
flat_vector_store.add_documents(documents=chunks)

ivf_vector_store = FAISS(
    embedding_function=emb_model,
    index=ivf_index,
    docstore=InMemoryDocstore(),
    index_to_docstore_id ={}
)
ivf_vector_store.add_documents(documents=chunks)

In [49]:
"""
search_type
- similarity: Cosine/L2
- MMR(Maximal Marginal Relevance): ???
- Similarity_score_threshold: Only returns documents above certain threshold.
 - pass in additional param "score_threshold"
"""

%%timeit
flat_vector_store.search("What are LG Energy solution's flagship products?"
                         , k=3
                         , search_type="similarity")

570 ms ± 50.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [54]:
path_to_vectorstore = "data/vector_stores/"
flat_vector_store.save_local(os.path.join(path_to_vectorstore, "flat"))

In [50]:
%%timeit
ivf_vector_store.search("What are LG Energy solution's flagship products?"
                        , k=3
                        , search_type="similarity")

480 ms ± 117 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [55]:
ivf_vector_store.save_local(os.path.join(path_to_vectorstore, "ivfFlat"))

In [52]:
"""
M = 32 # number of connections for each node.
ef_construction = 200 # size of dynamic candidate list

index_hnsw = faiss.IndexHNSWFlat(emb_dim, M)
index_hnsw.hnsw.efConstruction = ef_construction
index_hnsw.add(embedding_mat)

ef_search = 50 # ???
index_hnsw.search(query, k)
"""
M = 32
hnsw_index = faiss.IndexHNSWFlat(emb_dim, M)
hnsw_vector_store = FAISS(
    embedding_function=emb_model,
    index=hnsw_index,
    docstore=InMemoryDocstore(),
    index_to_docstore_id ={}
)
hnsw_vector_store.add_documents(documents=chunks)

In [56]:
hnsw_vector_store.save_local(os.path.join(path_to_vectorstore, "HNSW"))

In [53]:
%%timeit
hnsw_vector_store.search("What are LG Energy solution's flagship products?"
                        , k=3
                        , search_type="similarity")

453 ms ± 73 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
