In [None]:
from langchain_community.embeddings import HuggingFaceEmbeddings

def pretty_print_docs(docs):
    """Print the text of each document, numbered from 1.

    Every entry is rendered as ``Document <n>:`` followed by its
    ``page_content``; consecutive entries are separated by a line of
    100 dashes.

    Args:
        docs: Iterable of objects exposing a string ``page_content`` attribute.
    """
    separator = f"\n{'-' * 100}\n"
    rendered = []
    for position, doc in enumerate(docs, start=1):
        rendered.append(f"Document {position}:\n" + doc.page_content)
    print(separator.join(rendered))

# Hugging Face model identifier handed to HuggingFaceEmbeddings.
model = "flax-sentence-embeddings/all_datasets_v3_roberta-large"

# Embedding object reused by the embedding and Chroma cells further down.
embedding_function = HuggingFaceEmbeddings(model_name=model)

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma

from langchain.schema import Document

# Load blog post
from langchain.document_loaders import TextLoader

# Read the source text file through TextLoader.
# NOTE(review): this is an absolute, machine-specific path; the cell only
# runs where that file exists.
loader = TextLoader(
    "/home/mind/projects/projects/document_retrieval_system/vectordb.txt"
)
data = loader.load()
print(len(data))  # count before splitting

# Split the loaded documents into chunks (chunk_size=1200, chunk_overlap=30);
# `docs` holds the resulting chunks and is what the later cells consume.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1200,
    chunk_overlap=30,
)
docs = text_splitter.split_documents(data)
print(len(docs))  # count after splitting

In [None]:
# Embed the two ad-hoc queries that the similarity plot compares against.
vector1 = embedding_function.embed_query("How is the weather?")
vector2 = embedding_function.embed_query("Suggest some people for a Python project.")

# Embed every chunk in a single batched call instead of one embed_query()
# call per chunk, so the model is invoked once rather than len(docs) times.
# The guard keeps the previous result (an empty list) when there are no chunks.
data_vectors = (
    embedding_function.embed_documents([doc.page_content for doc in docs])
    if docs
    else []
)
print(len(data_vectors))

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt
import numpy as np

# Cosine similarity between each query embedding and every chunk embedding.
# cosine_similarity returns a 1x1 matrix for a single pair, hence the [0][0].
cosine_sims_1 = [cosine_similarity([vector1], [data_vector])[0][0] for data_vector in data_vectors]
cosine_sims_2 = [cosine_similarity([vector2], [data_vector])[0][0] for data_vector in data_vectors]

# One x position per chunk, in the order the chunks were embedded.
x = np.arange(len(data_vectors))

plt.scatter(x, cosine_sims_1, label='How is the weather?', alpha=0.7)
plt.scatter(x, cosine_sims_2, label='Suggest some people for a Python project.', alpha=0.7)

# Label both axes so the figure is readable on its own; the title typo
# ("Consine") is corrected.
plt.xlabel('Chunk index')
plt.ylabel('Cosine Similarity')
plt.title('Cosine Similarity between query and data vectors')
plt.legend()

plt.show()

In [None]:
import sys

# Register pysqlite3 under the name 'sqlite3' so that any later
# `import sqlite3` (presumably the one inside chromadb, imported in the next
# cell - confirm) receives the pysqlite3 build instead of the stdlib module.
try:
    __import__('pysqlite3')
except ImportError:
    # pysqlite3 is optional: without it, keep the standard-library sqlite3
    # rather than aborting the notebook, and say so explicitly.
    import warnings

    warnings.warn(
        "pysqlite3 is not installed; keeping the standard-library sqlite3 module."
    )
else:
    sys.modules['sqlite3'] = sys.modules.pop('pysqlite3')

In [None]:
import chromadb
import chromadb.config

from langchain.storage import InMemoryStore
from langchain.retrievers import ParentDocumentRetriever

# Two splitters handed to ParentDocumentRetriever below.
# NOTE(review): the chunks in `docs` were already produced with
# chunk_size=1200, so re-splitting them at 1500 / 1200 will presumably leave
# them unchanged - confirm these sizes are what was intended.
parent_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=30)
child_splitter = RecursiveCharacterTextSplitter(chunk_size=1200, chunk_overlap=30)

# In-memory document store plus a Chroma collection named "full_documents"
# that uses the shared embedding function.
store = InMemoryStore()
vectorstore = Chroma(
    embedding_function=embedding_function,
    collection_name="full_documents",
)

# Wire the vector store, the docstore and both splitters into one retriever.
retriever = ParentDocumentRetriever(
    docstore=store,
    vectorstore=vectorstore,
    parent_splitter=parent_splitter,
    child_splitter=child_splitter,
)

In [None]:
# Add the pre-split chunks to the parent/child retriever. ids=None is passed
# explicitly (presumably the retriever then assigns ids itself - confirm).
retriever.add_documents(docs, ids=None)

In [None]:
# Query the Chroma vector store directly rather than going through `retriever`;
# as the cell's last expression, the result is displayed as the cell output.
vectorstore.similarity_search("Suggest some people for a Python, PHP and Angular project.")

In [None]:
# Run the retrieval before opening the output file: opening in 'w' mode
# truncates it, so a failing retrieval would otherwise leave an empty file.
parent_child_results = retriever.get_relevant_documents("Suggest some people for a Node and Angular project.")

# Write the repr of the result list. The encoding is pinned so non-ASCII
# document text is written the same way regardless of the platform default.
with open('retrieved_docs_parent_child.txt', 'w', encoding='utf-8') as file:
    file.write(str(parent_child_results))

In [None]:
# Debug check: show the concrete class of the `vectorstore` object.
print(type(vectorstore))

Ensemble Retriever

In [None]:
from langchain.retrievers import BM25Retriever, EnsembleRetriever


# BM25 retriever built from the chunk list, with its `k` attribute set to 2.
bm25_retriever = BM25Retriever.from_documents(docs)
bm25_retriever.k = 2

# Separate Chroma store built from the same chunks with the shared embedding
# function, exposed as a retriever using the "mmr" search type.
chroma_vectorstore = Chroma.from_documents(docs, embedding_function)
chroma_retriever = chroma_vectorstore.as_retriever(
    search_type="mmr",
    search_kwargs={"k": 15, "fetch_k": 20, "lambda_mult": 0.25},
)

# Combine both retrievers, weighting them equally.
ensemble_retriever = EnsembleRetriever(
    retrievers=[bm25_retriever, chroma_retriever],
    weights=[0.5, 0.5],
)

In [None]:
# Query the ensemble retriever.
# NOTE(review): this rebinds `docs`, the chunk list that the earlier cells
# pass to the retrievers; re-running those cells after this one would index
# the retrieved results instead of the original chunks.
docs = ensemble_retriever.get_relevant_documents(query="Suggest some people for a Python, Node and Angular project.")

# Write the repr of the result list. The encoding is pinned so non-ASCII
# document text is written the same way regardless of the platform default.
with open('retrieved_docs_ensemble.txt', 'w', encoding='utf-8') as file:
    file.write(str(docs))

Time-weighted vector store retriever

In [None]:
import faiss

from datetime import datetime, timedelta
from langchain.docstore import InMemoryDocstore
from langchain.embeddings import OpenAIEmbeddings
from langchain.retrievers import TimeWeightedVectorStoreRetriever
from langchain.schema import Document
from langchain.vectorstores import FAISS