In [1]:
import os
import sys
from langchain.docstore.document import Document

from typing import List
from rank_bm25 import BM25Okapi
import numpy as np

In [2]:
os.environ["OPENAI_API_KEY"] = "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"
# os.environ["TAVILY_API_KEY"] = ""
os.environ["PINECONE_API_KEY"] = "145b5cf4-4171-4382-bf92-d2708662e0f7"


In [3]:
path = "../data/Understanding_Climate_Change.pdf"


In [4]:
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
# from langchain.vectorstores import FAISS
from langchain_pinecone import PineconeVectorStore

  from tqdm.autonotebook import tqdm


In [68]:
## Helper Functions
def replace_n_with_space(list_of_documents):
    """
    Replaces all tab characters ('\n') with spaces in the page content of each document.

    Args:
        list_of_documents: A list of document objects, each with a 'page_content' attribute.

    Returns:
        The modified list of documents with tab characters replaced by spaces.
    """

    for doc in list_of_documents:
        doc.page_content = doc.page_content.replace('\n', ' ')  # Replace tabs with spaces
    return list_of_documents

def show_context(context):
    """
    Display the contents of the provided context list.

    Args:
        context (list): A list of context items to be displayed.

    Prints each context item in the list with a heading indicating its position.
    """
    for i, c in enumerate(context):
        print(f"Context {i + 1}:")
        print(c)
        print("\n")

In [27]:
def encode_pdf_and_get_split_documents(path, chunk_size=1000, chunk_overlap=200):
    """
    Encodes a PDF book into a vector store using OpenAI embeddings.

    Args:
        path: The path to the PDF file.
        chunk_size: The desired size of each text chunk.
        chunk_overlap: The amount of overlap between consecutive chunks.

    Returns:
        A FAISS vector store containing the encoded book content.
    """

    # Load PDF documents
    loader = PyPDFLoader(path)
    documents = loader.load()

    # Split documents into chunks
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap, length_function=len
    )
    texts = text_splitter.split_documents(documents)
    cleaned_texts = replace_n_with_space(texts)
    # Create embeddings and vector store
    # embeddings = OpenAIEmbeddings()
    # vectorstore = FAISS.from_documents(cleaned_texts, embeddings)
    vectorstore = PineconeVectorStore(index_name="gen-ai", embedding=OpenAIEmbeddings(model="text-embedding-3-large"))

    vectorstore.add_documents(documents=cleaned_texts, namespace="fusion_retrival_vectors")
    # retriever = vector_store.as_retriever()


    return vectorstore, cleaned_texts

In [28]:
vectorstore, cleaned_texts = encode_pdf_and_get_split_documents(path)


In [18]:
cleaned_texts[0].page_content

'Understanding Climate Change   Chapter 1: Introduction to Climate Change   Climate change refers to significant, long -term changes in the global climate. The term  "global climate" encompasses the planet\'s overall weather patterns, including temperature,  precipitation, and wind patterns, over an extended period. Over the past cent ury, human  activities, particularly the burning of fossil fuels and deforestation, have significantly  contributed to climate change.   Historical Context   The Earth\'s climate has changed throughout history. Over the past 650,000 years, there have  been seven cycles of glacial advance and retreat, with the abrupt end of the last ice age about  11,700 years ago marking the beginning of the modern climate era and  human civilization.  Most of these climate changes are attributed to very small variations in Earth\'s orbit that  change the amount of solar energy our planet receives. During the Holocene epoch, which'

In [29]:
def create_bm25_index(documents: List[Document]) -> BM25Okapi:
    """
    Create a BM25 index from the given documents.

    BM25 (Best Matching 25) is a ranking function used in information retrieval.
    It's based on the probabilistic retrieval framework and is an improvement over TF-IDF.

    Args:
    documents (List[Document]): List of documents to index.

    Returns:
    BM25Okapi: An index that can be used for BM25 scoring.
    """
    # Tokenize each document by splitting on whitespace
    # This is a simple approach and could be improved with more sophisticated tokenization
    tokenized_docs = [doc.page_content.split() for doc in documents]
    return BM25Okapi(tokenized_docs)


In [30]:
bm25 = create_bm25_index(cleaned_texts) # Create BM25 index from the cleaned texts (chunks)


In [None]:
vectorstore.similarity_search_with_score

97

In [None]:
def fusion_retrieval(vectorstore, bm25, query: str, k: int = 5, alpha: float = 0.5) -> List[Document]:
    """
    Perform fusion retrieval combining keyword-based (BM25) and vector-based search.

    Args:
    vectorstore (VectorStore): The vectorstore containing the documents.
    bm25 (BM25Okapi): Pre-computed BM25 index.
    query (str): The query string.
    k (int): The number of documents to retrieve.
    alpha (float): The weight for vector search scores (1-alpha will be the weight for BM25 scores).

    Returns:
    List[Document]: The top k documents based on the combined scores.
    """
    # Step 1: Get all documents from the vectorstore
    index_stats = vectorstore._index.describe_index_stats()
    total_count = index_stats["namespaces"].get('fusion_retrival_vectors', {}).get("vector_count", 0)
    all_docs = vectorstore.similarity_search("", k=total_count)

    print('all_docs', all_docs)
    
    # Step 2: Perform BM25 search
    bm25_scores = bm25.get_scores(query.split())
    print('bm25_scores', bm25_scores)
    print('bm25_scores', len(bm25_scores))
    # # Step 3: Perform vector search
    vector_results = vectorstore.similarity_search_with_score(query, namespace='fusion_retrival_vectors', k=total_count)
    
    print('vector_results', vector_results)
    # # Step 4: Normalize scores
    vector_scores = np.array([score for _, score in vector_results])
    vector_scores = 1 - (vector_scores - np.min(vector_scores)) / (np.max(vector_scores) - np.min(vector_scores))

    bm25_scores = (bm25_scores - np.min(bm25_scores)) / (np.max(bm25_scores) - np.min(bm25_scores))


    print("vector_scroes", vector_scores)
    print("bm25_scores",bm25_scores)
    
    # # Step 5: Combine scores
    combined_scores = alpha * vector_scores + (1 - alpha) * bm25_scores  
    print("combined_scores",combined_scores)
    # # Step 6: Rank documents
    sorted_indices = np.argsort(combined_scores)[::-1]
    print("sorted_indices",sorted_indices)
    # # Step 7: Return top k documents
    return [all_docs[i] for i in sorted_indices[:k]]

In [None]:
query = "What are the impacts of climate change on the environment?"

# Perform fusion retrieval
top_docs = fusion_retrieval(vectorstore, bm25, query, k=5, alpha=0.5)
print('top_docs:', top_docs)
docs_content = [doc.page_content for doc in top_docs]
show_context(docs_content)


all_docs [Document(id='55665bc3-33e2-461d-bbb1-88d531fd24f5', metadata={'doc_id': '3624ece1-38b6-4450-9d3e-32076b64a58b'}, page_content='The document discusses the concept of building autonomous agents powered by LLM (large language model) controllers. It covers key components such as planning, memory, and tool use, with examples of proof-of-concept demos like AutoGPT and GPT-Engineer. Challenges such as finite context length, reliability of natural language interface, and long-term planning are also highlighted. The document provides insights into the potential of LLM-powered agents and showcases case studies like a Scientific Discovery Agent and Generative Agents Simulation. It also includes references and citations for further reading.'), Document(id='a8f29bc2-afff-40c8-9747-94ccef70227e', metadata={'doc_id': 'df81c93f-ccaf-4bb7-95d1-82d1b6785a13'}, page_content='The document discusses the importance of high-quality human data for training deep learning models. It covers topics such

TypeError: 'NoneType' object is not iterable