### RAG document search using OpenAI

In [1]:
from PyPDF2 import PdfReader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.document_loaders import OnlinePDFLoader
from langchain_community.embeddings import OllamaEmbeddings
from langchain_community.vectorstores import Chroma

In [2]:
import os
from getpass import getpass

# Do not hardcode the API key in the notebook: a literal here is saved (and
# shared) with the file, and it would overwrite a real key already exported
# in the environment. Reuse the existing value when present; otherwise prompt
# for it without echoing the input.
if not os.environ.get('OPENAI_API_KEY'):
    os.environ['OPENAI_API_KEY'] = getpass('OpenAI API key: ')


In [3]:
# Location of the source PDF on the local machine; edit this to point at your copy.
pdf_path = r"C:\Users\ajayk\Desktop\OReilly Definitive Guide_delta_lake.pdf"
pdfreader = PdfReader(pdf_path)

In [4]:
from typing_extensions import Concatenate  # NOTE(review): not used in the visible cells; candidate for removal

# Build one string holding the text of every page of the PDF.
# Pages whose extract_text() result is empty/None are skipped, and pages are
# joined with no separator (same result as appending page by page).
# A single join avoids re-copying the growing string on every page.
page_texts = (page.extract_text() for page in pdfreader.pages)
raw_text = ''.join(text for text in page_texts if text)

In [5]:
# Show only the beginning of the extracted text as a sanity check; echoing the
# whole document would flood the cell output and bloat the saved notebook.
raw_text[:1000]



In [6]:
# Chunking settings: split on newlines, with size and overlap both measured by
# len(), i.e. in characters.
splitter_options = dict(
    separator="\n",
    chunk_size=800,
    chunk_overlap=200,
    length_function=len,
)
text_splitter = CharacterTextSplitter(**splitter_options)

# List of text chunks that will be embedded and indexed below.
texts = text_splitter.split_text(raw_text)

In [7]:
# Number of chunks produced by the splitter.
len(texts)

279

In [8]:
# OpenAI embedding client, created with its default settings.
# NOTE(review): presumably picks up the key from OPENAI_API_KEY set earlier — confirm.
# NOTE(review): running this cell emitted a deprecation warning; the import
# location of OpenAIEmbeddings is probably outdated — check the installed LangChain version.
embeddings = OpenAIEmbeddings()

  warn_deprecated(


In [38]:
# Embed every chunk with the OpenAI embeddings and index the result in a Chroma store.
document_search = Chroma.from_texts(texts=texts, embedding=embeddings)

In [10]:
# Display the vector store object to confirm it was created.
document_search

<langchain_community.vectorstores.chroma.Chroma at 0x1dfaec22100>

In [11]:
from langchain.chains.question_answering import load_qa_chain
from langchain.llms import OpenAI


In [36]:
# Question-answering chain backed by the default OpenAI completion model.
# NOTE(review): with chain_type="stuff" the retrieved documents are presumably
# all placed into a single prompt — confirm against the LangChain docs.
openai_llm = OpenAI()
chain = load_qa_chain(openai_llm, chain_type="stuff")

  warn_deprecated(


In [41]:
# Ask a question: fetch the chunks most similar to the query from the vector
# store, then let the QA chain answer using those chunks as input documents.
query = "what is a delta lake explain in detail"
docs = document_search.similarity_search(query)
chain.run(question=query, input_documents=docs)

'\nDelta Lake is a storage layer that provides ACID transactions, scalable metadata handling, and unified streaming and batch data processing in a single solution. It is built on top of Apache Spark and is designed to address common data lake challenges such as data reliability, scalability, and performance.\n\nAt its core, Delta Lake is a transaction log that stores all the changes made to the data lake. This log is structured in a way that allows Delta Lake to implement the principle of atomicity, which means that either all the changes in a transaction are applied or none of them are. This ensures data consistency and reliability, even in the event of failures.\n\nDelta Lake uses a single source of truth approach, which means that all changes to the data lake are recorded in the transaction log, and this log is the only source of truth for the data lake. This allows Delta Lake to easily compute the state of each table, using the transaction log to catch up from the most recent check

### RAG using llama2

In [14]:
# Embedding client served by the local Ollama instance, using the llama2 model.
# The keyword is `model` (as in the Chroma cell further down and in the object's
# repr); `model_name` is not a field of OllamaEmbeddings.
embedding1 = OllamaEmbeddings(model="llama2")

In [25]:
# Display the embedding client's configuration (base URL, model and other options).
embedding1

OllamaEmbeddings(base_url='http://localhost:11434', model='llama2', embed_instruction='passage: ', query_instruction='query: ', mirostat=None, mirostat_eta=None, mirostat_tau=None, num_ctx=None, num_gpu=None, num_thread=None, repeat_last_n=None, repeat_penalty=None, temperature=None, stop=None, tfs_z=None, top_k=None, top_p=None, show_progress=False, headers=None, model_kwargs=None)

## MultiQueryRetriever
The MultiQueryRetriever uses an LLM to generate several rephrasings of the user's question, runs a retrieval for each rephrasing, and combines the retrieved documents. This makes the results less sensitive to the exact wording of the question, which is a known limitation of distance-based similarity search. It is particularly useful in applications such as:

1. Question Answering Systems
2. Chatbots
3. Search Engines

In [15]:
from langchain.prompts import ChatPromptTemplate, PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_community.chat_models import ChatOllama
from langchain_core.runnables import RunnablePassthrough
from langchain.retrievers.multi_query import MultiQueryRetriever

In [31]:
# Local embedding model served by Ollama; show_progress prints a progress bar
# while the chunks are embedded.
nomic_embeddings = OllamaEmbeddings(model="nomic-embed-text", show_progress=True)

# Index all chunks in a Chroma collection named "local-rag".
vector_db = Chroma.from_texts(
    texts=texts,
    embedding=nomic_embeddings,
    collection_name="local-rag",
)


OllamaEmbeddings:   0%|                                                                        | 0/279 [00:00<?, ?it/s][A
OllamaEmbeddings:   0%|▏                                                               | 1/279 [00:04<19:09,  4.14s/it][A
OllamaEmbeddings:   1%|▍                                                               | 2/279 [00:06<15:10,  3.29s/it][A
OllamaEmbeddings:   1%|▋                                                               | 3/279 [00:09<13:52,  3.02s/it][A
OllamaEmbeddings:   1%|▉                                                               | 4/279 [00:13<14:41,  3.20s/it][A
OllamaEmbeddings:   2%|█▏                                                              | 5/279 [00:16<14:48,  3.24s/it][A
OllamaEmbeddings:   2%|█▍                                                              | 6/279 [00:18<13:39,  3.00s/it][A
OllamaEmbeddings:   3%|█▌                                                              | 7/279 [00:21<12:46,  2.82s/it][A
OllamaEmbedding

In [23]:
# Name of the chat model to use; presumably it must already be available in
# the local Ollama installation — confirm.
local_model = "llama2"
# Chat LLM used below for query rewriting and for answering.
llm = ChatOllama(model=local_model)

In [32]:
# Instruction given to the LLM to rewrite the user's question into five
# alternative phrasings, one per line. {question} is the only placeholder.
multi_query_template = """You are an AI language model assistant. Your task is to generate five
    different versions of the given user question to retrieve relevant documents from
    a vector database. By generating multiple perspectives on the user question, your
    goal is to help the user overcome some of the limitations of the distance-based
    similarity search. Provide these alternative questions separated by newlines.
    Original question: {question}"""

QUERY_PROMPT = PromptTemplate(
    template=multi_query_template,
    input_variables=["question"],
)

In [33]:
# Wrap the vector store's retriever so that each question is first expanded
# by the LLM using QUERY_PROMPT.
base_retriever = vector_db.as_retriever()
retriever = MultiQueryRetriever.from_llm(
    retriever=base_retriever,
    llm=llm,
    prompt=QUERY_PROMPT,
)

# RAG prompt: restricts the answer to the supplied context.
template = (
    "Answer the question based ONLY on the following context:\n"
    "{context}\n"
    "Question: {question}\n"
)

prompt = ChatPromptTemplate.from_template(template)

In [34]:
# Inputs for the RAG prompt: "context" comes from the retriever, while
# "question" is the chain's input passed through unchanged.
rag_inputs = {"context": retriever, "question": RunnablePassthrough()}

# Pipeline: gather inputs -> fill the prompt -> call the LLM -> plain string output.
# NOTE: this rebinds `chain`, which an earlier cell assigned to the OpenAI QA chain.
chain = rag_inputs | prompt | llm | StrOutputParser()

In [35]:
# Run the RAG pipeline end to end; the input string is used as the question.
chain.invoke("What is delta lake?")


OllamaEmbeddings:   0%|                                                                          | 0/1 [00:00<?, ?it/s][A
OllamaEmbeddings: 100%|██████████████████████████████████████████████████████████████████| 1/1 [00:03<00:00,  3.36s/it][A

OllamaEmbeddings:   0%|                                                                          | 0/1 [00:00<?, ?it/s][A
OllamaEmbeddings: 100%|██████████████████████████████████████████████████████████████████| 1/1 [00:02<00:00,  2.13s/it][A

OllamaEmbeddings:   0%|                                                                          | 0/1 [00:00<?, ?it/s][A
OllamaEmbeddings: 100%|██████████████████████████████████████████████████████████████████| 1/1 [00:02<00:00,  2.18s/it][A

OllamaEmbeddings:   0%|                                                                          | 0/1 [00:00<?, ?it/s][A
OllamaEmbeddings: 100%|██████████████████████████████████████████████████████████████████| 1/1 [00:02<00:00,  2.16s/it][A

OllamaEmbed

"Delta Lake is an open-source, distributed data storage system designed to handle large-scale data processing and analytics workloads. It is a follow-up project to Apache Spark's Structured Streaming, which provides real-time data processing capabilities. Delta Lake aims to address some of the challenges associated with handling large amounts of data in a distributed environment, such as data consistency, fault tolerance, and scalability.\n\nAt its core, Delta Lake is a data storage system that uses a log-structured merge tree (LSM) to organize and store data. It stores data in Parquet format, which provides efficient compression and data processing capabilities. Delta Lake also supports various data sources, including Apache Hive, Apache Cassandra, and Amazon S3.\n\nSome key features of Delta Lake include:\n\n1. Transactional storage: Delta Lake provides transactional storage, which means that it stores data in a way that ensures data consistency and accuracy. This is achieved through