In [1]:
# Read a local .env file (if one exists) into the process environment.
# Presumably this is where the API keys used by later cells come from — verify.
from dotenv import load_dotenv
load_dotenv()

True

In [2]:
import os

from langchain_community.document_loaders import PyPDFLoader, PDFPlumberLoader, PyPDFDirectoryLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_google_genai import GoogleGenerativeAIEmbeddings, ChatGoogleGenerativeAI
from langchain_community.vectorstores import FAISS
from langchain.retrievers import BM25Retriever, EnsembleRetriever
from langchain_core.prompts import ChatPromptTemplate, PromptTemplate
from langchain_core.runnables import RunnableParallel, RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser

## data loader

In [None]:
# NOTE(review): this binds the directory loader object itself to `docs`; no
# `.load()` is called here, and `docs` is rebound to a list in a later cell.
docs = PyPDFDirectoryLoader('data/RAG')

In [None]:
# Build one PyPDFLoader per paper PDF stored under data/RAG.
loader = [
    PyPDFLoader(f"data/RAG/{stem}.pdf")
    for stem in ("Adaptive_RAG", "Naive_RAG", "RAPTOR_RAG", "Self_RAG")
]

In [None]:
# Load every PDF into one flat list of documents.
# The loop variable is deliberately not called `loader`: reusing that name would
# rebind the list of loaders to a single loader object, so running this cell a
# second time would fail when it tried to iterate over it.
docs = []

for pdf_loader in loader:
    docs.extend(pdf_loader.load())

# Display how many documents were loaded in total.
len(docs)

In [None]:
# docs = []

# for loader in [
#     PDFPlumberLoader("data/RAG/Adaptive_RAG.pdf"),
#     PDFPlumberLoader("data/RAG/Naive_RAG.pdf"),
#     PDFPlumberLoader("data/RAG/RAPTOR_RAG.pdf"),
#     PDFPlumberLoader("data/RAG/Self_RAG.pdf"),
# ]:
#     docs.extend(loader.load())

# len(docs)

## text_splitter

In [None]:
# Split the loaded documents into overlapping chunks for retrieval.
text_splitters = RecursiveCharacterTextSplitter(
    chunk_size=500,    # upper bound on the size of each chunk
    chunk_overlap=50,  # amount shared between neighbouring chunks
)
documents = text_splitters.split_documents(docs)

In [None]:
# Display the number of chunks produced by the splitter.
len(documents)

## embeddings

In [None]:
# Read the Google API key from the environment (e.g. populated from .env by
# load_dotenv) rather than embedding it in the notebook.
# SECURITY: never commit a literal key; any key that was ever pasted into this
# cell must be considered leaked and should be revoked.
# The value is None when GOOGLE_API_KEY is not set.
api_key = os.getenv("GOOGLE_API_KEY")

In [None]:
# Embedding client for the "models/embedding-001" model, given the key held in `api_key`.
embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001", api_key=api_key)

In [None]:
# Smoke test: embed a short Korean greeting ("hello") and display the returned value.
embeddings.embed_query("안녕")

## retriever

In [None]:
# BM25 retriever built from the split chunks.
# Needs the `rank_bm25` package (pip install rank_bm25).
bm25_retriever = BM25Retriever.from_documents(documents)

In [None]:
# Try the BM25 retriever on a sample query and display what it returns.
bm25_retriever.invoke("raptor rag")

In [None]:
# Build a FAISS vector store from the chunks using `embeddings`,
# then expose it through the retriever interface.
faiss_index = FAISS.from_documents(documents, embeddings)
faiss_retriever = faiss_index.as_retriever()

In [None]:
# Same sample query against the FAISS retriever, for comparison with BM25.
faiss_retriever.invoke("raptor rag")

In [None]:
# Hybrid retriever that combines the BM25 and FAISS retrievers.
# `weights` is an ordered list aligned with `retrievers`:
# 0.4 applies to BM25 and 0.6 to FAISS. It must be a list (a set has no
# order) and the keyword must be spelled `weights`.
ensemble_retriever = EnsembleRetriever(
    retrievers=[bm25_retriever, faiss_retriever],
    weights=[0.4, 0.6],
)

In [None]:
# Sample query against the combined retriever.
ensemble_retriever.invoke("raptor rag")

## llm


In [3]:
# OpenAI chat model. No key is passed explicitly, so it is presumably read
# from the environment — verify.
from langchain_openai import ChatOpenAI
llm = ChatOpenAI(model='gpt-4o-mini')

In [None]:
# Gemini chat model, given the key held in `api_key`.
# NOTE(review): when cells run top to bottom this rebinds `llm`, replacing the
# ChatOpenAI instance created in the previous cell — confirm which is intended.
llm = ChatGoogleGenerativeAI(model="gemini-1.5-pro", api_key=api_key)

In [None]:
# Quick check that the model answers; display only the `.content` of the reply.
llm.invoke("안녕?").content

## prompt

In [None]:
# Prompt text with two placeholders, {question} and {context}, ending with an
# "#Answer:" cue.
template = """
#Question:
{question}
#Context:
{context}

#Answer:
"""

In [None]:
# Turn the template string into a PromptTemplate object.
prompt = PromptTemplate.from_template(template)

In [None]:
# Display the prompt object for inspection.
prompt

## chain

In [None]:
# RAG pipeline: the input question is sent to the ensemble retriever (as
# "context") and passed through unchanged (as "question"); both fill the
# prompt, which goes to the LLM, and the reply is parsed to a plain string.
chain = (
    RunnableParallel(
        context=ensemble_retriever,
        question=RunnablePassthrough(),
    )
    | prompt
    | llm
    | StrOutputParser()
)

In [None]:
# Run the whole chain once on a sample question and print the answer.
print(chain.invoke("RAPTOR rag 대해서 설명해주세요"))

## stream

In [None]:
# Stream the chain's answer instead of waiting for the full reply.
answer = chain.stream("RAPTOR rag 대해서 설명해주세요")

# Print each chunk as it arrives: no newline between chunks, flushed immediately.
for token in answer:
    print(token, end="", flush=True)

In [None]:
# Stream the answer while also collecting the full text in `final_answer`.
answer = chain.stream("RAPTOR rag 대해서 설명해주세요")
final_answer = ""
for token in answer:
    final_answer += token  # chunks are concatenated as strings
    print(token, end="", flush=True)

In [None]:
# Hand the stream to the `stream_response` helper from langchain_teddynote.
# Presumably it prints the chunks as they arrive — verify against that package.
from langchain_teddynote.messages import stream_response

answer = chain.stream("RAPTOR rag 대해서 설명해주세요")
stream_response(answer)

In [None]:
# Stream straight from the LLM, bypassing the retrieval chain.
answer_llm = llm.stream("Raptor RAG에 대해서 설명해주세요")
for token in answer_llm:
    # No output parser here, so the text is read from each chunk's `.content`.
    print(token.content, end="", flush=True)