# 01. Conventional RAG based on LangChain

In [1]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyMuPDFLoader
from langchain_community.vectorstores import FAISS
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_core.prompts import PromptTemplate
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

In [2]:
from dotenv import load_dotenv

load_dotenv()

True

In [3]:
from langchain_teddynote import logging

logging.langsmith(project_name="Paper-Agent")

LangSmith 추적을 시작합니다.
[프로젝트명]
Paper-Agent


In [4]:
# 단계 1: 문서 로드(Load Documents)
loader = PyMuPDFLoader(
    "../data/Retrieval Augmented Generation for Knowledge Intensive NLP Tasks.pdf"
)
docs = loader.load()
print(f"문서의 페이지수: {len(docs)}")

문서의 페이지수: 19


In [5]:
docs[0].__dict__

{'id': None,
 'metadata': {'producer': 'pdfTeX-1.40.21',
  'creator': 'LaTeX with hyperref',
  'creationdate': '2021-04-13T00:48:38+00:00',
  'source': '../data/Retrieval Augmented Generation for Knowledge Intensive NLP Tasks.pdf',
  'file_path': '../data/Retrieval Augmented Generation for Knowledge Intensive NLP Tasks.pdf',
  'total_pages': 19,
  'format': 'PDF 1.5',
  'title': '',
  'author': '',
  'subject': '',
  'keywords': '',
  'moddate': '2021-04-13T00:48:38+00:00',
  'trapped': '',
  'modDate': 'D:20210413004838Z',
  'creationDate': 'D:20210413004838Z',
  'page': 0},
 'page_content': 'Retrieval-Augmented Generation for\nKnowledge-Intensive NLP Tasks\nPatrick Lewis†‡, Ethan Perez⋆,\nAleksandra Piktus†, Fabio Petroni†, Vladimir Karpukhin†, Naman Goyal†, Heinrich Küttler†,\nMike Lewis†, Wen-tau Yih†, Tim Rocktäschel†‡, Sebastian Riedel†‡, Douwe Kiela†\n†Facebook AI Research; ‡University College London; ⋆New York University;\nplewis@fb.com\nAbstract\nLarge pre-trained language mod

In [6]:
docs[0].metadata

{'producer': 'pdfTeX-1.40.21',
 'creator': 'LaTeX with hyperref',
 'creationdate': '2021-04-13T00:48:38+00:00',
 'source': '../data/Retrieval Augmented Generation for Knowledge Intensive NLP Tasks.pdf',
 'file_path': '../data/Retrieval Augmented Generation for Knowledge Intensive NLP Tasks.pdf',
 'total_pages': 19,
 'format': 'PDF 1.5',
 'title': '',
 'author': '',
 'subject': '',
 'keywords': '',
 'moddate': '2021-04-13T00:48:38+00:00',
 'trapped': '',
 'modDate': 'D:20210413004838Z',
 'creationDate': 'D:20210413004838Z',
 'page': 0}

In [7]:
# print(docs[0].page_content)

In [8]:
# 단계 2: 문서 분할(Split Documents)
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
split_documents = text_splitter.split_documents(docs)
print(f"분할된 청크의수: {len(split_documents)}")

분할된 청크의수: 158


In [9]:
split_documents[0].__dict__

{'id': None,
 'metadata': {'producer': 'pdfTeX-1.40.21',
  'creator': 'LaTeX with hyperref',
  'creationdate': '2021-04-13T00:48:38+00:00',
  'source': '../data/Retrieval Augmented Generation for Knowledge Intensive NLP Tasks.pdf',
  'file_path': '../data/Retrieval Augmented Generation for Knowledge Intensive NLP Tasks.pdf',
  'total_pages': 19,
  'format': 'PDF 1.5',
  'title': '',
  'author': '',
  'subject': '',
  'keywords': '',
  'moddate': '2021-04-13T00:48:38+00:00',
  'trapped': '',
  'modDate': 'D:20210413004838Z',
  'creationDate': 'D:20210413004838Z',
  'page': 0},
 'page_content': 'Retrieval-Augmented Generation for\nKnowledge-Intensive NLP Tasks\nPatrick Lewis†‡, Ethan Perez⋆,\nAleksandra Piktus†, Fabio Petroni†, Vladimir Karpukhin†, Naman Goyal†, Heinrich Küttler†,\nMike Lewis†, Wen-tau Yih†, Tim Rocktäschel†‡, Sebastian Riedel†‡, Douwe Kiela†\n†Facebook AI Research; ‡University College London; ⋆New York University;\nplewis@fb.com\nAbstract\nLarge pre-trained language mod

In [10]:
# 단계 3: 임베딩(Embedding) 생성
embeddings = OpenAIEmbeddings()

In [11]:
embeddings.__dict__

{'client': <openai.resources.embeddings.Embeddings at 0x11693f810>,
 'async_client': <openai.resources.embeddings.AsyncEmbeddings at 0x117c09350>,
 'model': 'text-embedding-ada-002',
 'dimensions': None,
 'deployment': 'text-embedding-ada-002',
 'openai_api_version': None,
 'openai_api_base': None,
 'openai_api_type': None,
 'openai_proxy': None,
 'embedding_ctx_length': 8191,
 'openai_api_key': SecretStr('**********'),
 'openai_organization': None,
 'allowed_special': None,
 'disallowed_special': None,
 'chunk_size': 1000,
 'max_retries': 2,
 'request_timeout': None,
 'headers': None,
 'tiktoken_enabled': True,
 'tiktoken_model_name': None,
 'show_progress_bar': False,
 'model_kwargs': {},
 'skip_empty': False,
 'default_headers': None,
 'default_query': None,
 'retry_min_seconds': 4,
 'retry_max_seconds': 20,
 'http_client': None,
 'http_async_client': None,
 'check_embedding_ctx_length': True}

In [12]:
# 단계 4: DB 생성(Create DB) 및 저장
vectorstore = FAISS.from_documents(documents=split_documents, embedding=embeddings)

In [13]:
vectorstore

<langchain_community.vectorstores.faiss.FAISS at 0x12018ff50>

In [14]:
print("embedding_function: ", vectorstore.embedding_function)
print("distance_strategy: ", vectorstore.distance_strategy)
print("override_relevance_score_fn: ", vectorstore.override_relevance_score_fn)
print("normalize_L2: ", vectorstore._normalize_L2)

embedding_function:  client=<openai.resources.embeddings.Embeddings object at 0x11693f810> async_client=<openai.resources.embeddings.AsyncEmbeddings object at 0x117c09350> model='text-embedding-ada-002' dimensions=None deployment='text-embedding-ada-002' openai_api_version=None openai_api_base=None openai_api_type=None openai_proxy=None embedding_ctx_length=8191 openai_api_key=SecretStr('**********') openai_organization=None allowed_special=None disallowed_special=None chunk_size=1000 max_retries=2 request_timeout=None headers=None tiktoken_enabled=True tiktoken_model_name=None show_progress_bar=False model_kwargs={} skip_empty=False default_headers=None default_query=None retry_min_seconds=4 retry_max_seconds=20 http_client=None http_async_client=None check_embedding_ctx_length=True
distance_strategy:  DistanceStrategy.EUCLIDEAN_DISTANCE
override_relevance_score_fn:  None
normalize_L2:  False


In [15]:
# 단계 5: 검색기(Retriever) 생성
retriever = vectorstore.as_retriever()

In [16]:
retriever.__dict__

{'name': None,
 'tags': ['FAISS', 'OpenAIEmbeddings'],
 'metadata': None,
 'vectorstore': <langchain_community.vectorstores.faiss.FAISS at 0x12018ff50>,
 'search_type': 'similarity',
 'search_kwargs': {}}

In [17]:
# 단계 6: 프롬프트 생성(Create Prompt)
prompt = PromptTemplate.from_template(
    """You are an assistant for question-answering tasks. 
Use the following pieces of retrieved context to answer the question. 
If you don't know the answer, just say that you don't know. 
Answer in Korean.

#Context: 
{context}

#Question:
{question}

#Answer:"""
)

In [18]:
# 단계 7: 언어모델(LLM) 생성
llm = ChatOpenAI(model_name="gpt-4o", temperature=0)

In [19]:
# 단계 8: 체인(Chain) 생성
chain = (
    {"context": retriever, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

In [20]:
chain

{
  context: VectorStoreRetriever(tags=['FAISS', 'OpenAIEmbeddings'], vectorstore=<langchain_community.vectorstores.faiss.FAISS object at 0x12018ff50>, search_kwargs={}),
  question: RunnablePassthrough()
}
| PromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, template="You are an assistant for question-answering tasks. \nUse the following pieces of retrieved context to answer the question. \nIf you don't know the answer, just say that you don't know. \nAnswer in Korean.\n\n#Context: \n{context}\n\n#Question:\n{question}\n\n#Answer:")
| ChatOpenAI(client=<openai.resources.chat.completions.completions.Completions object at 0x1227af5d0>, async_client=<openai.resources.chat.completions.completions.AsyncCompletions object at 0x1227b4310>, root_client=<openai.OpenAI object at 0x120364e50>, root_async_client=<openai.AsyncOpenAI object at 0x1203f4190>, model_name='gpt-4o', temperature=0.0, model_kwargs={}, openai_api_key=SecretStr('**********'))
| Str

In [21]:
# 체인 실행(Run Chain)
question = "이 논문의 주제가 무엇이야?"
response = chain.invoke(question)
print(response)

모르겠어요.


In [22]:
# 체인 실행(Run Chain)
question = "이 논문의 저자가 누구야?"
response = chain.invoke(question)
print(response)

이 논문의 저자는 Eunsol Choi, Daniel Hewlett, Jakob Uszkoreit, Illia Polosukhin, Alexandre Lacoste, Jonathan Berant, Ethan Perez, Siddharth Karamcheti, Rob Fergus, Jason Weston, Douwe Kiela, Kyunghyun Cho, Tom Kwiatkowski, Jennimaria Palomaki, Olivia Redfield, Michael Collins, Ankur Parikh, Chris Alberti, Danielle Epstein, Matthew Kelcey, Jacob Devlin, Kenton Lee, Kristina N. Toutanova, Llion Jones, Ming-Wei Chang, Andrew Dai, Quoc Le, Slav Petrov입니다.


In [23]:
# 체인 실행(Run Chain)
question = "이 논문의 Abstract 설명해줘."
response = chain.invoke(question)
print(response)

모르겠습니다.


## Result

- 단순하게 만들어진 chain으로는 원하는 결과나 나오지 않음

- Retrieval 부분과 프롬프트에 문제가 있을것으로 예상하고 이 부분에 대한 수정을 할 계획임