In [19]:
from LlamaParseLoader import LlamaParseLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_openai import ChatOpenAI,OpenAIEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_core.prompts import PromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser

In [7]:
# Instructions forwarded to the parser: describe the document being parsed and
# ask for tables and visual content to be rendered as descriptive text.
# The description must match the file in `file_paths` below (a diabetes study
# that reports model results), otherwise the parser is given misleading context.
parsing_instructions = """This document is a research paper that evaluates machine learning models on diabetes data.
Tables should be parsed and interpreted as text with detailed information.
Images, Graphs, Diagrams should be interpreted as text with detailed descriptions."""

# Build the loader for the single PDF used throughout this notebook.
loader = LlamaParseLoader(
    file_paths=["data/diabetes draft7.pdf"],
    parsing_instructions=parsing_instructions,
)

In [8]:
# Run the loader configured above and keep the parsed result for splitting.
docs = loader.load()

Started parsing the file under job_id b0d7113d-fb62-4bf5-b8e9-e43b5b07381f


In [9]:
# Chunking parameters, kept together so they are easy to tune.
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 50

# Split the parsed documents into overlapping chunks for embedding.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
)
documents = text_splitter.split_documents(docs)

In [10]:
# Embedding model used to vectorize the chunks.
EMBEDDING_MODEL = "text-embedding-3-small"
embeddings = OpenAIEmbeddings(model=EMBEDDING_MODEL)

In [14]:
# Embed every chunk and index the vectors in a FAISS store.
vector_store = FAISS.from_documents(
    documents=documents,
    embedding=embeddings,
)

In [15]:
# Expose the vector store as a retriever; no arguments are passed, so the
# library's default search settings apply.
retriever = vector_store.as_retriever()

In [17]:
# Display the retriever object to confirm how it was constructed.
retriever

VectorStoreRetriever(tags=['FAISS', 'OpenAIEmbeddings'], vectorstore=<langchain_community.vectorstores.faiss.FAISS object at 0x0000018F1D1DA890>)

In [18]:
# Chat model that generates the final answer.
CHAT_MODEL = "gpt-4o-mini"
llm = ChatOpenAI(model=CHAT_MODEL)

In [20]:
# Prompt for the RAG chain. It instructs the model (in Korean) to answer in
# detail from the supplied context, cite its sources, admit when it does not
# know the answer, and reply in Korean.
# Placeholders: {question} is the user question, {context} is the retrieved text.
# Fixed a typo in the "say you don't know" instruction (잡해주세요 -> 답해주세요).
template = """
    당신은 친절한 ai 어시스턴트입니다. 질문에 주어진 context를 활용해 자세히 답변해 주세요.
    답변을 할때, 출처를 반드시 남겨주세요.
    만약, 답변내용을 모른다면, 모른다고 답해주세요.
    답변은 한국어로 답변해 주세요.

    #Question:
    {question}
    #Context:
    {context}

    #Answer:
"""

# Build the prompt object; input variables are inferred from the placeholders.
prompt = PromptTemplate.from_template(
    template=template
)

In [21]:
def format_docs(retrieved_docs):
    """Render retrieved documents as plain text for the prompt's context slot.

    Each document is written as its metadata followed by its page content, and
    documents are separated by a blank line. This gives the model readable text
    plus the metadata it needs to cite a source, instead of the repr of a list
    of Document objects (escaped newlines and wrapper noise included).

    Args:
        retrieved_docs: Iterable of objects exposing ``metadata`` and
            ``page_content`` (the retriever's output).

    Returns:
        A single string; empty if no documents were retrieved.
    """
    return "\n\n".join(
        f"[metadata: {doc.metadata}]\n{doc.page_content}"
        for doc in retrieved_docs
    )


# RAG pipeline: the input question is sent to the retriever (whose results are
# formatted into text) and is also passed through unchanged as `question`;
# both fill the prompt, which is sent to the LLM and parsed to a plain string.
chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

In [22]:
# Ask the chain for the accuracy of each model; the answer string is displayed
# as the cell's output.
question = "각각 모델의 accuracy를 알려주세요"
chain.invoke(question)

'각 모델의 accuracy는 다음과 같습니다:\n\n- KNN: 0.873\n- XGB: 0.864\n- ADA: 0.861\n- CNN: 0.865\n- DNN: 0.866\n- Ensemble: 0.892\n- SVC: 0.853\n- LR: 0.851\n- Ridge: 0.861\n- RF: 0.850\n- BNB: 0.823\n- DT: 0.816\n- GNB: 0.778\n\n이 데이터는 diabetes 데이터를 기반으로 한 연구의 결과입니다[9].'