In [1]:
from dotenv import load_dotenv
# Load settings from a local .env file before any API client is created.
# NOTE(review): presumably this supplies the OpenAI API key used by the
# embedding and chat models in later cells — confirm the .env contents.
load_dotenv()

True

In [2]:
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_community.document_loaders import PDFPlumberLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [3]:
# Read the source PDF with PDFPlumberLoader; `docs` holds the loaded Document
# objects (apparently one per page — the metadata reports 'total_pages': 6).
loader = PDFPlumberLoader("./data/diabetes draft7.pdf")
docs = loader.load()

In [4]:
# Inspect the metadata of the document at index 5 (its 'page' field is 5)
# before anything is added to it.
docs[5].metadata

{'source': './data/diabetes draft7.pdf',
 'file_path': './data/diabetes draft7.pdf',
 'page': 5,
 'total_pages': 6,
 'Author': '@¬8',
 'CreationDate': "D:20240726144349+09'00'",
 'ModDate': "D:20240726144349+09'00'",
 'Producer': 'Microsoft: Print To PDF',
 'Title': 'diabetes draft7.hwp'}

In [5]:
import json

# ROC-AUC score per model, written once as (model, score) pairs.
# NOTE(review): presumably transcribed by hand from the results table on the
# page at index 5 of the PDF — confirm the values against the paper.
_roc_auc_by_model = [
    ("ADA", 0.51),
    ("LR", 0.50),
    ("XGB", 0.51),
    ("RF", 0.50),
    ("KNN", 0.50),
    ("SVC", 0.50),
    ("BNB", 0.54),
    ("Ridge", 0.51),
    ("DT", 0.54),
    ("GNB", 0.62),
    ("DNN", 0.81),
    ("RNN+CNN", 0.83),
]

# Table structure: a header list plus one dict per row keyed by those headers.
table_data = {
    "headers": ["Model", "ROC-AUC"],
    "rows": [
        {"Model": model, "ROC-AUC": score}
        for model, score in _roc_auc_by_model
    ],
}

# Serialize the whole table to a single JSON string so it can be stored as one
# metadata value.
table_data_str = json.dumps(table_data)

In [6]:
# Attach the serialized table to the metadata of the document at index 5.
# NOTE(review): the index is hard-coded; this assumes docs[5] is the page that
# contains the ROC-AUC table — confirm if the PDF changes.
docs[5].metadata['table'] = table_data_str

In [7]:
# Re-inspect the same metadata to confirm the new 'table' entry is present.
docs[5].metadata

{'source': './data/diabetes draft7.pdf',
 'file_path': './data/diabetes draft7.pdf',
 'page': 5,
 'total_pages': 6,
 'Author': '@¬8',
 'CreationDate': "D:20240726144349+09'00'",
 'ModDate': "D:20240726144349+09'00'",
 'Producer': 'Microsoft: Print To PDF',
 'Title': 'diabetes draft7.hwp',
 'table': '{"headers": ["Model", "ROC-AUC"], "rows": [{"Model": "ADA", "ROC-AUC": 0.51}, {"Model": "LR", "ROC-AUC": 0.5}, {"Model": "XGB", "ROC-AUC": 0.51}, {"Model": "RF", "ROC-AUC": 0.5}, {"Model": "KNN", "ROC-AUC": 0.5}, {"Model": "SVC", "ROC-AUC": 0.5}, {"Model": "BNB", "ROC-AUC": 0.54}, {"Model": "Ridge", "ROC-AUC": 0.51}, {"Model": "DT", "ROC-AUC": 0.54}, {"Model": "GNB", "ROC-AUC": 0.62}, {"Model": "DNN", "ROC-AUC": 0.81}, {"Model": "RNN+CNN", "ROC-AUC": 0.83}]}'}

In [8]:
# Configure the splitter: chunk_size=1000 with chunk_overlap=50 between
# neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=50,
)

# Split every loaded document into chunks; `documents` is the resulting list
# that gets embedded and indexed later.
documents = text_splitter.split_documents(docs)

In [9]:
# Number of chunks produced by the splitter.
len(documents)

10

In [10]:
# Check a chunk that came from page 5: its metadata should still carry the
# 'table' entry that was added to the parent document.
documents[8].metadata

{'source': './data/diabetes draft7.pdf',
 'file_path': './data/diabetes draft7.pdf',
 'page': 5,
 'total_pages': 6,
 'Author': '@¬8',
 'CreationDate': "D:20240726144349+09'00'",
 'ModDate': "D:20240726144349+09'00'",
 'Producer': 'Microsoft: Print To PDF',
 'Title': 'diabetes draft7.hwp',
 'table': '{"headers": ["Model", "ROC-AUC"], "rows": [{"Model": "ADA", "ROC-AUC": 0.51}, {"Model": "LR", "ROC-AUC": 0.5}, {"Model": "XGB", "ROC-AUC": 0.51}, {"Model": "RF", "ROC-AUC": 0.5}, {"Model": "KNN", "ROC-AUC": 0.5}, {"Model": "SVC", "ROC-AUC": 0.5}, {"Model": "BNB", "ROC-AUC": 0.54}, {"Model": "Ridge", "ROC-AUC": 0.51}, {"Model": "DT", "ROC-AUC": 0.54}, {"Model": "GNB", "ROC-AUC": 0.62}, {"Model": "DNN", "ROC-AUC": 0.81}, {"Model": "RNN+CNN", "ROC-AUC": 0.83}]}'}

In [11]:
# Embedding model used to vectorize every chunk (and, later, each query).
embedding = OpenAIEmbeddings(model='text-embedding-3-small')

# Give each chunk a deterministic id based on its position. Without explicit
# ids, re-running this cell in the same kernel can add the same chunks again
# to the in-memory default collection, so retrieval returns duplicates.
# NOTE(review): relies on Chroma overwriting entries whose id already exists —
# confirm against the installed langchain/chromadb versions.
chunk_ids = [f"chunk-{i}" for i in range(len(documents))]

# Embed the chunks and index them (text + metadata) in a Chroma vector store.
vector_store = Chroma.from_documents(documents, embedding, ids=chunk_ids)


In [12]:
# Expose the vector store as a retriever, using its default search settings
# (no arguments are passed).
retriever = vector_store.as_retriever()

In [13]:
# Sanity-check retrieval on its own with a Korean question asking for the ADA
# model's ROC-AUC value. In the recorded output the first hit is a page-2
# chunk; the page-5 chunk carrying the 'table' metadata comes second.
retriever.invoke("본 논문에서 제시하는 ada모델의 roc-auc값을 알려줘")

[Document(metadata={'Author': '@¬8', 'CreationDate': "D:20240726144349+09'00'", 'ModDate': "D:20240726144349+09'00'", 'Producer': 'Microsoft: Print To PDF', 'Title': 'diabetes draft7.hwp', 'file_path': './data/diabetes draft7.pdf', 'page': 2, 'source': './data/diabetes draft7.pdf', 'total_pages': 6}, page_content='. 기존연구\n. 분석 알고리즘\nFig3.ColumnsCorrelation\nFig2.KNNAlgorithm'),
 Document(metadata={'Author': '@¬8', 'CreationDate': "D:20240726144349+09'00'", 'ModDate': "D:20240726144349+09'00'", 'Producer': 'Microsoft: Print To PDF', 'Title': 'diabetes draft7.hwp', 'file_path': './data/diabetes draft7.pdf', 'page': 5, 'source': './data/diabetes draft7.pdf', 'table': '{"headers": ["Model", "ROC-AUC"], "rows": [{"Model": "ADA", "ROC-AUC": 0.51}, {"Model": "LR", "ROC-AUC": 0.5}, {"Model": "XGB", "ROC-AUC": 0.51}, {"Model": "RF", "ROC-AUC": 0.5}, {"Model": "KNN", "ROC-AUC": 0.5}, {"Model": "SVC", "ROC-AUC": 0.5}, {"Model": "BNB", "ROC-AUC": 0.54}, {"Model": "Ridge", "ROC-AUC": 0.51}, {"Model

In [14]:
from langchain_core.prompts import PromptTemplate
from langchain_openai import ChatOpenAI
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
# Prompt for the QA chatbot (written in Korean). It instructs the model to
# answer using the supplied context, to say so when it does not know, and to
# state the source of its answer. {context} and {question} are filled in at
# invocation time.
template = """
당신은 주어진 context를 이용해 답변을 하는 qa챗봇입니다.
모르면 모른다고 답변해 주세요
답변의 출처를 적어주세요
#Context
{context}
#Question
{question}
#Answer
"""
prompt = PromptTemplate.from_template(template)

# Chat model that generates the final answer.
llm = ChatOpenAI(model="gpt-4o-mini")

# RAG chain: the input question goes both to the retriever (whose result fills
# {context}) and, unchanged, to {question}; the rendered prompt is sent to the
# LLM and the reply is reduced to a plain string.
# NOTE(review): the retriever result is passed to the prompt without a
# formatting step; presumably this is deliberate so that document metadata
# (including the 'table' entry) reaches the model — confirm before adding a
# formatter that keeps only page_content.
chain = (
    {"context": retriever, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)



In [15]:
# Ask the full chain (in Korean) for the ADA model's ROC-AUC value; the
# recorded answer is 0.51 and cites page 5 of the PDF.
chain.invoke("본 논문에서 제시하는 ADA모델의 roc-auc 값은 얼마야?")

'본 논문에서 제시하는 ADA 모델의 ROC-AUC 값은 0.51입니다. \n\n출처: [diabetes draft7.pdf, 페이지 5]'

In [16]:
# Ask the chain (in Korean) for the ROC-AUC values of all models; the recorded
# answer lists all twelve rows of the table and cites page 5.
chain.invoke("본 논문에서 제시하는 모델들의 roc-auc 값들을 알려줘?")

'논문에서 제시하는 모델들의 ROC-AUC 값은 다음과 같습니다:\n\n- ADA: 0.51\n- LR: 0.5\n- XGB: 0.51\n- RF: 0.5\n- KNN: 0.5\n- SVC: 0.5\n- BNB: 0.54\n- Ridge: 0.51\n- DT: 0.54\n- GNB: 0.62\n- DNN: 0.81\n- RNN+CNN: 0.83\n\n출처: 문서에서 제공된 표 (페이지 5).'