In [1]:
import pandas as pd

In [2]:
df = pd.read_csv("outputs/synthetic_out.csv")

In [3]:
df

Unnamed: 0.1,Unnamed: 0,query,answers,evidence,page_number
0,0,What are three factual statements about the pu...,1) The work is published under the responsibil...,This work is published under the responsibilit...,4
1,1,Which entity does Turkiye recognise as the Tur...,Turkiye recognises the Turkish Republic of Nor...,Note by the Republic of Turkiye\n\nThe informa...,4
2,2,What is the ISBN for the PDF version of the OE...,ISBN 978-92-64-57606-3 (PDF),content : ISBN 978-92-64-57606-3 (PDF) \nISBN...,4
3,3,List three licensing and arbitration condition...,1) It is an adaptation of an original work by ...,Adaptations- you must cite the original work a...,4
4,4,What are three atomic facts that can be extrac...,The page includes the header 'Table of content...,I 3\n\nI 3\n \n\nTable of contents\n=========...,5
...,...,...,...,...,...
102,102,What are the 3 2050 targets described in the d...,"- By 2050, governance in G20 emerging-market e...","economies by 2050, 2) improvements in governan...",23
103,103,What is the title of the document on page 23?,1) Title: FINDING THE RIGHT BALANCE IN UNCERTA...,FINDING THE RIGHT BALANCE IN UNCERTAIN TIMES ¬©...,23
104,104,What are three key facts shown on page 24 of t...,Fact 1: Document title is OECD Economic Outloo...,"OECD Economic Outlook, Interim Report\n=======...",24
105,105,What is expected for global growth after the f...,Global growth was more resilient than anticipa...,Global growth is expected to moderate as the f...,24


In [7]:
from dotenv import load_dotenv
load_dotenv()

from langchain_openai import ChatOpenAI, OpenAIEmbeddings

llm = ChatOpenAI(model_name="gpt-5-nano", temperature=0)


In [9]:
import pickle
from langchain.schema import Document
from langchain.retrievers import BM25Retriever, EnsembleRetriever
from langchain_community.vectorstores import FAISS


In [49]:
with open("outputs/split_documents.pkl", "rb") as f:
    split_documents = pickle.load(f)

In [50]:
split_documents[0]



In [56]:
def format_context(results: list[Document]) -> str:
    arr = []
    for i, doc in enumerate(results):
        text = f"""Retrieved #{i+1}
                    {doc.page_content}
                    {doc.metadata['text_summary'] if len(doc.metadata['text_summary']) > 0 else ''}
                    {doc.metadata['image_summary'] if len(doc.metadata['image_summary']) > 0 else ''}
                    chunk_id: {doc.metadata['id']}
                    page number: {doc.metadata['page']}
                    
                """
        arr.append(text)
    return "\n".join(arr)

In [51]:
from pydantic import BaseModel, Field
class RetrievalResult(BaseModel):
    """Retrieval result with details."""
    query: str = Field(..., description="The query of the data")
    answers: str = Field(..., description="The answers of the data")
    chunk_id: str = Field(..., description="The chunk id of the evidence")
    page_number: str = Field(..., description="The page number of the evidence")
    evidence: str = Field(..., description="The evidence of the data")

In [60]:
with open("outputs/split_documents.pkl", "rb") as f:
    split_documents = pickle.load(f)
vectorstore = FAISS.load_local(
    "faiss_index", 
    OpenAIEmbeddings(),
    allow_dangerous_deserialization=True  # needed in newer versions
)
faiss_retriever = vectorstore.as_retriever(search_kwargs={"k": 1})
bm25_retriever = BM25Retriever.from_documents(
    split_documents,
)
bm25_retriever.k = 1
ensemble_retriever = EnsembleRetriever(
    retrievers=[bm25_retriever, faiss_retriever],
    weights=[0.7, 0.3],
)
from typing import Annotated, TypedDict
from langgraph.graph.message import add_messages

# GraphState ÏÉÅÌÉú Ï†ïÏùò
class GraphState(TypedDict):
    question: Annotated[str, "Question"]  # ÏßàÎ¨∏
    context: Annotated[str, "Context"]  # Î¨∏ÏÑúÏùò Í≤ÄÏÉâ Í≤∞Í≥º
    answer: Annotated[RetrievalResult, "Answer"]  # ÎãµÎ≥Ä
    messages: Annotated[list, add_messages]  # Î©îÏãúÏßÄ(ÎàÑÏ†ÅÎêòÎäî list)
# Î¨∏ÏÑú Í≤ÄÏÉâ ÎÖ∏Îìú
def retrieve_document(state: GraphState) -> GraphState:
    # ÏßàÎ¨∏ÏùÑ ÏÉÅÌÉúÏóêÏÑú Í∞ÄÏ†∏ÏòµÎãàÎã§.
    latest_question = state["question"]

    # Î¨∏ÏÑúÏóêÏÑú Í≤ÄÏÉâÌïòÏó¨ Í¥ÄÎ†®ÏÑ± ÏûàÎäî Î¨∏ÏÑúÎ•º Ï∞æÏäµÎãàÎã§.
    retrieved_docs = ensemble_retriever.invoke(latest_question)

    # Í≤ÄÏÉâÎêú Î¨∏ÏÑúÎ•º ÌòïÏãùÌôîÌï©ÎãàÎã§.(ÌîÑÎ°¨ÌîÑÌä∏ ÏûÖÎ†•ÏúºÎ°ú ÎÑ£Ïñ¥Ï£ºÍ∏∞ ÏúÑÌï®)
    context = format_context(retrieved_docs)

    # Í≤ÄÏÉâÎêú Î¨∏ÏÑúÎ•º context ÌÇ§Ïóê Ï†ÄÏû•Ìï©ÎãàÎã§.
    return {"context": context}


# ÎãµÎ≥Ä ÏÉùÏÑ± ÎÖ∏Îìú
def llm_answer(state: GraphState) -> GraphState:
    # ÏßàÎ¨∏ÏùÑ ÏÉÅÌÉúÏóêÏÑú Í∞ÄÏ†∏ÏòµÎãàÎã§.
    latest_question = state["question"]

    # Í≤ÄÏÉâÎêú Î¨∏ÏÑúÎ•º ÏÉÅÌÉúÏóêÏÑú Í∞ÄÏ†∏ÏòµÎãàÎã§.
    context = state["context"]
   
    llm = ChatOpenAI(model_name="gpt-5-mini", temperature=0)
      
    system_prompt = """You are an assistant for question-answering tasks. 
        Use the following pieces of retrieved context to answer the question. 
        If you don't know the answer, just say that you don't know. 
        Answer in Korean.
    """
    
    prompt = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": "Context: " + context},
        {"role": "user", "content": "Question: " + latest_question},
    ]
        
    # Ï≤¥Ïù∏ÏùÑ Ìò∏Ï∂úÌïòÏó¨ ÎãµÎ≥ÄÏùÑ ÏÉùÏÑ±Ìï©ÎãàÎã§.
    model_with_structure = llm.with_structured_output(RetrievalResult)
    response = model_with_structure.invoke(prompt)
    print("response", response)
    # ÏÉùÏÑ±Îêú ÎãµÎ≥Ä, (Ïú†Ï†ÄÏùò ÏßàÎ¨∏, ÎãµÎ≥Ä) Î©îÏãúÏßÄÎ•º ÏÉÅÌÉúÏóê Ï†ÄÏû•Ìï©ÎãàÎã§.
    return {
        "answer": response,
        "messages": [("user", latest_question), ("assistant", response)],
    }
from langgraph.graph import END, StateGraph
from langgraph.checkpoint.memory import MemorySaver

# Í∑∏ÎûòÌîÑ ÏÉùÏÑ±
workflow = StateGraph(GraphState)

# ÎÖ∏Îìú Ï†ïÏùò
workflow.add_node("retrieve", retrieve_document)
workflow.add_node("llm_answer", llm_answer)

# Ïó£ÏßÄ Ï†ïÏùò
workflow.add_edge("retrieve", "llm_answer")  # Í≤ÄÏÉâ -> ÎãµÎ≥Ä
workflow.add_edge("llm_answer", END)  # ÎãµÎ≥Ä -> Ï¢ÖÎ£å

# Í∑∏ÎûòÌîÑ ÏßÑÏûÖÏ†ê ÏÑ§Ï†ï
workflow.set_entry_point("retrieve")

# Ï≤¥ÌÅ¨Ìè¨Ïù∏ÌÑ∞ ÏÑ§Ï†ï
memory = MemorySaver()

# Ïª¥ÌååÏùº
app = workflow.compile(checkpointer=memory)


In [61]:
from langchain_core.runnables import RunnableConfig
from langchain_teddynote.messages import invoke_graph, stream_graph, random_uuid

# config ÏÑ§Ï†ï(Ïû¨Í∑Ä ÏµúÎåÄ ÌöüÏàò, thread_id)
config = RunnableConfig(recursion_limit=20, configurable={"thread_id": random_uuid()})

# ÏßàÎ¨∏ ÏûÖÎ†•
inputs = GraphState(question="ÌÑ∞ÌÇ§Í∞Ä ÏßÅÎ©¥Ìïú ÏúÑÍ∏∞ÏôÄ ÎèÑÏ†ÑÏùÑ ÏïåÎ†§Ï£ºÏÑ∏Ïöî.")

# Í∑∏ÎûòÌîÑ Ïã§Ìñâ
invoke_graph(app, inputs, config)


üîÑ Node: [1;36mretrieve[0m üîÑ
- - - - - - - - - - - - - - - - - - - - - - - - - 
[1;32mcontext[0m:
Retrieved #1
                    The Interim Report says that countries need to find ways of engaging co‚Äëoperatively within the global trading system  
and working together to make trade policy more transparent and predictable. Central banks should remain vigilant, but  
can lower policy interest rates where underlying inflation is projected to moderate towards target, provided inflation  
expectations remain well anchored. Fiscal discipline is needed to safeguard longer‚Äëterm debt sustainability and allow  
governments to react to future shocks. Enhanced structural reform efforts are required to durably improve living  
standards and help realise the potential from new technologies, such as artificial intelligence.

  

The Interim Report is an update on the assessment in the June 2025 of the Economic Outlook (Volume 2025 Issue 1).

PDF ISBN 978-92-64-57606-3

  

9HSTCQE\*fh

ValidationError: 11 validation errors for AIMessage
content.str
  Input should be a valid string [type=string_type, input_value=RetrievalResult(query='...ficial intelligence.\''), input_type=RetrievalResult]
    For further information visit https://errors.pydantic.dev/2.12/v/string_type
content.list[union[str,dict[any,any]]].0.str
  Input should be a valid string [type=string_type, input_value=('query', 'ÌÑ∞ÌÇ§Í∞Ä ÏßÅÎ©¥Ìïú ÏúÑÍ∏∞ÏôÄ ÎèÑÏ†Ñ'), input_type=tuple]
    For further information visit https://errors.pydantic.dev/2.12/v/string_type
content.list[union[str,dict[any,any]]].0.dict[any,any]
  Input should be a valid dictionary [type=dict_type, input_value=('query', 'ÌÑ∞ÌÇ§Í∞Ä ÏßÅÎ©¥Ìïú ÏúÑÍ∏∞ÏôÄ ÎèÑÏ†Ñ'), input_type=tuple]
    For further information visit https://errors.pydantic.dev/2.12/v/dict_type
content.list[union[str,dict[any,any]]].1.str
  Input should be a valid string [type=string_type, input_value=('answers', 'Í≤ÄÏÉâÎêú ...Î°ú Ï†úÏãúÎê©ÎãàÎã§.'), input_type=tuple]
    For further information visit https://errors.pydantic.dev/2.12/v/string_type
content.list[union[str,dict[any,any]]].1.dict[any,any]
  Input should be a valid dictionary [type=dict_type, input_value=('answers', 'Í≤ÄÏÉâÎêú ...Î°ú Ï†úÏãúÎê©ÎãàÎã§.'), input_type=tuple]
    For further information visit https://errors.pydantic.dev/2.12/v/dict_type
content.list[union[str,dict[any,any]]].2.str
  Input should be a valid string [type=string_type, input_value=('chunk_id', '3;108'), input_type=tuple]
    For further information visit https://errors.pydantic.dev/2.12/v/string_type
content.list[union[str,dict[any,any]]].2.dict[any,any]
  Input should be a valid dictionary [type=dict_type, input_value=('chunk_id', '3;108'), input_type=tuple]
    For further information visit https://errors.pydantic.dev/2.12/v/dict_type
content.list[union[str,dict[any,any]]].3.str
  Input should be a valid string [type=string_type, input_value=('page_number', '4;24'), input_type=tuple]
    For further information visit https://errors.pydantic.dev/2.12/v/string_type
content.list[union[str,dict[any,any]]].3.dict[any,any]
  Input should be a valid dictionary [type=dict_type, input_value=('page_number', '4;24'), input_type=tuple]
    For further information visit https://errors.pydantic.dev/2.12/v/dict_type
content.list[union[str,dict[any,any]]].4.str
  Input should be a valid string [type=string_type, input_value=('evidence', 'Chunk 3 (pa...ficial intelligence.\''), input_type=tuple]
    For further information visit https://errors.pydantic.dev/2.12/v/string_type
content.list[union[str,dict[any,any]]].4.dict[any,any]
  Input should be a valid dictionary [type=dict_type, input_value=('evidence', 'Chunk 3 (pa...ficial intelligence.\''), input_type=tuple]
    For further information visit https://errors.pydantic.dev/2.12/v/dict_type