In [1]:
import os
import torch
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.chat_models import ChatOpenAI
from langchain.tools import Tool
from langchain.agents import initialize_agent, AgentType
from langchain.memory import ConversationBufferMemory
from langchain.document_loaders import JSONLoader
from dotenv import load_dotenv

# 추가된 import
from typing import TypedDict
from langchain_upstage import UpstageGroundednessCheck
from langgraph.graph import END, StateGraph
from langgraph.checkpoint.memory import MemorySaver
from langchain_core.runnables import RunnableConfig
from langgraph.errors import GraphRecursionError
# .env 파일 로드
load_dotenv()


True

In [2]:
# OpenAI API 키를 환경 변수에서 로드
openai_api_key = os.getenv("OPENAI_API_KEY")
# JSON 파일 경로 설정
file_path = 'merged_data.json'


In [3]:
# JSONLoader의 jq_schema 설정
jq_schema = """
    .[] | {
        "filename": .filename,
        "date": .date,
        "conference_number": .conference_number,
        "question_number": .question_number,
        "meeting_name": .meeting_name,
        "generation_number": .generation_number,
        "committee_name": .committee_name,
        "meeting_number": .meeting_number,
        "session_number": .session_number,
        "agenda": .agenda,
        "law": .law,
        "qna_type": .qna_type,
        "context": .context,
        "context_summary": {
            "summary_q": .context_summary.summary_q,
            "summary_a": .context_summary.summary_a
        },
        "questioner": {
            "name": .questioner_name,
            "affiliation": .questioner_affiliation,
            "position": .questioner_position
        },
        "question": {
            "comment": .question.comment,
            "keyword": .question.keyword
        },
        "answerer": {
            "name": .answerer_name,
            "affiliation": .answerer_affiliation,
            "position": .answerer_position
        },
        "answer": {
            "comment": .answer.comment,
            "keyword": .answer.keyword
        }
    }
"""

# JSON 파일 로드
loader = JSONLoader(file_path, jq_schema=jq_schema, text_content=False)
documents = loader.load()

In [4]:
# 문서 분할 및 임베딩 생성
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
split_docs = text_splitter.split_documents(documents)

# CUDA 사용 설정
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# 임베딩 모델 설정 (GPU 사용)
embedding_model_name = 'jhgan/ko-sroberta-multitask'
embedding_model = HuggingFaceEmbeddings(
    model_name=embedding_model_name,
    model_kwargs={'device': device}
)

# FAISS 인덱스 설정
index_path = 'faiss_index'

# VectorStore 생성 또는 로드
if os.path.exists(index_path):
    print("저장된 FAISS 인덱스를 로드합니다...")
    vectorstore = FAISS.load_local(
        index_path,
        embeddings=embedding_model,
        allow_dangerous_deserialization=True
    )
else:
    print("FAISS 인덱스를 생성합니다...")
    vectorstore = FAISS.from_documents(split_docs, embedding_model)
    vectorstore.save_local(index_path)

  embedding_model = HuggingFaceEmbeddings(
  from tqdm.autonotebook import tqdm, trange


FAISS 인덱스를 생성합니다...


In [11]:
# Retriever 생성
retriever = vectorstore.as_retriever(search_kwargs={"k": 6})

# 그라운드 체크 기능 설정
upstage_ground_checker = UpstageGroundednessCheck()

# GraphState 클래스 정의
class GraphState(TypedDict):
    question: str       # 질문
    context: str        # 문서의 검색 결과
    answer: str         # 답변
    relevance: str      # 답변의 문서에 대한 관련성
    recursion_count: int  # 재귀 횟수

In [12]:

# 도구 함수 정의
def json_search_tool(input_text):
    docs = retriever.get_relevant_documents(input_text)
    if docs:
        summaries = [doc.page_content for doc in docs]
        return '\n\n'.join(summaries)
    else:
        return "해당하는 정보를 찾을 수 없습니다."


# Tool Definition
json_search_tool = Tool(
    name="JSONSearch",
    func=json_search_tool,
    description=(
        "Use this tool to search for information about a specific person or issue from the JSON document. "
        "Provides a summary of fields such as conference number, meeting name, generation number, committee name, agenda, law, Q&A type, context, learning context, "
        "context summary, questioner, and question."
    )
)


In [13]:
# 도구 목록 생성
tools = [json_search_tool]

# LLM 정의
llm = ChatOpenAI(model_name="gpt-4", temperature=0, openai_api_key=openai_api_key)

# 메모리 설정
memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)

# 에이전트 초기화
agent = initialize_agent(
    tools,
    llm,
    agent=AgentType.CHAT_CONVERSATIONAL_REACT_DESCRIPTION,
    verbose=True,
    memory=memory,
)

In [14]:

# 시스템 프롬프트 설정
agent.agent.llm_chain.prompt.messages[0].prompt.template = (
    "Be sure to answer in Korean. "
    "You are an AI assistant that helps users find information about individuals or issues from the National Assembly JSON document. "
    "When a user asks about a specific person or issue, you should use the JSONSearch tool to find meeting information where that person is mentioned. "
    "Provide a summarized response including fields such as conference number, meeting name, generation number, committee name, agenda, law, Q&A type, context, learning context, context summary, questioner, and question. "
    "If you cannot find the information in the JSON document, politely inform the user that the information is not available. "
    "Do not mention the use of tools; just provide the necessary information."
)

In [15]:
# 그라운드 체크를 위한 함수 정의
def retrieve_document(state: GraphState) -> GraphState:
    # 재귀 횟수 증가 또는 초기화
    recursion_count = state.get('recursion_count', 0) + 1
    # 사용자의 질문에 따라 문서를 검색합니다.
    retrieved_docs = retriever.get_relevant_documents(state['question'])
    context = '\n\n'.join([doc.page_content for doc in retrieved_docs])
    # 기존의 'context'를 업데이트합니다.
    state.update({
        'context': context,
        'recursion_count': recursion_count
    })
    return state

def llm_answer(state: GraphState) -> GraphState:
    # 에이전트를 사용하여 답변을 생성합니다.
    answer = agent.run(input=state['question'])
    state.update({'answer': answer})
    return state

def relevance_check(state: GraphState) -> GraphState:
    # 그라운드 체크를 수행합니다.
    response = upstage_ground_checker.run(
        {"context": state['context'], "answer": state['answer']}
    )
    state.update({'relevance': response})
    return state

def is_relevant(state: GraphState) -> str:
    if state.get('relevance') == "grounded":
        return "관련성 O"
    elif state.get('relevance') == "notGrounded":
        return "관련성 X"
    else:
        return "확인불가"

# 그래프 워크플로우 정의
workflow = StateGraph(GraphState)

# 노드 추가
workflow.add_node("retrieve", retrieve_document)
workflow.add_node("llm_answer", llm_answer)
workflow.add_node("relevance_check", relevance_check)

# 엣지 연결
workflow.add_edge("retrieve", "llm_answer")
workflow.add_edge("llm_answer", "relevance_check")

# 조건부 엣지 설정
workflow.add_conditional_edges(
    "relevance_check",
    is_relevant,
    {
        "관련성 O": END,
        "관련성 X": "retrieve",
        "확인불가": "retrieve",
    },
)

# 시작점 설정
workflow.set_entry_point("retrieve")

# 메모리 저장소 설정
memory_saver = MemorySaver()

# 워크플로우 컴파일
app = workflow.compile(checkpointer=memory_saver)

# 에이전트와 대화하는 함수 수정
def chat_with_agent(user_input):
    # 초기 상태 설정
    state = GraphState(
        question=user_input,
        context='',
        answer='',
        relevance='',
        recursion_count=0
    )
    config = RunnableConfig(recursion_limit=20, configurable={"thread_id": "SELF-RAG"})
    output = app.invoke(state, config=config)
    recursion_count = output.get('recursion_count', 0)
    relevance = output.get('relevance', '')
    # 최종 답변에 재귀 횟수와 그라운드 여부를 포함
    final_answer = output.get('answer', '')
    final_answer += f"\n\n총 재귀 횟수: {recursion_count}"
    if relevance == "grounded":
        final_answer += "\n그라운드 여부: O (관련 있음)"
    elif relevance == "notGrounded":
        final_answer += "\n그라운드 여부: X (관련 없음)"
    else:
        final_answer += "\n그라운드 여부: 확인 불가"
    return final_answer

In [16]:


# 예시 질문
user_input = "김영삼 대통령이 언급된 회의에 대한 정보를 알려주세요. 반드시 안건 이름과 날짜, 어떤 회의였는지, 몇 대 국회였는지 알려주세요. 질문자와 답변자에 대한 정보와 어떤 안건에 대해 이야기가 오고 갔는지 대화문도 알려주세요."
response = chat_with_agent(user_input)
print("Assistant:", response)



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m```json
{
    "action": "JSONSearch",
    "action_input": "김영삼"
}
```[0m
Observation: [36;1m[1;3m1983\ub144, 1997\ub144 \ub2e4 \ud5c8\uac00 \ub9e1\uc740 \ub545\uc785\ub2c8\ub2e4, \uadf8\uac8c. \ub2e4\ub9cc \ub18d\uc9c0\uc5d0\uc11c \uacf5\uc7a5\uc6a9\uc9c0 \ubcc0\uacbd\ub418\uc9c0 \uc54a\uc740 \uac83 \uadf8\uac83\uc740 \uc9c0\uae08 \uacbd\uae30\ub3c4\uc5d0\uc11c \ud558\uace0 \uc788\uc5b4\uc694.   \ub0b4\uc6a9\uc744 \uc880 \uc544\uc2dc\uace0 \ub098\uc624\uc154\uc57c\u2026\u2026 \uc81c\uac00 \ud658\uacbd\ubd80\uc7a5\uad00\uc744 \ud588\uc5c8\uae30 \ub54c\ubb38\uc5d0 \uc774 \ubd80\ubd84\uc5d0 \ub300\ud574\uc11c\ub294 \uc790\uc138\ud558\uac8c \uc54c\uace0 \uc788\uc2b5\ub2c8\ub2e4. \uadf8\ub798\uc11c \uc77c\ub2e8 \uc800\ud76c\ub4e4\uc774\u2026\u2026 \uc544\ub2c8, \ud658\uacbd\ubd80\uc7a5\uad00\uc744 \ud558\uc2e0 \uac83\uc774 \uc544\ub2c8\ub77c \ud558\uc774\ub2c9\uc2a4\ubc18\ub3c4\uccb4\uc5d0 \ub300\ud574\uc11c \ub9d0\uc500\ub4dc\

In [17]:
# 예시 질문
user_input = "디테일한 대화내용에 대해 알려줘"
response = chat_with_agent(user_input)
print("Assistant:", response)



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m```json
{
    "action": "JSONSearch",
    "action_input": "김영삼"
}
```[0m
Observation: [36;1m[1;3m1983\ub144, 1997\ub144 \ub2e4 \ud5c8\uac00 \ub9e1\uc740 \ub545\uc785\ub2c8\ub2e4, \uadf8\uac8c. \ub2e4\ub9cc \ub18d\uc9c0\uc5d0\uc11c \uacf5\uc7a5\uc6a9\uc9c0 \ubcc0\uacbd\ub418\uc9c0 \uc54a\uc740 \uac83 \uadf8\uac83\uc740 \uc9c0\uae08 \uacbd\uae30\ub3c4\uc5d0\uc11c \ud558\uace0 \uc788\uc5b4\uc694.   \ub0b4\uc6a9\uc744 \uc880 \uc544\uc2dc\uace0 \ub098\uc624\uc154\uc57c\u2026\u2026 \uc81c\uac00 \ud658\uacbd\ubd80\uc7a5\uad00\uc744 \ud588\uc5c8\uae30 \ub54c\ubb38\uc5d0 \uc774 \ubd80\ubd84\uc5d0 \ub300\ud574\uc11c\ub294 \uc790\uc138\ud558\uac8c \uc54c\uace0 \uc788\uc2b5\ub2c8\ub2e4. \uadf8\ub798\uc11c \uc77c\ub2e8 \uc800\ud76c\ub4e4\uc774\u2026\u2026 \uc544\ub2c8, \ud658\uacbd\ubd80\uc7a5\uad00\uc744 \ud558\uc2e0 \uac83\uc774 \uc544\ub2c8\ub77c \ud558\uc774\ub2c9\uc2a4\ubc18\ub3c4\uccb4\uc5d0 \ub300\ud574\uc11c \ub9d0\uc500\ub4dc\