In [1]:
# Configuration so that API keys are managed through environment variables
from dotenv import load_dotenv

# Load the API key information
load_dotenv()

True

In [2]:
# Configure LangSmith tracing. https://smith.langchain.com
# !pip install -qU langchain-teddynote
from langchain_teddynote import logging

# Enter the project name
logging.langsmith("ChatBot_Project")

LangSmith 추적을 시작합니다.
[프로젝트명]
ChatBot_Project


In [3]:
import os

# Set the LANGCHAIN_PROJECT environment variable to the project name.
# NOTE(review): this is the same name passed to logging.langsmith(...) earlier;
# presumably that helper already sets it — confirm whether this cell is redundant.
os.environ['LANGCHAIN_PROJECT'] = 'ChatBot_Project'

In [4]:
# 필요한 모듈 import
import os
import faiss
from tqdm import tqdm
from langchain_core.prompts import ChatPromptTemplate
from langchain_community.vectorstores import FAISS
from langchain_community.chat_message_histories import ChatMessageHistory
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer  # Hugging Face 모델 사용
import torch
from langchain_openai import ChatOpenAI  # ChatOpenAI import
from langchain.tools.retriever import create_retriever_tool
from langchain.agents import create_tool_calling_agent, AgentExecutor
from langchain_core.runnables.history import RunnableWithMessageHistory
from langchain_teddynote.messages import AgentStreamParser
from langchain.document_loaders import JSONLoader
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain.embeddings import OpenAIEmbeddings  # OpenAIEmbeddings 임포트

# Helper functions for saving and loading the vector index
def save_faiss_index(index, filename):
    """Write ``index`` to the file ``filename`` by calling ``faiss.write_index``.

    Only the object passed as ``index`` is written; nothing else is persisted here.
    """
    faiss.write_index(index, filename)

def load_faiss_index(filename):
    """Return whatever ``faiss.read_index`` loads from the file ``filename``."""
    return faiss.read_index(filename)

# Path of each JSON file, keyed by the Korean meeting-type name, together with
# a short English description that is later appended to the tool description.
file_paths = dict([
    ("국정감사", dict(
        path='국정감사.json',
        description="This file contains information and records related to the National Audit.",
    )),
    ("본회의", dict(
        path='본회의.json',
        description="This file contains details and proceedings from the Plenary Session.",
    )),
    ("소위원회", dict(
        path='소위원회.json',
        description="This file includes records and discussions from the Subcommittee meetings.",
    )),
    ("예산결산특별위원회", dict(
        path='예산결산특별위원회.json',
        description="This file contains information regarding the Special Committee on Budget and Settlement.",
    )),
    ("특별위원회", dict(
        path='특별위원회.json',
        description="This file contains records and details from the Special Committee meetings.",
    )),
])

# Korean meeting-type name -> English identifier (used for tool names and
# index file names further down).
committee_name_mapping = dict([
    ("국정감사", "National_Audit"),
    ("본회의", "Plenary_Session"),
    ("소위원회", "Subcommittee"),
    ("예산결산특별위원회", "Budget_Special_Committee"),
    ("특별위원회", "Special_Committee"),
])

# jq_schema passed to JSONLoader: emits one object per record of the top-level
# array, copying the listed fields.
# Fix: "answerer_position" previously read `.answerer.position` (a lookup through
# a non-existent `.answerer` object) instead of the flat `.answerer_position`
# field, unlike every sibling `answerer_*` / `questioner_*` field.
jq_schema = """
.[] | {
    "filename": .filename,
    "original": .original,
    "id": .id,
    "date": .date,
    "conference_number": .conference_number,
    "question_number": .question_number,
    "meeting_name": .meeting_name,
    "generation_number": .generation_number,
    "committee_name": .committee_name,
    "meeting_number": .meeting_number,
    "session_number": .session_number,
    "agenda": .agenda,
    "law": .law,
    "qna_type": .qna_type,
    "context": .context,
    "context_learn": .context_learn,
    "context_summary": {
        "summary_q": .context_summary.summary_q,
        "summary_a": .context_summary.summary_a
    },
    "questioner_name": .questioner_name,
    "questioner_ID": .questioner_ID,
    "questioner_ISNI": .questioner_ISNI,
    "questioner_affiliation": .questioner_affiliation,
    "questioner_position": .questioner_position,
    "question": {
        "tag": .question.tag,
        "comment": .question.comment,
        "keyword": .question.keyword
    },
    "answerer_name": .answerer_name,
    "answerer_ID": .answerer_ID,
    "answerer_ISNI": .answerer_ISNI,
    "answerer_affiliation": .answerer_affiliation,
    "answerer_position": .answerer_position,
    "answer": {
        "tag": .answer.tag,
        "comment": .answer.comment,
        "keyword": .answer.keyword
    }
}
"""

# For each file, create a JSONLoader, splitter, vector store and tool
retriever_tools = {}
device = 'cuda' if torch.cuda.is_available() else 'cpu'  # Check whether a GPU is available
# NOTE(review): `device` is not referenced by any code visible in this part of
# the notebook — confirm whether it is still needed.

# Create the OpenAIEmbeddings object
embeddings = OpenAIEmbeddings()

# Build one retriever tool per meeting type: load the JSON file, split it into
# chunks, obtain a FAISS vector store (from the cached index file when it is
# usable, otherwise by embedding the chunks), and wrap its retriever as a tool.
for committee_name, file_info in tqdm(file_paths.items()):
    english_name = committee_name_mapping[committee_name]
    file_path = file_info["path"]

    # Load the file with JSONLoader
    loader = JSONLoader(file_path, jq_schema=jq_schema, text_content=False)
    documents = loader.load()

    # Split the documents into chunks with the text splitter
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
    split_docs = text_splitter.split_documents(documents)

    # Path of the cached vector index file
    vectorstore_file = f'{english_name}_faiss_openai_index.index'

    vector = None
    if os.path.exists(vectorstore_file):
        index = load_faiss_index(vectorstore_file)
        # Only the raw index is persisted (see save_faiss_index below), so the
        # docstore and the position -> id mapping are rebuilt from split_docs.
        # That is only valid when the cached index holds exactly one vector per
        # chunk; otherwise fall through and rebuild the store.
        if index.ntotal == len(split_docs):
            doc_ids = [str(position) for position in range(len(split_docs))]
            # Fix: the FAISS constructor requires index_to_docstore_id; the
            # previous call omitted it, so every cache hit raised TypeError.
            vector = FAISS(
                embedding_function=embeddings.embed_query,
                index=index,
                docstore=InMemoryDocstore(dict(zip(doc_ids, split_docs))),
                index_to_docstore_id=dict(enumerate(doc_ids)),
            )

    if vector is None:
        # Create the vector store from the documents (uses OpenAIEmbeddings)
        vector = FAISS.from_documents(split_docs, embeddings)

        # Save the FAISS index to a file
        save_faiss_index(vector.index, vectorstore_file)

    # Create the retriever
    retriever = vector.as_retriever()

    # Define the retriever tool. The description fragments end with a space so
    # the implicitly concatenated sentences do not run together.
    retriever_tool = create_retriever_tool(
        retriever,
        name=f"{english_name}_search",
        description=f"use this tool to search information from the {committee_name} JSON document. "
                    f"the {english_name} is related with meeting_name as JSON document's key. "
                    f"{file_info['description']}"
    )

    # Store the tool in the dictionary
    retriever_tools[committee_name] = retriever_tool

### 1-3. Collect the tools into the tools list ###
tools = list(retriever_tools.values())


########## 2. Define the LLM ##########
llm = ChatOpenAI(model="gpt-4o", temperature=0)  # Define the ChatOpenAI model

  from tqdm.autonotebook import tqdm, trange
  embeddings = OpenAIEmbeddings()
100%|██████████| 5/5 [53:23<00:00, 640.76s/it]   


In [7]:
########## 3. Define the prompt ##########
# Message order: system instructions, prior chat history, the user's input,
# then the agent scratchpad (tool calls and observations).
# Fixes:
#   * Removed the ("assistant", "{{response}}") entry. The doubled braces are an
#     escape, so it injected a literal assistant message "{response}" into every
#     request instead of acting as a placeholder.
#   * Each system-prompt fragment now ends with a space, so the implicitly
#     concatenated sentences no longer run together.
prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            "You should help the people who are asking you questions as a helpful assistant. "
            "Make sure to use the `*_search` tools for searching information from the JSON document. "
            "'*_search' tools are for the National Audit, the Plenary Session, the Subcommittee meetings, the Special Committee on Budget and Settlement and the Special Committee meetings. "
            "all the JSON documents' contents are National Assembly Proceedings. "
            "I am explaining information from JSON documents' keys. "
            "date is the date when meeting_name is open. generation number is what number National Assembly it is. "
            "context is conversation as raw data. context_learn is good for learning about the conversation. "
            "summary_q is question's summary and summary_a is answer's summary. "
            "When you search answers for questions, you should primarily check keys on JSON documents. "
            "The keys are date, meeting_name, generation_number, committee_name, agenda, context_learn, context_summary, questioner name, answerer name, question and answer. "
            "If a person asking a question has not defined the scope of that question, all JSON files which each tools have should be searched.",
        ),
        ("placeholder", "{chat_history}"),
        ("human", "{input}"),
        ("placeholder", "{agent_scratchpad}"),
    ]
)

########## 4. Define the agent ##########
agent = create_tool_calling_agent(llm, tools, prompt)

########## 5. Define the AgentExecutor ##########
agent_executor = AgentExecutor(agent=agent, tools=tools, verbose=False)

########## 6. Add memory that keeps the chat history ##########
# In-memory registry of chat histories, keyed by session id.
store = {}

def get_session_history(session_ids):
    """Return the history stored under ``session_ids``, creating it on first use.

    A new ``ChatMessageHistory`` is created and registered in ``store`` only
    when the key is not present yet; later calls return the same object.
    """
    if session_ids in store:
        return store[session_ids]
    history = ChatMessageHistory()
    store[session_ids] = history
    return history

# Wrap the executor with message history: get_session_history supplies the
# history object, the user message is read from the "input" key, and prior
# messages are passed under the "chat_history" key.
agent_with_chat_history = RunnableWithMessageHistory(
    agent_executor,
    get_session_history,
    input_messages_key="input",
    history_messages_key="chat_history",
)

########## 7. Define the agent stream parser ##########
agent_stream_parser = AgentStreamParser()

In [19]:

########## 8. Run the agent and check the result ##########

# Stream the answer to the query
# NOTE(review): this follow-up question presumably relies on earlier turns kept
# in `store` from previous executions of this cell — confirm before re-running
# on a fresh kernel.
response = agent_with_chat_history.stream(
    {"input":  """ 그게 가장 최근이라고? """},
    # Set the session ID
    # NOTE(review): the original comment said the ID is not actually used, but
    # get_session_history keys the in-memory `store` by the value it receives,
    # so this appears to select which history is read and extended — confirm.
    config={"configurable": {"session_id": "abc123"}},
)

# Pass each streamed step to the parser
for step in response:
    agent_stream_parser.process_agent_steps(step)

[도구 호출]
Tool: National_Audit_search
query: 21대 국회 국정감사 최근 회의
Log: 
Invoking: `National_Audit_search` with `{'query': '21대 국회 국정감사 최근 회의'}`



[관찰 내용]
Observation: {"filename": "SRC_21\ub300_2020_2020\ub14410\uc6d412\uc77c_\uad6d\uc815\uac10\uc0ac_\ud658\uacbd\ub178\ub3d9\uc704\uc6d0\ud68c_0009(050329).xlsx", "original": "http://likms.assembly.go.kr/record/mhs-10-040-0040.do?conferNum=050329&fileType=PDF", "id": "050329", "date": "2020\ub14410\uc6d412\uc77c(\uc6d4)", "conference_number": "050329", "question_number": "0009", "meeting_name": "\uad6d\uc815\uac10\uc0ac", "generation_number": "21", "committee_name": "\ud658\uacbd\ub178\ub3d9\uc704\uc6d0\ud68c", "meeting_number": "2020", "session_number": "2020\ub14410\uc6d412\uc77c", "agenda": "\uac10\uc0ac\uac1c\uc2dc", "law": "", "qna_type": "\ucd94\ucd9c\ud615", "context": "\ub300\uc804 \ub3d9\uad6c \ucd9c\uc2e0 \uc7a5\ucca0\ubbfc\uc785\ub2c8\ub2e4. \uc800\ud76c \ud658\ub178\uc704\uc5d0\uc11c \uae30\uc0c1\uccad \uad00\ub828\ub41c \uc0b0\u