In [None]:
### Stuff Documents Chain(off-the-shelf) => 모든 document를 prompt에 넣음.
### 다양한 vector store, Documents chain을 조합해서 잘 맞는것을 찾자.
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import UnstructuredFileLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings, CacheBackedEmbeddings
from langchain.vectorstores import Chroma  # FAISS
from langchain.storage import LocalFileStore
from langchain.chains import RetrievalQA

# Chat model constructed with no explicit arguments.
llm = ChatOpenAI()

# Wrap the OpenAI embedder with CacheBackedEmbeddings, using a LocalFileStore
# at ./.cache/ as the cache store.
embedder = OpenAIEmbeddings()
cache_dir = LocalFileStore("./.cache/")
cache_embedder = CacheBackedEmbeddings.from_bytes_store(
    underlying_embeddings=embedder,
    document_embedding_cache=cache_dir,
)

# Load the memo file and split it on newlines.
# The recorded output of this cell reports a chunk of size 688 against the
# requested 600, so chunk_size is not enforced as a hard cap here.
splitter = CharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=600, chunk_overlap=100, separator="\n"
)
loader = UnstructuredFileLoader("./meeeemooo.md")
docs = loader.load_and_split(text_splitter=splitter)

# Build the Chroma vector store from the chunks via the cache-backed embedder.
vectorstore = Chroma.from_documents(documents=docs, embedding=cache_embedder)

# Off-the-shelf retrieval QA chain.
# Alternatives to chain_type "stuff": refine, map_reduce, map_rerank
chain = RetrievalQA.from_chain_type(
    retriever=vectorstore.as_retriever(),
    chain_type="stuff",
    llm=llm,
)

# Last expression of the cell, so the answer is shown as the cell output.
chain.run("What does 'git status' do?")

Created a chunk of size 688, which is longer than the specified 600


'`git status` is a Git command that shows the current status of the working directory and staging area. It displays information about tracked, untracked, modified, and staged files in the repository. This command helps users understand what changes have been made and what needs to be committed.'

In [None]:
### Stuff Documents Chain(LCEL)
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import UnstructuredFileLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings, CacheBackedEmbeddings
from langchain.vectorstores import Chroma  # FAISS
from langchain.storage import LocalFileStore
from langchain.prompts import ChatPromptTemplate
from langchain.schema.runnable import RunnablePassthrough

# Chat model with temperature 0.1.
llm = ChatOpenAI(temperature=0.1)

# Wrap the OpenAI embedder with CacheBackedEmbeddings, using a LocalFileStore
# at ./.cache/ as the cache store.
embedder = OpenAIEmbeddings()
cache_dir = LocalFileStore("./.cache/")
cache_embedder = CacheBackedEmbeddings.from_bytes_store(
    underlying_embeddings=embedder,
    document_embedding_cache=cache_dir,
)

# Load the memo file and split it on newlines.
splitter = CharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=600, chunk_overlap=100, separator="\n"
)
loader = UnstructuredFileLoader("./meeeemooo.md")
docs = loader.load_and_split(text_splitter=splitter)

# Build the vector store and expose it as a retriever.
vectorstore = Chroma.from_documents(documents=docs, embedding=cache_embedder)
retriever = vectorstore.as_retriever()

# The system message carries the retrieved context; the human message carries
# the question.
prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            "You are a helpful assistant. Answer questions using only the following context. If you don't know the answer just say you don't know, don't make it up:\n\n{context}",
        ),
        ("human", "{question}"),
    ]
)

# The chain input feeds both entries of the dict: the retriever fills
# "context" and RunnablePassthrough fills "question".
chain = (
    {"context": retriever, "question": RunnablePassthrough()}
    | prompt
    | llm
)

# Last expression of the cell, so the AIMessage is shown as the cell output.
chain.invoke("보안은 어디서 설정해야하지?")

Created a chunk of size 688, which is longer than the specified 600


AIMessage(content='보안은 fireStore, storage의 규칙설정이나 google cloud api credential에서 설정해야 합니다.')

In [2]:
### MapReduce Documents Chain(LCEL)
# 원래 document chunk중 쓸만한거만 남기고, 개량된 document를 prompt에 전달하는것.
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import UnstructuredFileLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings, CacheBackedEmbeddings
from langchain.vectorstores import Chroma  # FAISS
from langchain.storage import LocalFileStore
from langchain.prompts import ChatPromptTemplate
from langchain.schema.runnable import RunnablePassthrough, RunnableLambda

# Chat model with temperature 0.1.
llm = ChatOpenAI(temperature=0.1)

# Wrap the OpenAI embedder with CacheBackedEmbeddings, using a LocalFileStore
# at ./.cache/ as the cache store.
embedder = OpenAIEmbeddings()
cache_dir = LocalFileStore("./.cache/")
cache_embedder = CacheBackedEmbeddings.from_bytes_store(
    underlying_embeddings=embedder,
    document_embedding_cache=cache_dir,
)

# Load the memo file and split it on newlines.
splitter = CharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=600, chunk_overlap=100, separator="\n"
)
loader = UnstructuredFileLoader("./meeeemooo.md")
docs = loader.load_and_split(text_splitter=splitter)

# Build the vector store and expose it as a retriever.
vectorstore = Chroma.from_documents(documents=docs, embedding=cache_embedder)
retriever = vectorstore.as_retriever()

# Map-step prompt: for one document portion, ask the model to return the text
# relevant to the question verbatim, or '' when nothing is relevant.
map_doc_prompt = ChatPromptTemplate.from_messages([
    ("system", """
            Use the following portion of a long document to see if any of the text is relevant to answer the question. Return any relevant text verbatim. If there is no relevant text, return : ''
            -------
            {context}
            """),
    ("human", "{question}"),
])

# Map-step chain: fill the prompt, then call the chat model.
map_doc_chain = map_doc_prompt | llm


def map_docs(inputs):
    """Run the map chain over every document and merge the replies.

    Args:
        inputs: Mapping with a "documents" entry (iterable of objects exposing
            ``page_content``) and a "question" entry, which is forwarded
            unchanged to each map call.

    Returns:
        The ``content`` of each ``map_doc_chain`` reply, in document order,
        joined with a blank line between replies.
    """
    retrieved, question = inputs["documents"], inputs["question"]
    extracted = (
        map_doc_chain.invoke(
            {"context": doc.page_content, "question": question}
        ).content
        for doc in retrieved
    )
    return "\n\n".join(extracted)


# Map stage: the chain input is sent to the retriever for "documents" and
# passed through as "question"; map_docs then receives that dict.
map_chain = (
    {"documents": retriever, "question": RunnablePassthrough()}
    | RunnableLambda(map_docs)
)

# Reduce-step prompt: build the final answer from the extracted parts placed
# in {context}.
final_prompt = ChatPromptTemplate.from_messages([
    ("system", """
            Given the following extracted parts of a long document and a question, create a final answer. 
            If you don't know the answer, just say that you don't know. Don't try to make up an answer.
            ------
            {context}
            """),
    ("human", "{question}"),
])

# Full chain: the map stage output becomes "context", and the original input
# becomes "question".
chain = (
    {"context": map_chain, "question": RunnablePassthrough()}
    | final_prompt
    | llm
)

# Last expression of the cell, so the AIMessage is shown as the cell output.
chain.invoke("보안은 어디서 설정해야하지?")

Created a chunk of size 688, which is longer than the specified 600


AIMessage(content='보안은 fireStore, storage의 규칙설정이나 google cloud api credential에서 설정해야합니다.')

In [None]:
### MapReduce Documents Chain (LCEL)
# 원래 document chunk 중 질문과 관련 있는 부분만 추출(Map),
# 그 추출된 부분을 모아 최종 답을 생성(Reduce)하는 RAG 체인 예시

from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import UnstructuredFileLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings, CacheBackedEmbeddings
from langchain.vectorstores import Chroma
from langchain.storage import LocalFileStore
from langchain.prompts import ChatPromptTemplate
from langchain.schema.runnable import RunnablePassthrough, RunnableLambda

# 1) LLM setup
llm = ChatOpenAI(temperature=0.1)

# 2) Load the document and split it into chunks (split on newlines)
splitter = CharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=600, chunk_overlap=100, separator="\n"
)
loader = UnstructuredFileLoader("./meeeemooo.md")
docs = loader.load_and_split(text_splitter=splitter)

# 3) Embeddings + cache + vector store
embedder = OpenAIEmbeddings()
cache_dir = LocalFileStore("./.cache/")
cache_embedder = CacheBackedEmbeddings.from_bytes_store(
    underlying_embeddings=embedder,
    document_embedding_cache=cache_dir,
)
vectorstore = Chroma.from_documents(documents=docs, embedding=cache_embedder)

# Retriever: similarity search with k set to 6
retriever = vectorstore.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 6},
)

# 4) Map-step prompt: extract only the passages relevant to the question from
#    each chunk
map_doc_prompt = ChatPromptTemplate.from_messages([
    ("system", """
            Use the following portion of a long document to see if any of the text is relevant to answer the question.
            Return any relevant text verbatim. 
            If there is no relevant text, return nothing.
            -------
            {context}
            """),
    ("human", "{question}"),
])
# Map-step chain: fill the prompt, then call the chat model.
map_doc_chain = map_doc_prompt | llm

# 5) Map function: run the map chain over each retrieved document and merge
#    the extracted passages
def map_docs(inputs):
    """Extract question-relevant text from every document and merge it.

    Args:
        inputs: Mapping with a "documents" entry (iterable of objects exposing
            ``page_content``; here the chunks returned by the retriever) and a
            "question" entry (the user's question), which is forwarded
            unchanged to each map call.

    Returns:
        The non-blank ``content`` of each ``map_doc_chain`` reply, in document
        order, joined with a blank line between replies. An empty string when
        there are no documents or every reply is blank.
    """
    documents = inputs["documents"]
    question = inputs["question"]
    extracts = []
    for document in documents:
        extract = map_doc_chain.invoke(
            {"context": document.page_content, "question": question}
        ).content
        # The map prompt asks the model to return nothing for irrelevant
        # chunks. Skip those blank replies so the reduce context is not padded
        # with empty sections.
        if extract.strip():
            extracts.append(extract)
    return "\n\n".join(extracts)

# 6) Map chain: question -> retriever, plus the question passed through
#    unchanged -> map_docs
map_chain = (
    {"documents": retriever, "question": RunnablePassthrough()}
    | RunnableLambda(map_docs)
)

# 7) Reduce-step prompt: write the final answer from the parts extracted in
#    the map step
final_prompt = ChatPromptTemplate.from_messages([
    ("system", """
            Given the following extracted parts of a long document and a question, create a final answer. 
            If you don't know the answer, just say that you don't know. Don't try to make up an answer.
            ------
            {context}
            """),
    ("human", "{question}"),
])

# 8) Final chain = Map + Reduce
chain = (
    {"context": map_chain, "question": RunnablePassthrough()}
    | final_prompt
    | llm
)

# 9) Example run: print only the text content of the reply
print(chain.invoke("보안은 어디서 설정해야하지?").content)
