In [None]:
# rag(retrieval augmented generation): 검색 증강 기법
# 아래는 stuff 방식을 manual하게 접근한 것입니다. 

from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import TextLoader, PyPDFLoader, UnstructuredFileLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings, CacheBackedEmbeddings
from langchain.vectorstores import Chroma, FAISS
from langchain.storage import LocalFileStore
from langchain.prompts import ChatPromptTemplate
from langchain.schema.runnable import RunnablePassthrough

llm = ChatOpenAI(temperature=0.1)

cache_dir = LocalFileStore("../.cache/")


# # chunk_size는 토큰의 길이를 제한 / chunk_overlap은 앞 부분 일부분을 겹치게 만들게 합니다.
# splitter = RecursiveCharacterTextSplitter(
#     chunk_size=100,
#     chunk_overlap=10
# )

# 아래는 separator가 있습니다. 
# RecursiveCharacterTextSplitter의 기능을 모두 갖고 있으면서 seperator가 있기 때문에 해당 방법을 추천합니다. 
# 토큰화 방법에 대한 직관적인 방법을 보고 싶으면 해당 경로를 참고하세요
# https://platform.openai.com/tokenizer

splitter = CharacterTextSplitter(
    separator = '\n',
    chunk_size = 600,
    chunk_overlap= 50,
)

# 해당 loader는 pdf, txt, docx와 모두 호환됩니다.
loader = UnstructuredFileLoader("../files/_241117_chapter_one.txt")
docs = loader.load_and_split(text_splitter=splitter)

# embedding에 대한 직관적인 이해가 필요하다면 다음 링크를 참고한다.
# https://turbomaze.github.io/word2vecjson/
embeddings = OpenAIEmbeddings()
cached_embeddings = CacheBackedEmbeddings.from_bytes_store(
    embeddings, cache_dir
)
# FAISS와 CHROMA를 변경할 수 있다
# vectorstore = Chroma.from_documents(docs[:10], cached_embeddings)
vectorstore = FAISS.from_documents(docs[:10], cached_embeddings)

retriver = vectorstore.as_retriever()

prompt = ChatPromptTemplate.from_messages([
    ("system", "You are a helpful assistant. Answer questions using only the following context. If you don't know the answer just say you don't know, don't make it up:\n\n{context}"),
    ("human", "{question}"),
])

chain = (
    {
        "context": retriver, 
        "question": RunnablePassthrough(), 
    }
    | prompt
    | llm )

chain.invoke("Desribe Victory Mansions")

Created a chunk of size 963, which is longer than the specified 600
Created a chunk of size 774, which is longer than the specified 600
Created a chunk of size 954, which is longer than the specified 600
Created a chunk of size 922, which is longer than the specified 600
Created a chunk of size 1168, which is longer than the specified 600
Created a chunk of size 821, which is longer than the specified 600
Created a chunk of size 700, which is longer than the specified 600
Created a chunk of size 745, which is longer than the specified 600
Created a chunk of size 735, which is longer than the specified 600
Created a chunk of size 1110, which is longer than the specified 600
Created a chunk of size 991, which is longer than the specified 600
Created a chunk of size 990, which is longer than the specified 600
Created a chunk of size 1182, which is longer than the specified 600
Created a chunk of size 1491, which is longer than the specified 600
Created a chunk of size 1401, which is longe

AIMessage(content='Victory Mansions is a building where Winston Smith resides. It is described as having glass doors through which gritty dust can enter. The hallway smells of boiled cabbage and old rag mats. There is a large colored poster on the wall depicting the face of a man with a black mustache. The building has a non-functioning lift due to the electricity being cut off during daylight hours as part of an economy drive. The flat where Winston lives is on the seventh floor, and there is a poster with a large face on each landing that seems to follow you as you move.')

In [4]:
# rag(retrieval augmented generation): 검색 증강 기법
# 아래는 map_reduce 방식을 manual하게 접근한 것입니다. 

from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import TextLoader, PyPDFLoader, UnstructuredFileLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings, CacheBackedEmbeddings
from langchain.vectorstores import Chroma, FAISS
from langchain.storage import LocalFileStore
from langchain.prompts import ChatPromptTemplate
from langchain.schema.runnable import RunnablePassthrough, RunnableLambda

llm = ChatOpenAI(temperature=0.1)

cache_dir = LocalFileStore("../.cache/")


# # chunk_size는 토큰의 길이를 제한 / chunk_overlap은 앞 부분 일부분을 겹치게 만들게 합니다.
# splitter = RecursiveCharacterTextSplitter(
#     chunk_size=100,
#     chunk_overlap=10
# )

# 아래는 separator가 있습니다. 
# RecursiveCharacterTextSplitter의 기능을 모두 갖고 있으면서 seperator가 있기 때문에 해당 방법을 추천합니다. 
# 토큰화 방법에 대한 직관적인 방법을 보고 싶으면 해당 경로를 참고하세요
# https://platform.openai.com/tokenizer

splitter = CharacterTextSplitter(
    separator = '\n',
    chunk_size = 600,
    chunk_overlap= 50,
)

# 해당 loader는 pdf, txt, docx와 모두 호환됩니다.
loader = UnstructuredFileLoader("../files/_241117_chapter_one.txt")
docs = loader.load_and_split(text_splitter=splitter)

# embedding에 대한 직관적인 이해가 필요하다면 다음 링크를 참고한다.
# https://turbomaze.github.io/word2vecjson/
embeddings = OpenAIEmbeddings()
cached_embeddings = CacheBackedEmbeddings.from_bytes_store(
    embeddings, cache_dir
)
# FAISS와 CHROMA를 변경할 수 있다
# vectorstore = Chroma.from_documents(docs[:10], cached_embeddings)
vectorstore = FAISS.from_documents(docs[:10], cached_embeddings)

retriver = vectorstore.as_retriever()

prompt = ChatPromptTemplate.from_messages([
    ("system", "You are a helpful assistant. Answer questions using only the following context. If you don't know the answer just say you don't know, don't make it up:\n\n{context}"),
    ("human", "{question}"),
])

# list of docs

# for doc in list of docs | prompt | llm

# for response in list of llms response | put them all together

map_doc_prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            """
            use the following portion of a long document to see if any of 
            the text is relevant to answer the question. Return any relevant text
            verbatim
            ------
            {context}
            """
        ),
        ("human","{question}")
    ]
)

map_doc_chain = map_doc_prompt | llm


def map_docs(inputs):
    print(inputs)
    documents = inputs['documents']
    question = inputs['question']
    return "\n\n".join(
        [
            map_doc_chain.invoke(
                {
                    "context":document.page_content,
                    "question":question
                }
            ).content
                for document in documents
        ]
    )

map_chain = {"documents":retriver, 'question':RunnablePassthrough()} | RunnableLambda(map_docs)

final_prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            """
            Given th following extracted parts of a long document and a question, 
            create a final answer.
            If you don't know the answer, just say that you don't know.
            Don't try to make up an answer.
            ------
            {context}
            """
        ),
        ("human","{question}"),
    ]   
)

chain =(
    {
        "context": map_chain, 
        "question": RunnablePassthrough() 
    } 
    | final_prompt 
    | llm
)

# chain = (
#     {
#         "context": retriver, 
#         "question": RunnablePassthrough(), 
#     }
#     | prompt
#     | llm )

chain.invoke("Desribe Victory Mansions")

Created a chunk of size 963, which is longer than the specified 600
Created a chunk of size 774, which is longer than the specified 600
Created a chunk of size 954, which is longer than the specified 600
Created a chunk of size 922, which is longer than the specified 600
Created a chunk of size 1168, which is longer than the specified 600
Created a chunk of size 821, which is longer than the specified 600
Created a chunk of size 700, which is longer than the specified 600
Created a chunk of size 745, which is longer than the specified 600
Created a chunk of size 735, which is longer than the specified 600
Created a chunk of size 1110, which is longer than the specified 600
Created a chunk of size 991, which is longer than the specified 600
Created a chunk of size 990, which is longer than the specified 600
Created a chunk of size 1182, which is longer than the specified 600
Created a chunk of size 1491, which is longer than the specified 600
Created a chunk of size 1401, which is longe

{'documents': [Document(page_content="Winston turned round abruptly. He had set his features into the expression of quiet optimism which it was advisable to wear when facing the telescreen. He crossed the room into the tiny kitchen. By leaving the Ministry at this time of day he had sacrificed his lunch in the canteen, and he was aware that there was no food in the kitchen except a hunk of dark-coloured bread which had got to be saved for tomorrow's breakfast. He took down from the shelf a bottle of colourless liquid with a plain white label marked VICTORY GIN. It gave off a sickly, oily smell, as of Chinese ricespirit. Winston poured out nearly a teacupful, nerved himself for a shock, and gulped it down like a dose of medicine.", metadata={'source': '../files/_241117_chapter_one.txt'}), Document(page_content='It was a bright cold day in April, and the clocks were striking thirteen. Winston Smith, his chin nuzzled into his breast in an effort to escape the vile wind, slipped quickly th

AIMessage(content='Victory Mansions is a dilapidated and run-down apartment building in London where Winston Smith, the protagonist of George Orwell\'s novel "1984," resides. The building is described as having shabby walls, faulty elevators, cramped and poorly maintained apartments with peeling wallpaper and faulty plumbing. It lacks basic amenities, and residents often face shortages of food and other necessities. The building has glass doors that let in gritty dust, a smelly hallway reeking of boiled cabbage and old rag mats, and a large colored poster with the caption "BIG BROTHER IS WATCHING YOU." The building has seven flights of stairs due to the often out-of-order lift, and it is part of a setting where the clocks strike thirteen on a bright cold day in April. Additionally, Victory Mansions is one of four similar buildings in London, housing the Ministry of Truth responsible for news, entertainment, education, and the fine arts in the government apparatus.')