In [1]:
# rag(retrieval augmented generation): 검색 증강 기법

from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import TextLoader, PyPDFLoader, UnstructuredFileLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings, CacheBackedEmbeddings
from langchain.vectorstores import Chroma
from langchain.storage import LocalFileStore
from langchain.chains import RetrievalQA

llm = ChatOpenAI(temperature=0.1)

cache_dir = LocalFileStore("../.cache/")


# # chunk_size는 토큰의 길이를 제한 / chunk_overlap은 앞 부분 일부분을 겹치게 만들게 합니다.
# splitter = RecursiveCharacterTextSplitter(
#     chunk_size=100,
#     chunk_overlap=10
# )

# 아래는 separator가 있습니다. 
# RecursiveCharacterTextSplitter의 기능을 모두 갖고 있으면서 seperator가 있기 때문에 해당 방법을 추천합니다. 
# 토큰화 방법에 대한 직관적인 방법을 보고 싶으면 해당 경로를 참고하세요
# https://platform.openai.com/tokenizer

splitter = CharacterTextSplitter(
    separator = '\n',
    chunk_size = 600,
    chunk_overlap= 50,
)

# 해당 loader는 pdf, txt, docx와 모두 호환됩니다.
loader = UnstructuredFileLoader("../files/_241117_chapter_one.txt")
docs = loader.load_and_split(text_splitter=splitter)

# embedding에 대한 직관적인 이해가 필요하다면 다음 링크를 참고한다.
# https://turbomaze.github.io/word2vecjson/
embeddings = OpenAIEmbeddings()
cached_embeddings = CacheBackedEmbeddings.from_bytes_store(
    embeddings, cache_dir
)
vectorstore = Chroma.from_documents(docs[:10], cached_embeddings)

chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type='stuff',
    retriever=vectorstore.as_retriever(),
)

chain.run("where does Whinston live?")

Created a chunk of size 963, which is longer than the specified 600
Created a chunk of size 774, which is longer than the specified 600
Created a chunk of size 954, which is longer than the specified 600
Created a chunk of size 922, which is longer than the specified 600
Created a chunk of size 1168, which is longer than the specified 600
Created a chunk of size 821, which is longer than the specified 600
Created a chunk of size 700, which is longer than the specified 600
Created a chunk of size 745, which is longer than the specified 600
Created a chunk of size 735, which is longer than the specified 600
Created a chunk of size 1110, which is longer than the specified 600
Created a chunk of size 991, which is longer than the specified 600
Created a chunk of size 990, which is longer than the specified 600
Created a chunk of size 1182, which is longer than the specified 600
Created a chunk of size 1491, which is longer than the specified 600
Created a chunk of size 1401, which is longe

'Winston lives in Victory Mansions, which is a building in London, the chief city of Airstrip One, a province of Oceania.'

In [2]:
chain.run("Describe Victory Mansions")

'Victory Mansions is a building where Winston Smith resides. It is a large, run-down apartment complex with a gritty and dusty atmosphere. The hallway smells of boiled cabbage and old rag mats. The building is located in London and is surrounded by other similar buildings that house the four Ministries of the government: the Ministry of Truth, the Ministry of Peace, the Ministry of Love, and the Ministry of Plenty. The building has seven flights of stairs, and the residents often face issues with the unreliable lift and the lack of electricity during daylight hours due to an economy drive in preparation for Hate Week.'

In [45]:
results = vectorstore.similarity_search("where does winston live")

results

[Document(page_content="Winston turned round abruptly. He had set his features into the expression of quiet optimism which it was advisable to wear when facing the telescreen. He crossed the room into the tiny kitchen. By leaving the Ministry at this time of day he had sacrificed his lunch in the canteen, and he was aware that there was no food in the kitchen except a hunk of dark-coloured bread which had got to be saved for tomorrow's breakfast. He took down from the shelf a bottle of colourless liquid with a plain white label marked VICTORY GIN. It gave off a sickly, oily smell, as of Chinese ricespirit. Winston poured out nearly a teacupful, nerved himself for a shock, and gulped it down like a dose of medicine.", metadata={'source': '../files/_241117_chapter_one.txt'}),
 Document(page_content="Winston turned round abruptly. He had set his features into the expression of quiet optimism which it was advisable to wear when facing the telescreen. He crossed the room into the tiny kitch