In [1]:
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import UnstructuredFileLoader

# Point the Unstructured loader at the chapter PDF (path is relative to the working directory).
loader = UnstructuredFileLoader(file_path="./rag_data/chapter_one.pdf")

# Last expression of the cell: parse the file and display the resulting Document list.
loader.load()

[Document(page_content="chapter_one.md\n\n2024-08-27\n\nPart 1, Chapter 1\n\nPart One\n\n1 It was a bright cold day in April, and the clocks were striking thirteen. Winston Smith, his chin nuzzled into\n\nhis breast in an effort to escape the vile wind, slipped quickly through the glass doors of Victory Mansions,\n\nthough not quickly enough to prevent a swirl of gritty dust from entering along with him.\n\nThe hallway smelt of boiled cabbage and old rag mats. At one end of it a coloured poster, too large for\n\nindoor display, had been tacked to the wall. It depicted simply an enormous face, more than a metre wide:\n\nthe face of a man of about forty-five, with a heavy black moustache and ruggedly handsome features.\n\nWinston made for the stairs. It was no use trying the lift. Even at the best of times it was seldom working,\n\nand at present the electric current was cut off during daylight hours. It was part of the economy drive in\n\npreparation for Hate Week. The flat was seven fl

In [2]:
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import UnstructuredFileLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings, CacheBackedEmbeddings
from langchain.vectorstores import Chroma
from langchain.storage import LocalFileStore

# On-disk byte store used to cache embedding vectors between runs.
cache_dir = LocalFileStore("./.cache/")

# Split on newlines. Because the splitter is built with from_tiktoken_encoder,
# chunk_size and chunk_overlap are measured in tiktoken tokens, not characters.
splitter = CharacterTextSplitter.from_tiktoken_encoder(
    separator="\n",
    chunk_size=600,
    chunk_overlap=100,
)
loader = UnstructuredFileLoader("./rag_data/chapter_one.pdf")

# Load the PDF and cut it into chunks in a single step.
docs = loader.load_and_split(text_splitter=splitter)

embeddings = OpenAIEmbeddings()

# Namespace the cache by embedding model name. Without a namespace, vectors
# cached for one model would be reused if the embedding model were changed later.
cached_embeddings = CacheBackedEmbeddings.from_bytes_store(
    embeddings,
    cache_dir,
    namespace=embeddings.model,
)

# Embed every chunk (going through the cache) and index the vectors in Chroma.
vectorstore = Chroma.from_documents(docs, cached_embeddings)

In [9]:
# Ask the vector store for the chunks most similar to the question.
results = vectorstore.similarity_search(query="where does winston live")

# Last expression of the cell: display the returned Document list.
results

[Document(page_content="The Ministry of Love was the really frightening one. There were no windows in it at all. Winston had never\nbeen inside the Ministry of Love, nor within half a kilometre of it. It was a place impossible to enter except\non official business, and then only by penetrating through a maze of barbed-wire entanglements, steel\ndoors, and hidden machine-gun nests. Even the streets leading up to its outer barriers were roamed by\ngorilla-faced guards in black uniforms, armed with jointed truncheons.\nWinston turned round abruptly. He had set his features into the expression of quiet optimism which it was\nadvisable to wear when facing the telescreen. He crossed the room into the tiny kitchen. By leaving the\nMinistry at this time of day he had sacrificed his lunch in the canteen, and he was aware that there was no\nfood in the kitchen except a hunk of dark-coloured bread which had got to be saved for tomorrow's\nbreakfast. He took down from the shelf a bottle of colourl

In [4]:
# Number of documents returned by the similarity search.
len(results)

4

In [6]:
# Print the first document in the search results as plain text.
print(results[0])

page_content="The Ministry of Love was the really frightening one. There were no windows in it at all. Winston had never\nbeen inside the Ministry of Love, nor within half a kilometre of it. It was a place impossible to enter except\non official business, and then only by penetrating through a maze of barbed-wire entanglements, steel\ndoors, and hidden machine-gun nests. Even the streets leading up to its outer barriers were roamed by\ngorilla-faced guards in black uniforms, armed with jointed truncheons.\nWinston turned round abruptly. He had set his features into the expression of quiet optimism which it was\nadvisable to wear when facing the telescreen. He crossed the room into the tiny kitchen. By leaving the\nMinistry at this time of day he had sacrificed his lunch in the canteen, and he was aware that there was no\nfood in the kitchen except a hunk of dark-coloured bread which had got to be saved for tomorrow's\nbreakfast. He took down from the shelf a bottle of colourless liquid

In [10]:
# Total number of characters across all retrieved chunks.
total_length = sum(map(lambda chunk: len(chunk.page_content), results))
print(f"Total length of returned documents: {total_length}")

Total length of returned documents: 10000


In [13]:
# Total number of documents returned by the search.
# len() replaces the hand-rolled generator count, whose loop variable was
# misleadingly named `line` although each item is a Document.
total_docs = len(results)
print(total_docs)

4


In [14]:
# Total word count: split each chunk on whitespace and add up the piece counts.
total_words = sum(map(lambda chunk: len(chunk.page_content.split()), results))
print(total_words)

1745


In [15]:
# Character length of the longest retrieved chunk:
# pick the longest text first, then measure it.
longest_doc_length = len(max((chunk.page_content for chunk in results), key=len))
print(f"Length of the longest document: {longest_doc_length}")

Length of the longest document: 2574


In [16]:
# Character length of every retrieved chunk, kept in result order.
doc_lengths = list(map(lambda chunk: len(chunk.page_content), results))
print(f"Lengths of all documents: {doc_lengths}")

Lengths of all documents: [2401, 2574, 2511, 2514]


In [17]:
# Count how many retrieved chunks contain the search term.
# Each chunk is lowercased before the substring test, so the term must be lowercase.
word_to_find = "winston"
docs_with_word = len(
    [chunk for chunk in results if word_to_find in chunk.page_content.lower()]
)
print(f"Number of documents containing '{word_to_find}': {docs_with_word}")

Number of documents containing 'winston': 4


In [20]:
# Count every occurrence of the search term across all retrieved chunks.
# - Each chunk is lowercased first, so the search term itself must be lowercase.
# - str.count tallies non-overlapping substring matches, so longer words that
#   contain the term are counted as well.
# - The per-chunk counts are added together to give the overall total.
word_to_find = "winston"
word_count = sum(
    map(lambda chunk: chunk.page_content.lower().count(word_to_find), results)
)
print(f"The word '{word_to_find}' appears {word_count} times in total.")

The word 'winston' appears 16 times in total.


In [21]:
# A plain str.count is a substring match: it also counts the term when it
# appears inside a longer word. A regular expression can anchor the match at a
# word boundary so that only words STARTING with the term are counted
# (e.g. "winston", "winstons").
# Possessives such as "Winston's" are counted once; the match itself stops at
# the apostrophe because \w does not include it. Adjust the pattern if needed.
import re

word_to_find = "winston"
# re.escape keeps any regex metacharacters in the search term literal.
# re.IGNORECASE replaces lowercasing every document before matching.
# No trailing \b is needed: the greedy \w* already consumes the rest of the word.
word_pattern = re.compile(r"\b" + re.escape(word_to_find) + r"\w*", re.IGNORECASE)
word_count = sum(len(word_pattern.findall(doc.page_content)) for doc in results)
print(f"The word '{word_to_find}' and its variations appear {word_count} times in total.")

The word 'winston' and its variations appear 16 times in total.


In [30]:
# Chroma is a vector store that keeps the vectors on the local machine. It can be swapped for FAISS (delete the existing cache when switching).
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import UnstructuredFileLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings, CacheBackedEmbeddings
from langchain.vectorstores import Chroma
from langchain.storage import LocalFileStore
from langchain.chains import RetrievalQA

# Chat model used by the QA chain to answer the question.
llm = ChatOpenAI()

# On-disk byte store used to cache embedding vectors between runs.
cache_dir = LocalFileStore("./.cache/")

# Split on newlines. Because the splitter is built with from_tiktoken_encoder,
# chunk_size and chunk_overlap are measured in tiktoken tokens, not characters.
splitter = CharacterTextSplitter.from_tiktoken_encoder(
    separator="\n",
    chunk_size=600,
    chunk_overlap=100,
)
loader = UnstructuredFileLoader("./rag_data/chapter_one.pdf")

# Load the PDF and cut it into chunks in a single step.
docs = loader.load_and_split(text_splitter=splitter)

embeddings = OpenAIEmbeddings()

# Namespace the cache by embedding model name. Without a namespace, vectors
# cached for one model would be reused if the embedding model were changed later.
cached_embeddings = CacheBackedEmbeddings.from_bytes_store(
    embeddings,
    cache_dir,
    namespace=embeddings.model,
)

# Embed every chunk (going through the cache) and index the vectors in Chroma.
vectorstore = Chroma.from_documents(docs, cached_embeddings)

# "stuff" places all retrieved chunks into a single prompt for the LLM.
chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=vectorstore.as_retriever(),
)

# NOTE(review): the query is misspelled ("Mensions", "Winsthon"); it is left
# unchanged so that it still corresponds to the recorded answer.
chain.run("Where Mensions does Winsthon live?")

"Winston lives in a flat in a building. The text doesn't specify the exact location or address of his residence."

In [32]:
# LangChain - Summarization - Refine / stuff
# Vector store switched from Chroma to FAISS
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import UnstructuredFileLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings, CacheBackedEmbeddings
from langchain.vectorstores import FAISS
from langchain.storage import LocalFileStore
from langchain.chains import RetrievalQA

# Chat model used by the QA chain to answer the question.
llm = ChatOpenAI()

# On-disk byte store used to cache embedding vectors between runs.
cache_dir = LocalFileStore("./.cache/")

# Split on newlines. Because the splitter is built with from_tiktoken_encoder,
# chunk_size and chunk_overlap are measured in tiktoken tokens, not characters.
splitter = CharacterTextSplitter.from_tiktoken_encoder(
    separator="\n",
    chunk_size=600,
    chunk_overlap=100,
)
loader = UnstructuredFileLoader("./rag_data/chapter_one.pdf")

# Load the PDF and cut it into chunks in a single step.
docs = loader.load_and_split(text_splitter=splitter)

embeddings = OpenAIEmbeddings()

# Namespace the cache by embedding model name. Without a namespace, vectors
# cached for one model would be reused if the embedding model were changed later.
cached_embeddings = CacheBackedEmbeddings.from_bytes_store(
    embeddings,
    cache_dir,
    namespace=embeddings.model,
)

# Embed every chunk (going through the cache) and build a FAISS index from the vectors.
vectorstore = FAISS.from_documents(docs, cached_embeddings)

# "stuff" places all retrieved chunks into a single prompt for the LLM.
chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=vectorstore.as_retriever(),
)

# NOTE(review): "Winsthon" is misspelled in the query; it is left unchanged so
# that it still corresponds to the recorded answer.
chain.run("Where does Winsthon live?")

'Winston lives in Victory Mansions, which are located in London, the chief city of Airstrip One, one of the provinces of Oceania.'

In [33]:
# LangChain - Summarization - chain_type="map_reduce"
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import UnstructuredFileLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings, CacheBackedEmbeddings
from langchain.vectorstores import FAISS
from langchain.storage import LocalFileStore
from langchain.chains import RetrievalQA

# Chat model used by the QA chain to answer the question.
llm = ChatOpenAI()

# On-disk byte store used to cache embedding vectors between runs.
cache_dir = LocalFileStore("./.cache/")

# Split on newlines. Because the splitter is built with from_tiktoken_encoder,
# chunk_size and chunk_overlap are measured in tiktoken tokens, not characters.
splitter = CharacterTextSplitter.from_tiktoken_encoder(
    separator="\n",
    chunk_size=600,
    chunk_overlap=100,
)
loader = UnstructuredFileLoader("./rag_data/chapter_one.pdf")

# Load the PDF and cut it into chunks in a single step.
docs = loader.load_and_split(text_splitter=splitter)

embeddings = OpenAIEmbeddings()

# Namespace the cache by embedding model name. Without a namespace, vectors
# cached for one model would be reused if the embedding model were changed later.
cached_embeddings = CacheBackedEmbeddings.from_bytes_store(
    embeddings,
    cache_dir,
    namespace=embeddings.model,
)

# Embed every chunk (going through the cache) and build a FAISS index from the vectors.
vectorstore = FAISS.from_documents(docs, cached_embeddings)

# "map_reduce" runs the question over each retrieved chunk separately, then
# combines the per-chunk answers in a final LLM call.
chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="map_reduce",
    retriever=vectorstore.as_retriever(),
)

# NOTE(review): "Winsthon" is misspelled in the query; it is left unchanged so
# that it still corresponds to the recorded answer.
chain.run("Where does Winsthon live?")

'Winston lives in Victory Mansions, a building in London, which is the chief city of Airstrip One, the third most populous province of Oceania in George Orwell\'s novel "1984."'

In [None]:
# LangChain - Summarization - chain_type="map_rerank"
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import UnstructuredFileLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings, CacheBackedEmbeddings
from langchain.vectorstores import FAISS
from langchain.storage import LocalFileStore
from langchain.chains import RetrievalQA

# Chat model used by the QA chain to answer the question.
llm = ChatOpenAI()

# On-disk byte store used to cache embedding vectors between runs.
cache_dir = LocalFileStore("./.cache/")

# Split on newlines. Because the splitter is built with from_tiktoken_encoder,
# chunk_size and chunk_overlap are measured in tiktoken tokens, not characters.
splitter = CharacterTextSplitter.from_tiktoken_encoder(
    separator="\n",
    chunk_size=600,
    chunk_overlap=100,
)
loader = UnstructuredFileLoader("./rag_data/chapter_one.pdf")

# Load the PDF and cut it into chunks in a single step.
docs = loader.load_and_split(text_splitter=splitter)

embeddings = OpenAIEmbeddings()

# Namespace the cache by embedding model name. Without a namespace, vectors
# cached for one model would be reused if the embedding model were changed later.
cached_embeddings = CacheBackedEmbeddings.from_bytes_store(
    embeddings,
    cache_dir,
    namespace=embeddings.model,
)

# Embed every chunk (going through the cache) and build a FAISS index from the vectors.
vectorstore = FAISS.from_documents(docs, cached_embeddings)

# "map_rerank" asks the LLM for an answer plus a score for each retrieved chunk
# and returns the highest-scoring answer.
chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="map_rerank",
    retriever=vectorstore.as_retriever(),
)

# NOTE(review): "Winsthon" is misspelled in the query; this cell has no
# recorded output, so confirm the intended spelling before running it.
chain.run("Where does Winsthon live?")