# 6 RAG


## 6.1 Data Loaders and Splitters


### txt


In [7]:
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import TextLoader

loader = TextLoader("./files/chapter_one.txt")

loader.load()


[Document(page_content='', metadata={'source': './files/chapter_one.txt'})]

### pdf


In [11]:
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import PyPDFLoader

loader = PyPDFLoader("./files/chapter_one.pdf")

loader.load()
# len(loader.load())


[Document(page_content="Part One    1  It was a bright cold day in April, and the clocks were striking thirteen. Winston Smith, his chin nuzzled into his breast in an effort to escape the vile wind, slipped quickly through the glass doors of Victory Mansions, though not quickly enough to prevent a swirl of gritty dust from entering along with him.   The hallway smelt of boiled cabbage and old rag mats. At one end of it a coloured poster, too large for indoor display, had been tacked to the wall. It depicted simply an enormous face, more than a metre wide: the face of a man of about forty-ﬁve, with a heavy black moustache and ruggedly handsome features. Winston made for the stairs. It was no use trying the lift. Even at the best of times it was seldom working, and at present the electric current was cut off during daylight hours. It was part of the economy drive in preparation for Hate Week. The ﬂat was seven ﬂights up, and Winston, who was thirty-nine and had a varicose ulcer above his

### UnstructuredFileLoader


In [None]:
from langchain.chat_models import ChatOpenAI
# txt, powerPoint, html, PDF도 docx, 이미지도 가능!
from langchain.document_loaders import UnstructuredFileLoader

loader = UnstructuredFileLoader("./files/chapter_one.docx")

loader.load()


### 문서 분할하기


In [15]:
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import UnstructuredFileLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

splitter = RecursiveCharacterTextSplitter(
    chunk_size=200,  # 문단 크기 조정
    chunk_overlap=50  # 문장/문단 분할 시 앞 조각 일부분을 가져오게 만듦 (중복 더하기)
)

loader = UnstructuredFileLoader("./files/chapter_one.docx")

loader.load_and_split(text_splitter=splitter)  # 문단 나누기


[Document(page_content='Part One', metadata={'source': './files/chapter_one.docx'}),
 Document(page_content='1', metadata={'source': './files/chapter_one.docx'}),
 Document(page_content='It was a bright cold day in April, and the clocks were striking thirteen. Winston Smith, his chin nuzzled into his breast in an effort to escape the vile wind, slipped quickly through the glass doors', metadata={'source': './files/chapter_one.docx'}),
 Document(page_content='wind, slipped quickly through the glass doors of Victory Mansions, though not quickly enough to prevent a swirl of gritty dust from entering along with him.', metadata={'source': './files/chapter_one.docx'}),
 Document(page_content='The hallway smelt of boiled cabbage and old rag mats. At one end of it a coloured poster, too large for indoor display, had been tacked to the wall. It depicted simply an enormous face, more than a', metadata={'source': './files/chapter_one.docx'}),
 Document(page_content='It depicted simply an enormous

In [1]:
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import UnstructuredFileLoader
from langchain.text_splitter import CharacterTextSplitter  # 특정 문자부터 자르기

splitter = CharacterTextSplitter.from_tiktoken_encoder( # .from_tiktoken_encoder 모델이 보는 문자로 나눌 수 있음(토큰)
    separator="\n",  # 해당 문자로 자를 수 있음
    chunk_size=600,
    chunk_overlap=100
)

loader = UnstructuredFileLoader("./files/chapter_one.docx")

loader.load_and_split(text_splitter=splitter)  # 문단 나누기


Created a chunk of size 964, which is longer than the specified 600
Created a chunk of size 775, which is longer than the specified 600
Created a chunk of size 955, which is longer than the specified 600
Created a chunk of size 923, which is longer than the specified 600
Created a chunk of size 1169, which is longer than the specified 600
Created a chunk of size 822, which is longer than the specified 600
Created a chunk of size 701, which is longer than the specified 600
Created a chunk of size 746, which is longer than the specified 600
Created a chunk of size 736, which is longer than the specified 600
Created a chunk of size 1111, which is longer than the specified 600
Created a chunk of size 992, which is longer than the specified 600
Created a chunk of size 991, which is longer than the specified 600
Created a chunk of size 1742, which is longer than the specified 600
Created a chunk of size 2002, which is longer than the specified 600
Created a chunk of size 1901, which is longe

[Document(page_content='Part One \n1 \nIt was a bright cold day in April, and the clocks were striking thirteen. Winston Smith, his chin nuzzled into his breast in an effort to escape the vile wind, slipped quickly through the glass doors of Victory Mansions, though not quickly enough to prevent a swirl of gritty dust from entering along with him.', metadata={'source': './files/chapter_one.docx'}),
 Document(page_content='The hallway smelt of boiled cabbage and old rag mats. At one end of it a coloured poster, too large for indoor display, had been tacked to the wall. It depicted simply an enormous face, more than a metre wide: the face of a man of about forty-five, with a heavy black moustache and ruggedly handsome features. Winston made for the stairs. It was no use trying the lift. Even at the best of times it was seldom working, and at present the electric current was cut off during daylight hours. It was part of the economy drive in preparation for Hate Week. The flat was seven fl

### Vector


In [3]:
from dotenv import load_dotenv

load_dotenv()  # This method will load all the environment variables from a .env file


True

In [4]:
from langchain.embeddings import OpenAIEmbeddings # 입베딩 모듈

embedder = OpenAIEmbeddings() # 임베딩 사용
# 문서 임베딩 또는 쿼리 임베딩
embedder.embed_query("HI") # HI 에 해당하는 벡터값 얻기

[-0.034233457914850496,
 -0.00451289158295572,
 -0.022449249122778434,
 -0.029967443645213074,
 -0.028861440359598586,
 0.0295461095924014,
 -0.016023892710207085,
 -0.0037031387450167787,
 0.00859128207470344,
 -0.02277841763210608,
 0.0201582403211715,
 -0.009776285728050758,
 -0.012666379875529514,
 -0.014325385735273812,
 0.007893445803504298,
 0.004397682790955557,
 0.04073781842044493,
 -0.012264795448989766,
 0.026478264151862496,
 -0.0009076804159128678,
 -0.013798717237936655,
 0.012969214308064508,
 -0.004963851472159127,
 -0.005398353029028409,
 -0.019328737391299353,
 -0.0013602862431234763,
 0.0068993586189683275,
 -0.021883081372897427,
 0.002549405178046249,
 -0.03302212018471053,
 0.00795269561364264,
 0.004078390026086438,
 -0.03057311251028306,
 -0.0175775641254259,
 -0.00945370166924384,
 -0.029835776986540066,
 -0.004450349547557016,
 -0.006682108073364328,
 0.013377382253802418,
 0.0007093567895116641,
 0.012962630788866345,
 -0.005395061735090609,
 -0.006497774192

In [7]:
vector = embedder.embed_query("HI") 

len(vector) # 몇 개의 차원들을 가지고 있는지 확인 가능

1536

In [10]:
vectors = embedder.embed_documents(["Hi","how","are","you"])

print(len(vectors),len(vectors[0])) # 4개 벡터, 차원 개수

4 1536


In [4]:
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import UnstructuredFileLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings, CacheBackedEmbeddings # 입베딩 모듈   /// 돈 아끼기 위해 CacheBackedEmbeddings 캐시 사용
from langchain.vectorstores import Chroma # AI 기반의 오픈 소스 벡터 데이터베이스 ( 벡터 저장소, 임베딩 데이터베이스, 등)
from langchain.storage import LocalFileStore # 비용 절감용 storage

cache_dir = LocalFileStore("./.cache/") # cache폴더에 저장!


splitter = CharacterTextSplitter.from_tiktoken_encoder(
    separator="\n",
    chunk_size=600,
    chunk_overlap=100,
)
loader = UnstructuredFileLoader("./files/chapter_one.docx") # 문서 로드

docs = loader.load_and_split(text_splitter=splitter) # 문서 로드/스플릿

embeddings = OpenAIEmbeddings() # 임베딩 사용

cached_embeddings = CacheBackedEmbeddings.from_bytes_store(embeddings, cache_dir) # 캐시저장소~

# Chroma 만들기
vectorstore = Chroma.from_documents(docs, cached_embeddings) # 문서 첨부

In [1]:
# 유사도 검사
results = vectorstore.similarity_search("where does winston live") # 문서의 벡터 변환 : 벡터 공간에 대한 탐색 가능!

#비슷한 문서 로드
# 이 내용을 LLM에게 넘길 수 있음
results

NameError: name 'vectorstore' is not defined

### RetrievalQA


In [5]:
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import UnstructuredFileLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings, CacheBackedEmbeddings 
from langchain.vectorstores import Chroma 
from langchain.storage import LocalFileStore
from langchain.chains import RetrievalQA  # 임포트 

cache_dir = LocalFileStore("./.cache/")


splitter = CharacterTextSplitter.from_tiktoken_encoder(
    separator="\n",
    chunk_size=600,
    chunk_overlap=100,
)
loader = UnstructuredFileLoader("./files/chapter_one.docx") 

docs = loader.load_and_split(text_splitter=splitter) 

embeddings = OpenAIEmbeddings()

cached_embeddings = CacheBackedEmbeddings.from_bytes_store(embeddings, cache_dir)

# Chroma 만들기
vectorstore = Chroma.from_documents(docs, cached_embeddings)

llm = ChatOpenAI()
## 일종의 생성자 함수 Constructor 
chain = RetrievalQA.from_chain_type(
    llm=llm, # llm을 인자로 받음
    chain_type="stuff", # default 값 stuff
    retriever=vectorstore.as_retriever(),  # retriever : class의 인터페이스 / document(여기서는 vectorstore)를 다른 곳에서도 가져올 수 있음
)

# 질문하기
chain.run("where does Winston live?") 

'Winston Smith lives in Victory Mansions.'

In [6]:
chain.run("Describe Victory Mansions")

'Victory Mansions is a building where Winston Smith resides. It has glass doors and is described as having a hallway that smells of boiled cabbage and old rag mats. A large coloured poster of a man\'s face, about forty-five years old, with a heavy black mustache is tacked to the wall. The building lacks reliable amenities like a working lift due to the electricity being cut off during daylight hours as part of an economy drive. Inside the flat, there is a telescreen that cannot be completely shut off. The building is associated with the oppressive presence of Big Brother, as indicated by the poster\'s caption, "BIG BROTHER IS WATCHING YOU."'