In [None]:
### Data loader and Splitters ###
from typing import Text
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import TextLoader  # for txt files
from langchain.document_loaders import PyPDFLoader # for PDF files
from langchain.document_loaders import UnstructuredFileLoader  #PDF 포함 다양한 파일타입 사용가능
from langchain.text_splitter import RecursiveCharacterTextSplitter # Splitter.  문단이나 문장의 끝을 잘라줌.
                                    # 중간에 잘라먹지 않음.
from langchain.text_splitter import CharacterTextSplitter #seperator를 갖고 있다.

# Source document. UnstructuredFileLoader is used here on a .docx file.
loader = UnstructuredFileLoader("./files/chapter_one.docx")

# Splitter that cuts the text on the given separator and packs the pieces
# into overlapping chunks.
splitter = CharacterTextSplitter(
    chunk_overlap=100,  # let neighbouring chunks share some text at their edges
    chunk_size=600,     # target size of each chunk
    separator="\n",     # split the text on this separator
)

# Split option 1: load first, then split explicitly.
#   docs = loader.load()
#   splitter.split_documents(docs)

# Split option 2: load and split in a single call.
# This is the last expression, so the notebook shows the resulting chunks.
loader.load_and_split(text_splitter=splitter)


In [None]:
### Tiktoken ###
from typing import Text
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import UnstructuredFileLoader  #PDF 포함 다양한 파일타입 사용가능
from langchain.text_splitter import CharacterTextSplitter

# Document to be split in this experiment.
loader = UnstructuredFileLoader("./files/chapter_one.docx")

# Newline-separated splitter built through the tiktoken-based constructor.
splitter = CharacterTextSplitter.from_tiktoken_encoder(
    chunk_overlap=100,
    chunk_size=600,  # sized in LLM-friendly (token) units rather than characters
    separator="\n",
)

In [None]:
### embed ###
# Embedding converts human-readable text into numbers that a computer can work with.
### vectorization ### Each document we created is embedded, which yields one vector per document.
from langchain.embeddings import OpenAIEmbeddings

# Embedding client used for the demo below.
embedder = OpenAIEmbeddings()

# Single-text variant, kept for reference:
#   vector = embedder.embed_query("Hi")  # one vector representing the input
#   len(vector)                          # vector length == number of dimensions (1536 per the author's note)

# Several words/sentences can be embedded in one call.
vector = embedder.embed_documents(["Hi", "boy", "nice to meet you!"])

# Prints "<number of vectors> <dimensions per vector>".
# NOTE(review): the original note recorded "3, 1534", which conflicts with the
# 1536 recorded for embed_query above; presumably 1536 -- confirm on a real run.
print(f"{len(vector)} {len(vector[0])}")

# Embedding costs money, so results should be cached: as long as the document
# does not change, its embedding does not change either.

In [None]:
###  vector store ### 
#일종의 데이터베이스 = 벡터공간에서 검색을 할 수 있게 해줌.
# 순서: embed->caching->vector store에 저장->검색(관련있는 문서만 찾아냄)
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import UnstructuredFileLoader 
from langchain.text_splitter import CharacterTextSplitter
#무료인 local에서 구동되는 vector store인 Chroma
from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings, CacheBackedEmbeddings
from langchain.storage import LocalFileStore

# 1) Load the chapter and cut it into chunks.
loader = UnstructuredFileLoader("./files/chapter_one.docx")
splitter = CharacterTextSplitter.from_tiktoken_encoder(
    chunk_overlap=100,
    chunk_size=600,
    separator="\n",
)
docs = loader.load_and_split(text_splitter=splitter)

# 2) Pair the embedding object with a local directory used as its cache.
cache_dir = LocalFileStore("./.cache/")
embeddings = OpenAIEmbeddings()
cached_embeddings = CacheBackedEmbeddings.from_bytes_store(embeddings, cache_dir)

# 3) Initialise Chroma from the list of chunks and the cache-backed embedder.
vectorstore = Chroma.from_documents(docs, cached_embeddings)
                     

In [None]:
# Re-running the embedding code keeps incurring charges, so this cell assumes
# the vector-store cell above has already been executed once.
# Query the vector store for the chunks most similar to the question.
vectorstore.similarity_search(query="Where does winston live")

In [None]:
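### stuff chain - RetrievalQA ###
### document GPT
# Use an off-the-shelf chain here. These chains are legacy, though; the currently recommended approach is LCEL.
# A retriever is a class interface for selecting (fetching) documents.
# It can fetch documents from many places: not only a vector store,
# but also databases, cloud storage, and so on.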
from langchain.document_loaders import UnstructuredFileLoader 
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings, CacheBackedEmbeddings
from langchain.vectorstores import FAISS, Chroma
from langchain.storage import LocalFileStore
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI

# Chat model used by the off-the-shelf RetrievalQA chain.
llm = ChatOpenAI()

# Directory-backed store used to cache computed embeddings.
cache_dir = LocalFileStore("./.cache/")

splitter = CharacterTextSplitter.from_tiktoken_encoder(
    separator="\n",
    chunk_size=600,
    chunk_overlap=100,
)
loader = UnstructuredFileLoader("./files/chapter_one.docx")

docs = loader.load_and_split(text_splitter=splitter)

embeddings = OpenAIEmbeddings()

cached_embeddings = CacheBackedEmbeddings.from_bytes_store(
    embeddings, cache_dir
)

vectorstore = FAISS.from_documents(docs, cached_embeddings)  # switched from Chroma to FAISS

chain = RetrievalQA.from_chain_type(
    llm=llm,
    # The nice thing about RetrievalQA is that the strategy can be swapped
    # just by changing chain_type:
    #   chain_type="stuff",
    #   chain_type="refine",
    #   chain_type="map_reduce",
    chain_type="map_rerank",
    retriever=vectorstore.as_retriever(),
)

# --- Experiment notes (author's observations) --------------------------------
# * "stuff" produced no answer; "refine" produced the recorded answer below.
# * After switching Chroma -> FAISS the displayed text was the same.
# * "map_reduce": the displayed text was again the same, even after the cache
#   was cleared.
# * "map_rerank": no answer was received, only a deprecation warning; the
#   per-document ratings were visible in LangSmith (the more relevant the
#   document, the higher the score).
#
# NOTE(review): the recorded answer used to live in this cell as bare
# triple-quoted strings placed AFTER chain.run(...). A notebook cell displays
# only its last expression, so the cell kept showing that pasted string instead
# of the live result. That most likely explains why every run "looked the
# same". The text is kept below as a comment so the call is the last expression.
#
# Recorded answer (identical text was pasted for refine / FAISS / map_reduce):
#   The description of Victory Mansions in "1984" is further emphasized by the
#   overall grim and oppressive atmosphere of the society in which Winston
#   lives. The dilapidated state of the building, combined with the constant
#   surveillance and fear experienced by its residents, serves as a reflection
#   of the decay and despair that permeates all aspects of life under the
#   Party's control. The stark contrast between the rundown living conditions
#   within Victory Mansions and the grandeur of the Party's institutions, such
#   as the Ministry of Love, highlights the stark inequalities and oppressive
#   nature of the society depicted in the novel. Winston's daily struggles,
#   such as sacrificing his lunch and enduring the harsh taste of Victory Gin,
#   further illustrate the bleak and oppressive reality of life in Victory
#   Mansions and the wider society
# ------------------------------------------------------------------------------

# chain.run("Where does Winston live")
# Must stay the last expression of the cell so the live answer is displayed.
chain.run("Describe Victory Mansions")

In [None]:
### Stuff LCEL Chain ###
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import UnstructuredFileLoader 
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings, CacheBackedEmbeddings
from langchain.vectorstores import FAISS
from langchain.storage import LocalFileStore
from langchain.prompts import ChatPromptTemplate
from langchain.schema.runnable import RunnablePassthrough

# Chat model for the hand-built "stuff" chain; temperature is set to 0.1.
llm = ChatOpenAI(
    temperature=0.1
)

# Directory-backed store used to cache computed embeddings.
cache_dir = LocalFileStore("./.cache/")

splitter = CharacterTextSplitter.from_tiktoken_encoder(
    separator="\n",
    chunk_size=600,
    chunk_overlap=100,
)
loader = UnstructuredFileLoader("./files/chapter_one.docx")

docs = loader.load_and_split(text_splitter=splitter)

embeddings = OpenAIEmbeddings()

cached_embeddings = CacheBackedEmbeddings.from_bytes_store(embeddings, cache_dir)

vectorstore = FAISS.from_documents(docs, cached_embeddings)

retriever = vectorstore.as_retriever()

# System message carries the retrieved context; human message carries the question.
prompt = ChatPromptTemplate.from_messages([
    ("system", "You are a helpful assistant. Answer question using only the following context. If you don't know the answer just say you don't know, don't make it up:\n\n{context}"),
    ("human", "{question}"),
])

# The invoked string is fed both to the retriever (filling {context}) and,
# unchanged, to {question}. RunnablePassthrough is a simple class that just
# lets its input pass through, so the input can be reused wherever needed.
chain = {"context": retriever, "question": RunnablePassthrough()} | prompt | llm

# Fixed typo in the query ("Descrive" -> "Describe"): this exact string is used
# for both the similarity search and the question shown to the model.
chain.invoke("Describe Victory Mansions")

In [1]:
### Map Reduce LCEL Chain ###

from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import UnstructuredFileLoader 
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings, CacheBackedEmbeddings
from langchain.vectorstores import FAISS
from langchain.storage import LocalFileStore
from langchain.prompts import ChatPromptTemplate
from langchain.schema.runnable import RunnablePassthrough, RunnableLambda
                    # RunnableLambda : 어떤 chain 내부 어디서든지 function 호출할 수 있게 해줌

# --- Documents -> chunks ------------------------------------------------------
loader = UnstructuredFileLoader("./files/chapter_one.docx")
splitter = CharacterTextSplitter.from_tiktoken_encoder(
    chunk_overlap=100,
    chunk_size=600,
    separator="\n",
)
docs = loader.load_and_split(text_splitter=splitter)

# --- Chunks -> cache-backed embeddings -> FAISS retriever ---------------------
cache_dir = LocalFileStore("./.cache/")
embeddings = OpenAIEmbeddings()
cached_embeddings = CacheBackedEmbeddings.from_bytes_store(embeddings, cache_dir)
vectorstore = FAISS.from_documents(docs, cached_embeddings)
retriever = vectorstore.as_retriever()

# --- Model and the per-document ("map") prompt --------------------------------
llm = ChatOpenAI(temperature=0.1)

# Asks the model to return, verbatim, any text from one document portion that
# is relevant to the question.
map_doc_prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            """Use the following portion of a long document to see if any of the text is relevant to answer the question. Return any relevant text verbatim.
        -------
        {context}
        """,
        ),
        ("human", "{question}"),
    ]
)

# Prompt piped into the model: one invocation handles one document portion.
map_doc_chain = map_doc_prompt | llm

def map_docs(inputs):
    """Run the per-document chain on every document and merge the answers.

    Args:
        inputs: Mapping with two keys:
            "documents" - iterable of objects exposing ``page_content``.
            "question"  - the question passed along to each invocation.

    Returns:
        A single string: the ``content`` of each ``map_doc_chain`` response,
        in document order, joined by a blank line.
    """
    documents = inputs["documents"]
    question = inputs["question"]

    extracts = []
    for doc in documents:
        # One model call per document; the response object exposes `.content`.
        response = map_doc_chain.invoke(
            {"context": doc.page_content, "question": question}
        )
        extracts.append(response.content)

    # Merge the per-document extracts into one string for the final prompt.
    return "\n\n".join(extracts)

# "Reduce" prompt: builds the final answer from the merged extracts.
final_prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            """
     Given the following extracted parts of a long document and a question, create a final answer. If you don't know the answer, just say that you don't know. Don't try to make up an answer.
     ------
     {context}
     """,
        ),
        ("human", "{question}"),
    ]
)

# "Map" stage: the retriever supplies the documents, the question passes through
# untouched, and RunnableLambda lets the plain function map_docs run in the chain.
map_chain = {
    "documents": retriever,
    "question": RunnablePassthrough(),
} | RunnableLambda(map_docs)

# Full pipeline: merged extracts become {context}; the original input is {question}.
chain = (
    {"context": map_chain, "question": RunnablePassthrough()}
    | final_prompt
    | llm
)

chain.invoke("Describe Victory Mansion")

AIMessage(content='Victory Mansion is a dilapidated apartment complex located in London, chief city of Airstrip One, in the novel "1984" by George Orwell. It is described as a run-down building with cramped living spaces, poor conditions, and a faulty lift. The hallway smells of boiled cabbage and old rag mats, with a large colored poster depicting an enormous face of a man with a black mustache. The building lacks basic amenities, and the residents are constantly under surveillance by the Party, as indicated by posters with the caption "BIG BROTHER IS WATCHING YOU." The atmosphere is one of fear and paranoia, reflecting the oppressive and austere environment of the dystopian society in which Winston, the protagonist, lives.')