# Retriever

In [1]:
!pip install chromadb tiktoken transformers sentence_transformers openai langchain pypdf

Collecting chromadb
  Downloading chromadb-0.5.0-py3-none-any.whl (526 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m526.8/526.8 kB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting tiktoken
  Downloading tiktoken-0.7.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m27.1 MB/s[0m eta [36m0:00:00[0m
Collecting sentence_transformers
  Downloading sentence_transformers-2.7.0-py3-none-any.whl (171 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m171.5/171.5 kB[0m [31m16.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting openai
  Downloading openai-1.30.1-py3-none-any.whl (320 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m320.6/320.6 kB[0m [31m19.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting langchain
  Downloading langchain-0.2.0-py3-none-any.whl (973 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [5]:
!pip install langchain_community

Collecting langchain_community
  Downloading langchain_community-0.2.0-py3-none-any.whl (2.1 MB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/2.1 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.2/2.1 MB[0m [31m6.1 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m2.1/2.1 MB[0m [31m34.4 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m21.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: langchain_community
Successfully installed langchain_community-0.2.0


In [None]:
from google.colab import drive
# Mount Google Drive into the Colab filesystem at /content/drive (Colab-only step).
drive.mount('/content/drive')

In [2]:
import os
import openai

In [3]:
import tiktoken

# Tokenizer used to break text into tokens (cl100k_base encoding)
tokenizer = tiktoken.get_encoding("cl100k_base")


def tiktoken_len(text):
    """Return the number of tokens the module-level `tokenizer` produces for `text`."""
    return len(tokenizer.encode(text))

In [6]:
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.document_loaders import PyPDFLoader

In [None]:
# Load the PDF and split it into documents.
# NOTE(review): the path " " looks like a placeholder; point it at the real PDF before running.
loader = PyPDFLoader(file_path=" ")
pages = loader.load_and_split()

In [None]:
# Split the loaded pages into chunks targeting 500 tokens with a 50-token overlap;
# chunk length is measured in tokens via tiktoken_len rather than in characters.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=50,
    length_function=tiktoken_len,
)
texts = text_splitter.split_documents(pages)

from langchain.embeddings import HuggingFaceEmbeddings

# Turn the chunks into embedding vectors with a Hugging Face model
model_name = "jhgan/ko-sbert-nli"
model_kwargs = dict(device='cpu')  # run the embedding model on CPU
encode_kwargs = dict(normalize_embeddings=True)
hf = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs,
)

# Store the embedded chunks in a Chroma vector store
docsearch = Chroma.from_documents(documents=texts, embedding=hf)

### MMR Retriever

- 유사도 + 다양성을 동시에 고려 / 중복된 내용 문서 제거

In [None]:
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler

# Chat model that streams generated tokens to stdout.
# Bound to `chat_llm` rather than `openai` so the `openai` module imported earlier
# in the notebook is not shadowed by a model instance.
chat_llm = ChatOpenAI(
    model_name="gpt-3.5-turbo",
    streaming=True,
    callbacks=[StreamingStdOutCallbackHandler()],
    temperature=0,
)

# "stuff" chain: the retrieved chunks are placed directly into the prompt as context.
qa = RetrievalQA.from_chain_type(
    llm=chat_llm,
    chain_type="stuff",
    # Use the Chroma store as an MMR retriever: fetch the 10 most relevant chunks,
    # pick the 3 most diverse among them, and pass only those to the LLM.
    retriever=docsearch.as_retriever(
        search_type="mmr",
        search_kwargs={'k': 3, 'fetch_k': 10},
    ),
    return_source_documents=True,
)

query = " "
# `invoke` replaces the deprecated `qa(query)` call style; the returned dict is the same
# (answer under "result", retrieved chunks under "source_documents").
result = qa.invoke({"query": query})

### BM25 Retriever

- 문서 길이, 단어 빈도 -> 문서-쿼리 연관성
  
  단점: 의미적 관계 고려X -> 단점 보완: 앙상블 리트리버(BM25+임베딩)

### 앙상블 Retriever

- 키워드 기반 + 의미론적 유사성 (BM25 + 임베딩)

In [None]:
from langchain.retrievers import (
    BM25Retriever,
    ContextualCompressionRetriever,
    EnsembleRetriever,
)
from langchain.retrievers.document_compressors import LLMChainExtractor
from langchain.chains import RetrievalQA
from langchain.llms import OpenAI
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler

# Keyword-based (BM25) retriever built directly from the split chunks.
# BM25Retriever relies on the `rank_bm25` package, which the install cell does not
# list; install it first (`%pip install rank_bm25`) if it is missing.
bm25_retriever = BM25Retriever.from_documents(
    texts,
    bm25_params={'k1': 1.2, 'b': 0.75},
)
bm25_retriever.k = 3  # return the top 3 chunks, matching the embedding retriever

# Embedding-based retriever. Despite the variable name, it is backed by the Chroma
# store (`docsearch`) built earlier in the notebook, not by FAISS.
# It fetches 10 candidates and keeps 3.
faiss_retriever = docsearch.as_retriever(
    search_type="mmr",
    search_kwargs={'k': 3, 'fetch_k': 10},
)

# Retrievers combined by the ensemble
retrievers = [bm25_retriever, faiss_retriever]

# Ensemble retriever: merges both result lists, weighting the two retrievers equally.
ensemble_retriever = EnsembleRetriever(
    retrievers=retrievers,
    weights=[0.5, 0.5],
)

# Optional document compression. EnsembleRetriever has no compressor argument, so the
# compressor is applied by wrapping the ensemble in a ContextualCompressionRetriever.
compressor = LLMChainExtractor.from_llm(OpenAI(temperature=0.5))
compression_retriever = ContextualCompressionRetriever(
    base_compressor=compressor,
    base_retriever=ensemble_retriever,
)

# Callback handler that prints tokens to stdout as they stream in
callback_handler = StreamingStdOutCallbackHandler()

# RetrievalQA chain; `streaming=True` is needed for the callback to receive tokens.
qa = RetrievalQA.from_chain_type(
    llm=OpenAI(temperature=0.5, streaming=True, callbacks=[callback_handler]),
    chain_type="stuff",
    retriever=compression_retriever,
    return_source_documents=True,
)

# Run the question (`invoke` replaces the deprecated `qa(query)` call style)
query = ""
result = qa.invoke({"query": query})

### MultiQuery Retriever

- 주어진 사용자 입력 쿼리에 대해 다양한 관점에서 여러 쿼리를 자동으로 생성하는 LLM -> 프롬프트 튜닝 과정 자동화

- 각각의 쿼리에서 관련 문서 집합 검색 -> 모든 쿼리를 아우르는 고유 문서들의 합집합 -> 더 큰 문서 집합

In [None]:
from langchain.chains import RetrievalQA
from langchain.llms import OpenAI
from langchain.retrievers import MultiQueryRetriever
from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings

# Open the Chroma vector store persisted under 'chroma_data', using OpenAI embeddings.
# NOTE(review): no cell in this notebook writes to 'chroma_data'. The directory must
# already hold documents embedded with OpenAIEmbeddings, otherwise retrieval finds
# nothing. This also rebinds `docsearch`, replacing the store built earlier.
embeddings = OpenAIEmbeddings()
docsearch = Chroma(embedding_function=embeddings, persist_directory='chroma_data')

# One LLM both generates the alternative queries and answers the question.
# Bound to `llm` rather than `openai` so the `openai` module is not shadowed.
llm = OpenAI(temperature=0)

# MultiQueryRetriever is built with `from_llm`: the LLM rewrites the user query from
# several perspectives, each variant is run against the base retriever, and the unique
# union of the retrieved documents is returned.
# `include_original=True` also searches with the unmodified user query.
retriever = MultiQueryRetriever.from_llm(
    retriever=docsearch.as_retriever(
        search_type="mmr",
        search_kwargs={'k': 3, 'fetch_k': 10},
    ),
    llm=llm,
    include_original=True,
)

# RetrievalQA chain (Python LangChain has no `RetrievalQAChain` class)
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
    return_source_documents=True,
)

# Run the question (`invoke` replaces the deprecated `qa(query)` call style)
query = ""
result = qa.invoke({"query": query})

In [None]:
# Display the dict returned by the most recent `qa` call
result