# FAISS

In [11]:
from langchain.text_splitter import CharacterTextSplitter
from dotenv import load_dotenv
from langchain_openai import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains.question_answering import load_qa_chain
from langchain.llms import OpenAI
from langchain.callbacks import get_openai_callback
from IPython.display import JSON
import os
import openai

In [2]:

text = """Generative AI, or generative artificial intelligence, is a type of AI system that can generate text, images, and other media in response to prompts.
Rather than merely analyzing existing data, it focuses on creating new content, marking a distinct field within AI. It began gaining significant popularity around 2022.

Through training on source data, it is used to generate a wide range of content such as novels, images, videos, code, music, and art. In South Korea, interest surged in 2022 with the emergence of image-generation AIs like Novel AI. Internationally, various models such as Midjourney and ChatGPT were released one after another, drawing widespread attention.

In typical deep learning AI, the encoding process—converting original data into an array-type numerical format before learning or producing results—is crucial. For generative AI, however, the decoding process is equally important. This is where the AI’s output data is converted back into the desired form, such as images or text.

This technology has essentially driven the popularization of artificial intelligence and has dramatically transformed public perception of AI.
"""
#CharacterTextSplitter을 이용한 청킹
splitter = CharacterTextSplitter(
separator="\n",
chunk_size=300,
chunk_overlap=50,
length_function=len
)

#주어진 문장 청킹
chunks = splitter.split_text(text)
print(chunks)

Created a chunk of size 357, which is longer than the specified 300
Created a chunk of size 330, which is longer than the specified 300


['Generative AI, or generative artificial intelligence, is a type of AI system that can generate text, images, and other media in response to prompts.', 'Rather than merely analyzing existing data, it focuses on creating new content, marking a distinct field within AI. It began gaining significant popularity around 2022.', 'Through training on source data, it is used to generate a wide range of content such as novels, images, videos, code, music, and art. In South Korea, interest surged in 2022 with the emergence of image-generation AIs like Novel AI. Internationally, various models such as Midjourney and ChatGPT were released one after another, drawing widespread attention.', 'In typical deep learning AI, the encoding process—converting original data into an array-type numerical format before learning or producing results—is crucial. For generative AI, however, the decoding process is equally important. This is where the AI’s output data is converted back into the desired form, such a

In [13]:
load_dotenv()
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
# embedding model
embeddings = OpenAIEmbeddings(model="text-embedding-ada-002", openai_api_key=OPENAI_API_KEY)

#save text embeddings into FAISS FAISS(Vector Store)
knowledge_base = FAISS.from_texts(chunks, embeddings)

In [14]:
print(knowledge_base)

<langchain_community.vectorstores.faiss.FAISS object at 0x0000016A4718E480>


In [15]:
question = "what is generative AI?"
references = knowledge_base.similarity_search(question)

In [16]:
from langchain.chat_models import ChatOpenAI

llm = ChatOpenAI(model="gpt-3.5-turbo", openai_api_key=OPENAI_API_KEY)


chain = load_qa_chain(llm, chain_type="stuff") #Q&A Interface, 하지만 모든 document를 가져온다
with get_openai_callback() as cb:
    response = chain.run(input_documents=references, question=question)
    print(cb)

  llm = ChatOpenAI(model="gpt-3.5-turbo", openai_api_key=OPENAI_API_KEY)
stuff: https://python.langchain.com/docs/versions/migrating_chains/stuff_docs_chain
map_reduce: https://python.langchain.com/docs/versions/migrating_chains/map_reduce_chain
refine: https://python.langchain.com/docs/versions/migrating_chains/refine_chain
map_rerank: https://python.langchain.com/docs/versions/migrating_chains/map_rerank_docs_chain

See also guides on retrieval and question-answering here: https://python.langchain.com/docs/how_to/#qa-with-rag
  chain = load_qa_chain(llm, chain_type="stuff") #Q&A Interface, 하지만 모든 document를 가져온다
  response = chain.run(input_documents=references, question=question)


Tokens Used: 313
	Prompt Tokens: 261
		Prompt Tokens Cached: 0
	Completion Tokens: 52
		Reasoning Tokens: 0
Successful Requests: 1
Total Cost (USD): $0.0004955000000000001


In [17]:

print(response)

Generative AI, or generative artificial intelligence, is a type of AI system that can generate text, images, and other media in response to prompts. It focuses on creating new content rather than just analyzing existing data, making it a distinct field within AI.


# Chroma

In [18]:
from langchain.document_loaders import PyPDFLoader

# PDF
loaders = [
    PyPDFLoader("e:/data/example1.pdf"),
    PyPDFLoader("e:/data/example2.pdf")
]
docs = []
for loader in loaders:
    docs.extend(loader.load())

ValueError: File path e:/data/스마트농업_육성사업_추진현황과_개선과제.pdf is not a valid file or url

In [None]:
# Chunking with Splitter
from langchain.text_splitter import RecursiveCharacterTextSplitter
from openai import OpenAI
client = OpenAI(
  openai_api_key=OPENAI_API_KEY
)

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 1500,
    chunk_overlap = 150
)

#doc chunking with splitter
splits = text_splitter.split_documents(docs)
print(splits)

In [None]:
from langchain.vectorstores import Chroma
from langchain.embeddings.openai import OpenAIEmbeddings
#embedding model
embedding = OpenAIEmbeddings(model="text-embedding-ada-002", openai_api_key=OPENAI_API_KEY)

In [None]:
#chroma vector storage path
persist_directory = 'e:/data/chroma/'

#vector db
vectordb = Chroma.from_documents(
    documents=splits,
    embedding=embedding,
    persist_directory=persist_directory
)

print(vectordb._collection.count())

In [None]:
question = "한국형 스마트팜이란?"

docs = vectordb.similarity_search(question,k=3)  #return 3 docs

# doc length
print(len(docs))

# check the first doc content
print(docs[0].page_content)

# permanent storage
vectordb.persist()

In [None]:
question = "필요한 ICT 기술은?"

docs = vectordb.similarity_search(question,k=5) #return 5 docs

# check the first 2 doc contents
print(docs[0])
print('\n\n', docs[1])

# Hybrid Search

In [20]:
from langchain.retrievers import EnsembleRetriever # perform several retrievers
from langchain_community.retrievers import BM25Retriever  #TF-IDF search algorithm
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings

In [22]:
doc_list_1 = [
    "프렌치 불독: 사교적이고 친근한 성격을 가지고 있으며, 조용하고 집에서 지내기에 적합 합니다",
    "비글: 호기심이 많고, 에너지가 넘치며, 사냥 본능이 강합니다. ",
    "독일 셰퍼드: 용감하고 지능적이며, 충성심이 강합니다",
    "포메라니안: 활발하고 호기심이 많으며, 주인에게 매우 애정적입니다",
    "치와와: 작지만 용감하고, 주인에게 깊은 애정을 보입니다",
    "보더 콜리:	매우 지능적이고 학습 능력이 뛰어나며, 에너지가 많아 많은 운동이 필요합니다 "
]

# initialize the bm25 retriever and faiss retriever
bm25_retriever = BM25Retriever.from_texts(
    doc_list_1, metadatas=[{"source": 1}] * len(doc_list_1)
)
bm25_retriever.k = 2

doc_list_2 = [
    "프렌치 불독: 열에 약하므로 주의가 필요합니다",
    "비글: 가족과 잘 지내며, 아이들과 노는 것을 좋아합니다.",
    "독일 셰퍼드: 경찰견이나 구조견으로 많이 활용되며, 적절한 훈련과 운동이 필요합니다.",
    "포메라니안: 털이 풍성하므로 정기적인 그루밍이 필요합니다.",
    "치와와: 다른 동물이나 낯선 사람에게는 조심스러울 수 있습니다.",
    "보더 콜리: 목축견으로서의 본능이 강하며, 다양한 트릭과 명령을 쉽게 배울 수 있습니다."
]


#chuncking would be needed for actual cases (larger contents)
embedding = OpenAIEmbeddings(model="text-embedding-ada-002", openai_api_key=OPENAI_API_KEY)
faiss_vectorstore = FAISS.from_texts(
    doc_list_2, embedding, metadatas=[{"source": 2}] * len(doc_list_2)
)
faiss_retriever = faiss_vectorstore.as_retriever(search_kwargs={"k": 2})

# initialize the ensemble retriever
ensemble_retriever = EnsembleRetriever(
    retrievers=[bm25_retriever, faiss_retriever], weights=[0.5, 0.5] #retriever weight (to decide what retriever should we consider more between bm25 and faiss)
)

In [23]:
query = "충성심이 강한 강아지는?"
ensemble_result = ensemble_retriever.get_relevant_documents(query)
bm25_result = bm25_retriever.get_relevant_documents(query)
faiss_result = faiss_retriever.get_relevant_documents(query)

# 가져온 문서를 출력합니다.
print("[Ensemble Retriever]\n", ensemble_result, end="\n\n")
print("[BM25 Retriever]\n", bm25_result, end="\n\n")
print("[FAISS Retriever]\n", faiss_result, end="\n\n")

  ensemble_result = ensemble_retriever.get_relevant_documents(query)


[Ensemble Retriever]
 [Document(metadata={'source': 1}, page_content='독일 셰퍼드: 용감하고 지능적이며, 충성심이 강합니다'), Document(id='10946cbc-d7b6-4561-86fd-4061e8b6b4a8', metadata={'source': 2}, page_content='치와와: 다른 동물이나 낯선 사람에게는 조심스러울 수 있습니다.'), Document(metadata={'source': 1}, page_content='보더 콜리:\t매우 지능적이고 학습 능력이 뛰어나며, 에너지가 많아 많은 운동이 필요합니다 '), Document(id='e2e09cce-8305-4393-9781-f4fd93f56647', metadata={'source': 2}, page_content='보더 콜리: 목축견으로서의 본능이 강하며, 다양한 트릭과 명령을 쉽게 배울 수 있습니다.')]

[BM25 Retriever]
 [Document(metadata={'source': 1}, page_content='독일 셰퍼드: 용감하고 지능적이며, 충성심이 강합니다'), Document(metadata={'source': 1}, page_content='보더 콜리:\t매우 지능적이고 학습 능력이 뛰어나며, 에너지가 많아 많은 운동이 필요합니다 ')]

[FAISS Retriever]
 [Document(id='10946cbc-d7b6-4561-86fd-4061e8b6b4a8', metadata={'source': 2}, page_content='치와와: 다른 동물이나 낯선 사람에게는 조심스러울 수 있습니다.'), Document(id='e2e09cce-8305-4393-9781-f4fd93f56647', metadata={'source': 2}, page_content='보더 콜리: 목축견으로서의 본능이 강하며, 다양한 트릭과 명령을 쉽게 배울 수 있습니다.')]



In [24]:
query = "지능적인 강아지는?"
ensemble_result = ensemble_retriever.get_relevant_documents(query)
bm25_result = bm25_retriever.get_relevant_documents(query)
faiss_result = faiss_retriever.get_relevant_documents(query)

# 가져온 문서를 출력합니다.
print("[Ensemble Retriever]\n", ensemble_result, end="\n\n")
print("[BM25 Retriever]\n", bm25_result, end="\n\n")
print("[FAISS Retriever]\n", faiss_result, end="\n\n")

[Ensemble Retriever]
 [Document(metadata={'source': 1}, page_content='보더 콜리:\t매우 지능적이고 학습 능력이 뛰어나며, 에너지가 많아 많은 운동이 필요합니다 '), Document(id='e2e09cce-8305-4393-9781-f4fd93f56647', metadata={'source': 2}, page_content='보더 콜리: 목축견으로서의 본능이 강하며, 다양한 트릭과 명령을 쉽게 배울 수 있습니다.'), Document(metadata={'source': 1}, page_content='치와와: 작지만 용감하고, 주인에게 깊은 애정을 보입니다'), Document(id='10946cbc-d7b6-4561-86fd-4061e8b6b4a8', metadata={'source': 2}, page_content='치와와: 다른 동물이나 낯선 사람에게는 조심스러울 수 있습니다.')]

[BM25 Retriever]
 [Document(metadata={'source': 1}, page_content='보더 콜리:\t매우 지능적이고 학습 능력이 뛰어나며, 에너지가 많아 많은 운동이 필요합니다 '), Document(metadata={'source': 1}, page_content='치와와: 작지만 용감하고, 주인에게 깊은 애정을 보입니다')]

[FAISS Retriever]
 [Document(id='e2e09cce-8305-4393-9781-f4fd93f56647', metadata={'source': 2}, page_content='보더 콜리: 목축견으로서의 본능이 강하며, 다양한 트릭과 명령을 쉽게 배울 수 있습니다.'), Document(id='10946cbc-d7b6-4561-86fd-4061e8b6b4a8', metadata={'source': 2}, page_content='치와와: 다른 동물이나 낯선 사람에게는 조심스러울 수 있습니다.')]

