<a href="https://colab.research.google.com/github/JSJeong-me/KOSA_ChatGPT_0531/blob/main/LangChain-txt-QA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 여러 문서에서 찾아서 답변하는 챗봇 만들기




- QA ChatBot
- LangChain
- ChatGPT (gpt-3.5-turbo)
- ChromaDB

> Reference: https://youtu.be/3yPBVii7Ct0

In [None]:
!pip install -q langchain openai tiktoken chromadb

## 여러 문서

> TechCrunch 기사 21개

In [None]:
!wget -q https://github.com/kairess/toy-datasets/raw/master/techcrunch-articles.zip
!unzip -q techcrunch-articles.zip -d articles

## Setting up LangChain

OpenAI API Key

https://platform.openai.com/account/api-keys

In [None]:
import os
from getpass import getpass

# Never store an API key in the notebook: everything saved here is published with the file.
# The literal key that used to be assigned in this cell must be treated as leaked and
# revoked at https://platform.openai.com/account/api-keys.
# Reuse a key already present in the environment; otherwise prompt for it without echoing.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")

In [None]:
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import DirectoryLoader
from langchain.document_loaders import TextLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.llms import OpenAI
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma

## Load and process multiple documents

In [None]:
# Load every .txt file directly under ./articles, using TextLoader for each file.
# (To load one file only, build the loader as TextLoader('single_text_file.txt') instead.)
loader = DirectoryLoader('./articles', glob="*.txt", loader_cls=TextLoader)

documents = loader.load()

# Fail fast: an empty corpus would otherwise flow silently into splitting and embedding.
assert documents, "No .txt documents were loaded from ./articles - re-run the download/unzip cell."

len(documents)

## Split texts

In [None]:
# Chunking parameters, named so they can be tuned in one place.
CHUNK_SIZE = 1000     # passed as chunk_size to the splitter
CHUNK_OVERLAP = 200   # passed as chunk_overlap to the splitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP)

# Split the loaded documents into the chunks that get embedded below.
texts = text_splitter.split_documents(documents)

len(texts)

In [None]:
# Peek at two of the chunks (indices 2 and 3) to see what the splitter produced.
texts[2:4]

## Create Chroma DB

1. Text -> Embeddings
2. `db` 폴더에 데이터 저장
3. DB 초기화
4. `db` 폴더로부터 DB 로드

In [None]:
# Folder the Chroma collection is saved to (and reloaded from further down).
persist_directory = 'db'

# Embedding function handed to Chroma for the chunks below.
embedding = OpenAIEmbeddings()

# Build the vector store from the split chunks.
# NOTE(review): re-running this cell against an existing `db` folder presumably adds the
# same chunks a second time - confirm, and clear the folder before re-running if so.
vectordb = Chroma.from_documents(
    documents=texts,
    embedding=embedding,
    persist_directory=persist_directory)

In [None]:
# Save the collection to `persist_directory`, then drop the in-memory handle so that the
# next cell has to load the store back from disk.
vectordb.persist()
vectordb = None

In [None]:
# Reopen the store saved in `persist_directory`. The same embedding function is passed
# again (presumably so queries are embedded the same way as the stored chunks - confirm).
vectordb = Chroma(
    persist_directory=persist_directory,
    embedding_function=embedding)

## Make a retriever

In [None]:
# Expose the vector store as a retriever, using its default search settings.
retriever = vectordb.as_retriever()

In [None]:
# Sanity-check retrieval on its own (no LLM involved) with a sample question.
docs = retriever.get_relevant_documents("What is Generative AI?")

# Print the `source` metadata entry of every returned chunk.
for doc in docs:
    print(doc.metadata["source"])

### 결과를 k개 반환

In [None]:
# Rebuild the retriever so that each query returns k=3 results.
retriever = vectordb.as_retriever(search_kwargs={"k": 3})

In [None]:
# Repeat the sample question against the k=3 retriever.
docs = retriever.get_relevant_documents("What is Generative AI?")

# Print the `source` metadata entry of every returned chunk.
for doc in docs:
    print(doc.metadata["source"])

## Make a chain

In [None]:
# This notebook targets ChatGPT (gpt-3.5-turbo), so request that chat model explicitly;
# a bare OpenAI() builds the completion-model wrapper with the library's default model.
# temperature=0 keeps the answers as repeatable as possible between runs.
qa_chain = RetrievalQA.from_chain_type(
    llm=ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0),
    chain_type="stuff",  # "stuff": all retrieved chunks are placed into a single prompt
    retriever=retriever,
    return_source_documents=True)  # responses also carry the chunks used for the answer

In [None]:
def process_llm_response(llm_response):
    """Print a chain response: the answer first, then the source of each cited document.

    Args:
        llm_response: Mapping with a 'result' entry (the answer) and a
            'source_documents' entry (an iterable of objects whose
            ``metadata`` mapping has a 'source' key).

    Returns:
        None. Everything is written to standard output.
    """
    answer = llm_response['result']
    # Answer on its own line, followed by two blank lines and the "Sources:" heading.
    print(answer, '\n\nSources:', sep='\n')

    cited_documents = llm_response["source_documents"]
    for document in cited_documents:
        print(document.metadata['source'])

## Query

In [None]:
# Calling the chain with a question string returns a mapping; process_llm_response reads
# its 'result' and 'source_documents' entries and prints them.
query = "How much money did Pando raise?"
llm_response = qa_chain(query)
process_llm_response(llm_response)

In [None]:
# Show the raw response object of the previous query.
llm_response

In [None]:
# Follow-up question on the same topic as the previous query.
query = "Who led the round in Pando?"
llm_response = qa_chain(query)
process_llm_response(llm_response)

In [None]:
# Ask about a different company and print the answer with its sources.
query = "What did Databricks acquire?"
llm_response = qa_chain(query)
process_llm_response(llm_response)

In [None]:
# Same question that was used above to test the retriever, now answered by the full chain.
query = "What is Generative AI?"
llm_response = qa_chain(query)
process_llm_response(llm_response)

In [None]:
# Ask what an abbreviation refers to and print the answer with its sources.
query = "Who is CMA?"
llm_response = qa_chain(query)
process_llm_response(llm_response)