# 벡터DB : Chroma vs. Pinecone
 - Chroma : 인메모리 vector DB, 로컬 vector DB
 - Pinecone : 클라우드 vector DB
 (https://www.pinecone.io 에서 API key 생성 -> .env 파일에 PINECONE_API_KEY로 등록)

# 0. 패키지 설치

In [1]:
%pip install -q pinecone langchain-pinecone

Note: you may need to restart the kernel to use updated packages.




# 1. Knowledge Base 구성을 위한 데이터 생성

In [2]:
from langchain_community.document_loaders import Docx2txtLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Source document for the knowledge base, read through the .docx loader.
loader = Docx2txtLoader('data/소득세법(법률)(제21065호)(20260102).docx')

# chunk_size / chunk_overlap bound each chunk and the text shared between
# neighbouring chunks. `separators` is not passed, so the splitter's own
# defaults apply.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=200)

# Load the file first, then cut the loaded documents into chunks.
loaded_documents = loader.load()
document_list = text_splitter.split_documents(loaded_documents)

# Last expression of the cell: displays how many chunks were produced.
len(document_list)

193

In [7]:
# Embedding model: OpenAI API, text-embedding-3-large.
from dotenv import load_dotenv
from langchain_openai import OpenAIEmbeddings

# Read the .env file into the environment before any API client is built
# (presumably this is where the OpenAI key comes from; confirm locally).
load_dotenv()

embedding = OpenAIEmbeddings(
    model="text-embedding-3-large",
)

In [10]:
%%time
# Pinecone vector database: connect to the index that backs retrieval.
import os

from pinecone import Pinecone
from langchain_pinecone import PineconeVectorStore

pc = Pinecone(
    api_key=os.getenv("PINECONE_API_KEY")
)

# Index name shared by the one-time upload snippet and the reconnect call.
# It is assigned unconditionally: previously it was only set inside the
# commented-out upload snippet, so running this cell on a fresh kernel
# raised NameError at the PineconeVectorStore(...) call below.
index_name = "tax-index"

# First-time upload only: uncomment to embed `document_list` and store it in
# the index, then comment it out again.
# NOTE(review): re-running the upload presumably stores the chunks a second
# time -- confirm before re-enabling.
# database = PineconeVectorStore.from_documents(
#     documents=document_list,
#     embedding=embedding,
#     index_name=index_name,
# )

# Reconnect to the vector store that was already uploaded.
database = PineconeVectorStore(
    embedding=embedding,  # used to embed the query for similarity search
    index_name=index_name,
)

CPU times: total: 0 ns
Wall time: 1 ms


# 2. 답변 생성을 위한 Retrieval

In [None]:
# Question to answer from the indexed document chunks.
query = "연봉이 5천만원인 직장인의 소득세는 얼마인가요?"

# Fetch the 3 stored chunks most similar to the query.
# NOTE(review): the variable name is misspelled ("retreived"); it is kept
# as-is because later cells may reference it.
retreived_docs = database.similarity_search(query=query, k=3)