# RAG | ChromaDB
- pip install langchain_community
- pip install langchain_openai
- pip install langchain_chroma
- pip install pymupdf

### 모듈 불러오기

In [1]:
from langchain_community.document_loaders import PyPDFium2Loader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer
from langchain_chroma import Chroma
from langchain_core.prompts import PromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceBgeEmbeddings
import chromadb
from chromadb.config import Settings

- 현재 위치 확인

In [2]:
import os
os.getcwd()

'c:\\Wanted\\projectfiles\\KOMI_PJT\\LLM_Project\\KHS\\tests'

### 문서 불러오기
- pip install unstructured
- pip install pdfminer

In [4]:
# PDF 파일 2개 로드
loader1 = PyPDFium2Loader("../data/squat1.pdf")
docs1 = loader1.load()

loader2 = PyPDFium2Loader("../data/squat2.pdf")
docs2 = loader2.load()

# 하나로 합치기
all_docs = docs1 + docs2
len(all_docs)

31

### 문서 분할

In [5]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
split_docs = text_splitter.split_documents(all_docs)
len(split_docs)

100

### 문서 임베딩
- HuggingFace Transformers 기반
- SentenceTransformer("all-MiniLM-L6-v2")
- 이름: all-MiniLM-L6-v2
- 구조: MiniLM (Transformer 기반 소형 모델)
- 버전: v2 (Hugging Face에 공개된 두 번째 개정 모델)
- 임베딩 차원 수: 384차원
- 입력 길이: 최대 256 토큰

In [6]:
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

texts = [doc.page_content for doc in split_docs]
embeddings = embedding_model.encode(texts).tolist()
metadatas = [doc.metadata for doc in split_docs]
ids = [f"doc_{i}" for i in range(len(split_docs))]

### ChromaDB 저장

In [7]:
client = chromadb.PersistentClient(path="../chromaDB/pdf_docs")
collection = client.get_or_create_collection("squat_documents")

collection.add(
    documents=texts,
    embeddings=embeddings,
    metadatas=metadatas,
    ids=ids
)

print("✅ PDF 2개의 데이터가 ChromaDB에 저장되었습니다.")

Insert of existing embedding ID: doc_0
Insert of existing embedding ID: doc_1
Insert of existing embedding ID: doc_2
Insert of existing embedding ID: doc_3
Insert of existing embedding ID: doc_4
Insert of existing embedding ID: doc_5
Insert of existing embedding ID: doc_6
Insert of existing embedding ID: doc_7
Insert of existing embedding ID: doc_8
Insert of existing embedding ID: doc_9
Insert of existing embedding ID: doc_10
Insert of existing embedding ID: doc_11
Insert of existing embedding ID: doc_12
Insert of existing embedding ID: doc_13
Insert of existing embedding ID: doc_14
Insert of existing embedding ID: doc_15
Insert of existing embedding ID: doc_16
Insert of existing embedding ID: doc_17
Insert of existing embedding ID: doc_18
Insert of existing embedding ID: doc_19
Insert of existing embedding ID: doc_20
Insert of existing embedding ID: doc_21
Insert of existing embedding ID: doc_22
Insert of existing embedding ID: doc_23
Insert of existing embedding ID: doc_24
Insert of 

✅ PDF 2개의 데이터가 ChromaDB에 저장되었습니다.


- retriever

In [None]:
# 검색기 만들기
embedding = HuggingFaceBgeEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)

vectorstore = Chroma(
    collection_name="squat_documents",
    persist_directory="../chromaDB/pdf_docs",
    embedding_function=embedding
)

retriever = vectorstore.as_retriever(
    search_type="similarity",
    search_kawrgs={'k': 3}
)