In [1]:
from dotenv import load_dotenv
# Load variables from a .env file into the process environment
# (presumably the OpenAI API key needed by the later cells; confirm).
load_dotenv()

True

In [2]:
# Pretty-print retriever output with rich
from rich.console import Console
from rich.table import Table

# Shared console that rich_docs() prints its table to.
console = Console()

def rich_docs(docs, max_len=140, title="Retriever Results"):
    """Print retrieved documents as a rich table: index, source file, page, preview.

    Args:
        docs: Iterable of document objects exposing ``metadata`` (dict or None)
            and ``page_content`` (str or None). ``None`` is treated as empty.
        max_len: Maximum number of preview characters per document; longer
            text is cut and suffixed with "…".
        title: Title displayed above the table.

    Returns:
        None. The table is printed through the module-level ``console``.
    """
    table = Table(title=title)
    table.add_column("#", justify="right")
    table.add_column("Source")
    table.add_column("Page", justify="right")
    table.add_column("Preview")

    for i, d in enumerate(docs or [], 1):
        m = d.metadata or {}

        # Keep only the file name. Backslashes are normalised first so that
        # Windows-style paths are shortened the same way as POSIX ones.
        src = str(m.get("source") or "").replace("\\", "/").rsplit("/", 1)[-1]

        # Prefer the human-readable page label. The 0-based "page" index is
        # consulted only when the label is missing, so a missing or
        # non-numeric "page" can no longer raise when a label is available.
        label = m.get("page_label")
        if label is None:
            page_idx = m.get("page", 0)
            label = page_idx + 1 if isinstance(page_idx, int) else ""
        page = str(label)

        # Collapse newlines so each preview stays on a single table row.
        text = (d.page_content or "").strip().replace("\n", " ")
        preview = text[:max_len] + ("…" if len(text) > max_len else "")
        table.add_row(str(i), src, page, preview)

    console.print(table)

### retriever 설정값
- 일반 RAG 기본값 : similarity or mmr 
- 중복이 많을 경우 : mmr 
- 그 외 필터링이 필요한 경우 : `search_kwargs`에 `filter` 등의 옵션값을 넣어주면 됩니다.

### advanced_retriever 
- 문서 길이가 길 경우 : compressed_retriever -> 필요할 때 Parent-child retriever
- 용어가 중요할 경우 : hybrid(vec + bm25)
- 정확도 극대화 : similarity -> reranker -> reorder 


In [3]:
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_chroma import Chroma 

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
import os

# Embedding function handed to Chroma for embedding queries
# (presumably the same model the collection was built with; confirm).
emd = OpenAIEmbeddings(model="text-embedding-3-small")

# Location of the persisted Chroma store. The original machine-specific path
# stays as the default, but CHROMA_STORE_DIR (settable in the environment or
# in .env) overrides it so the notebook can run elsewhere without edits.
db_path = os.environ.get(
    "CHROMA_STORE_DIR",
    r"C:\PythonProject\potenup_10\7_vectorstore\chroma_store",
)
col_name = "samsung_all"
vecstore = Chroma(
    embedding_function=emd,
    collection_name=col_name,
    persist_directory=db_path,
)

# Sanity check: number of stored records. This goes through the private
# `_collection` attribute of the wrapper. A count of 0 would suggest that
# db_path or col_name does not point at the populated store.
vecstore._collection.count()

946

In [5]:
# Embed a sample query to check the embedding dimensionality.
# Note: despite its name, `dim_size` holds the embedding vector itself;
# the dimensionality is its length, which is what gets printed.
dim_size = emd.embed_query("안녕하세요")
print(len(dim_size))

1536


In [6]:
# 1. Similarity-based retrieval: ask for the k=5 most similar chunks.
question = "삼성의 지속 가능성에 대해 알려줘"

sim_kwargs = {"k": 5}
ret_sim = vecstore.as_retriever(search_type="similarity", search_kwargs=sim_kwargs)

result = ret_sim.invoke(question)
rich_docs(result)

In [7]:
# 2. MMR-based retrieval, for when the plain similarity hits overlap heavily.
# Per the LangChain docs: fetch_k candidates are fetched first, k of them are
# kept, and lambda_mult trades relevance against diversity.
mmr_kwargs = {"k": 5, "fetch_k": 20, "lambda_mult": 0.5}
ret_mmr = vecstore.as_retriever(search_type="mmr", search_kwargs=mmr_kwargs)

result = ret_mmr.invoke(question)
rich_docs(result)

In [8]:
# 3. score 로 제어 - threshold 값 지정
ret_score = vecstore.as_retriever(
    search_type="similarity_score_threshold",
    search_kwargs={"k": 8, "score_threshold": 0.2,
                   "filter":{"source":"../data/Sustainability_report_2024_kr.pdf"}
                   }
)

result = ret_score.invoke(question)
rich_docs(result, title ="score로 확인")

In [9]:
# Peek at a single stored record to see which metadata keys are available
# (uses the wrapper's private `_collection` attribute).
vecstore._collection.get(limit = 1)

{'ids': ['samsung_2024::f383dc54-4efc-4832-abe5-bd7809a453f9'],
 'embeddings': None,
 'documents': ['A Journey Towards  \na Sustainable Future\n삼성전자 지속가능경영보고서 2024'],
 'uris': None,
 'included': ['metadatas', 'documents'],
 'data': None,
 'metadatas': [{'creator': 'Adobe InDesign 15.1 (Macintosh)',
   'trapped': '/False',
   'producer': 'Adobe PDF Library 15.0',
   'page': 0,
   'source': '../data/Sustainability_report_2024_kr.pdf',
   'total_pages': 83,
   'moddate': '2024-11-25T11:10:46+09:00',
   'creationdate': '2024-11-25T11:10:32+09:00',
   'page_label': '1'}]}

# 4. 리오더 
- 리랭크와 같이 사용
- 리랭크는 맥락(내용의 흐름)을 고려하지 않습니다.

In [10]:
from langchain_community.document_transformers import LongContextReorder
# Reorder the retrieved documents with LongContextReorder (per the LangChain
# docs, it moves the most relevant documents towards both ends of the list).
# NOTE: `result` is whatever the most recently executed retriever cell
# assigned; when the notebook runs top to bottom, that is the ret_score output.
reorder = LongContextReorder()
reordered_result = reorder.transform_documents(result)

rich_docs(reordered_result, title="리오더 결과")