# 임베딩 재설정

변수 초기화

In [4]:
from langchain_mongodb import MongoDBAtlasVectorSearch
from langchain_openai import OpenAIEmbeddings
from pymongo import MongoClient
import os
from dotenv import load_dotenv

# Locate the .env file relative to the current working directory.
current_dir = os.getcwd()
dotenv_path = os.path.join(current_dir, 'model', '.env')
load_dotenv(dotenv_path)

# Fail fast when a required setting is absent. Without this check a missing
# MONGODB_ATLAS_CLUSTER_URI is handed to MongoClient as None (PyMongo then falls
# back to its default host instead of the Atlas cluster), and a missing
# OPENAI_KEY makes the os.environ assignment below raise a TypeError that does
# not name the setting.
mongodb_uri = os.getenv("MONGODB_ATLAS_CLUSTER_URI")
openai_key = os.getenv("OPENAI_KEY")
missing_settings = [
    name
    for name, value in (
        ("MONGODB_ATLAS_CLUSTER_URI", mongodb_uri),
        ("OPENAI_KEY", openai_key),
    )
    if not value
]
if missing_settings:
    raise RuntimeError(
        f"Missing required setting(s): {', '.join(missing_settings)}. "
        f"Checked the process environment and {dotenv_path}"
    )

# MongoDB connection: database and collection that hold the documents.
client = MongoClient(mongodb_uri)
db_name = "HelloWorld-AI"
collection_name = "foreigner_legalQA"
db = client[db_name]
collection = db[collection_name]

# OpenAI embeddings: the key is re-exported under the OPENAI_API_KEY variable
# before the embedding client is constructed.
os.environ["OPENAI_API_KEY"] = openai_key
# Default model is text-embedding-ada-002 according to the original note —
# TODO confirm against the installed langchain_openai version.
embedding = OpenAIEmbeddings()

임베딩 재설정

In [13]:
def recreate_embeddings(batch_size=100):
    """Recompute and store the `embedding` field for every document that has text.

    Documents are streamed from the module-level `collection`; those whose
    `text` field is missing or empty are skipped. Texts are embedded in groups
    of up to `batch_size` via `embedding.embed_documents`, so the embedding
    client is invoked once per group instead of once per document. Each vector
    is written back with `$set` on the matching `_id`.

    Args:
        batch_size: Maximum number of texts passed to one `embed_documents`
            call. Must be at least 1.

    Raises:
        ValueError: If `batch_size` is smaller than 1.
        RuntimeError: If the embedding client returns a different number of
            vectors than texts it was given.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    def store_batch(pending):
        """Embed the (doc_id, text) pairs in `pending` and persist each vector."""
        vectors = embedding.embed_documents([text for _, text in pending])
        # Guard against silently pairing vectors with the wrong documents.
        if len(vectors) != len(pending):
            raise RuntimeError(
                f"Expected {len(pending)} embeddings, got {len(vectors)}"
            )
        for (doc_id, _), vector in zip(pending, vectors):
            collection.update_one({"_id": doc_id}, {"$set": {"embedding": vector}})

    pending = []
    # Project only `text` (plus the implicit `_id`) so the previously stored
    # embedding array is not transferred for every document.
    for doc in collection.find({}, {"text": 1}):
        text = doc.get("text", "")
        if not text:
            continue
        pending.append((doc["_id"], text))
        if len(pending) >= batch_size:
            store_batch(pending)
            pending = []

    # Flush the final, possibly shorter, group.
    if pending:
        store_batch(pending)

    print("Embeddings recreated for all documents")

# Run the re-embedding pass over the whole collection. This calls the embedding
# client for every document that has text and overwrites its stored `embedding`
# field, so re-running this cell repeats that work.
recreate_embeddings()

벡터 스토어 초기화

In [5]:
# Settings for the LangChain vector store layered on the Atlas collection.
vector_store_settings = {
    "collection": collection,        # collection whose documents are searched
    "embedding": embedding,          # embedding source (OpenAI embeddings here)
    "index_name": "vector_index",    # name of the index used for the search
    "text_key": "text",              # field that stores the document text
    "embedding_key": "embedding",    # field that stores the vector
}
vector_store = MongoDBAtlasVectorSearch(**vector_store_settings)

데이터 확인

In [9]:
# Pull any one document that already has an `embedding` field and summarise it:
# a 100-character text preview plus the length and Python type of the vector.
sample_doc = collection.find_one({"embedding": {"$exists": True}})
if not sample_doc:
    print("No documents found with embeddings.")
else:
    sample_text = sample_doc.get('text', '')
    sample_vector = sample_doc.get('embedding', [])
    print("Sample document with embedding:")
    print(f"Text: {sample_text[:100]}...")
    print(f"Embedding length: {len(sample_vector)}")
    print(f"Embedding type: {type(sample_doc['embedding'])}")

Sample document with embedding:
Text: 중도 퇴사 후 근로소득 신고되지 않아 고용허가연장 안된 노동자 지원 작성일: 23-11-27
상담유형: 체류자격
거주지역: 여주시
국적: 우즈베키스탄
체류자격: E-9
상담내용: ...
Embedding length: 1536
Embedding type: <class 'list'>


In [11]:
# NOTE(review): this rebuilds `vector_store` with exactly the same arguments as
# the earlier construction in this notebook. Presumably it was kept to refresh
# the handle after re-running other cells — confirm whether it is still needed.
vector_store = MongoDBAtlasVectorSearch(
    collection=collection,
    embedding=embedding,
    index_name="vector_index",
    text_key="text",
    embedding_key="embedding"
)

In [12]:
# Smoke-test the vector store with one query and show the three closest documents.
# Any failure is reported with its type, message and (when present) `details`.
# NOTE(review): the output saved with this cell is an OperationFailure whose
# message says "embedding is not indexed as vector" — presumably the Atlas index
# referenced as "vector_index" does not declare `embedding` as a vector field;
# confirm the index definition in Atlas.
query = "한국의 노동 휴게시간"
try:
    similar_docs = vector_store.similarity_search(query, k=3)
    print(f"Query: {query}")
    print(f"Number of similar docs found: {len(similar_docs)}")
    for rank, match in enumerate(similar_docs, start=1):
        print(f"Document {rank}:")
        preview = match.page_content[:100]
        print(f"Content: {preview}...")
        print(f"Metadata: {match.metadata}")
        print("---")
except Exception as err:
    error_name = type(err).__name__
    print(f"Error during similarity search: {error_name}")
    print(f"Error message: {err!s}")
    if hasattr(err, 'details'):
        print(f"Error details: {err.details}")

Error during similarity search: OperationFailure
Error message: PlanExecutor error during aggregation :: caused by :: embedding is not indexed as vector, full error: {'ok': 0.0, 'errmsg': 'PlanExecutor error during aggregation :: caused by :: embedding is not indexed as vector', 'code': 8, 'codeName': 'UnknownError', '$clusterTime': {'clusterTime': Timestamp(1725012212, 4), 'signature': {'hash': b'\xe9\xc6\xaa1\xbc\x13\xd7^\xf7\xaf\x15L\xe4S\xe6J^\x00\x12\xa6', 'keyId': 7367405561744195589}}, 'operationTime': Timestamp(1725012212, 4)}
Error details: {'ok': 0.0, 'errmsg': 'PlanExecutor error during aggregation :: caused by :: embedding is not indexed as vector', 'code': 8, 'codeName': 'UnknownError', '$clusterTime': {'clusterTime': Timestamp(1725012212, 4), 'signature': {'hash': b'\xe9\xc6\xaa1\xbc\x13\xd7^\xf7\xaf\x15L\xe4S\xe6J^\x00\x12\xa6', 'keyId': 7367405561744195589}}, 'operationTime': Timestamp(1725012212, 4)}


유사도 계산

In [7]:
# 검색 쿼리 실행
query = "한국의 노동 휴게시간"
try:
    similar_docs = vector_store.similarity_search(query, k=3)
    print(f"Query: {query}")
    print(f"Number of similar docs found: {len(similar_docs)}")
    for i, doc in enumerate(similar_docs):
        print(f"Document {i+1}:")
        print(f"Content: {doc.page_content[:100]}...")  # 처음 100자만 출력
        print(f"Metadata: {doc.metadata}")
        print("---")
except Exception as e:
    print(f"Error during similarity search: {type(e).__name__}, {str(e)}")

Error during similarity search: OperationFailure, PlanExecutor error during aggregation :: caused by :: embedding is not indexed as vector, full error: {'ok': 0.0, 'errmsg': 'PlanExecutor error during aggregation :: caused by :: embedding is not indexed as vector', 'code': 8, 'codeName': 'UnknownError', '$clusterTime': {'clusterTime': Timestamp(1725011868, 6), 'signature': {'hash': b'P\xff\x0c\xa1p+\x88\x80\xc7\xcb\xd9`{\xb5\x82\xddq\x88\xb0/', 'keyId': 7367405561744195589}}, 'operationTime': Timestamp(1725011868, 6)}
