In [None]:
import os
import json
from dotenv import load_dotenv
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_chroma import Chroma
from langchain_huggingface import HuggingFaceEmbeddings

# 1. Load environment variables
load_dotenv()

# Pipeline settings read from the environment; each one is None when the
# corresponding variable is not set.
FOLDER_PATH = os.environ.get("FOLDER_PATH")
VECTOR_STORE_PATH = os.environ.get("VECTOR_STORE_PATH")
EMBEDDING_MODEL_NAME = os.environ.get("EMBEDDING_MODEL_NAME")
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

# 2. Recursively walk every key of the JSON data
def extract_all_keys(data, parent_key=""):
    """Flatten a parsed JSON value into one Document per leaf value.

    Dictionaries and lists are traversed recursively. Every other value is
    treated as a leaf and becomes a ``Document`` whose ``page_content`` is
    ``str(value)`` and whose ``metadata["key_path"]`` is the path to it.

    Paths are built by joining dictionary keys with ``.`` and appending list
    positions as ``[index]`` (for example ``items[0].name``).

    Args:
        data: The value to flatten (dict, list, or any leaf value).
        parent_key: Path prefix of ``data``; empty for the root.

    Returns:
        A list of Documents in traversal order. Empty dicts and lists
        contribute nothing.
    """

    def _walk(node, path):
        # Mapping: descend into each value, extending the dotted path.
        if isinstance(node, dict):
            for name, child in node.items():
                child_path = f"{path}.{name}" if path else name
                yield from _walk(child, child_path)
            return

        # Sequence: descend into each element, tagging it with its index.
        if isinstance(node, list):
            for position, child in enumerate(node):
                yield from _walk(child, f"{path}[{position}]")
            return

        # Leaf: stringify the value and remember where it came from.
        yield Document(page_content=str(node), metadata={"key_path": path})

    return list(_walk(data, parent_key))

# 3. Load the JSON files and build Documents
def load_json_documents(folder_path):
    """Build Documents from every ``.json`` file under ``folder_path``.

    The folder is walked recursively with ``os.walk``. Each file whose name
    ends in ``.json`` is parsed and flattened with ``extract_all_keys``, and
    every resulting Document gets ``metadata["source"]`` set to the file path.

    Files that cannot be decoded as UTF-8 or parsed as JSON are reported with
    ``print`` and skipped, so one bad file does not abort the whole load.

    Args:
        folder_path: Root directory to search.

    Returns:
        A list with the Documents of all successfully parsed files.
    """
    documents = []

    for root, _dirs, files in os.walk(folder_path):
        for file in files:
            if not file.endswith(".json"):
                continue

            file_path = os.path.join(root, file)
            try:
                # "utf-8-sig" reads plain UTF-8 and also strips a leading
                # BOM, which json.load would otherwise reject.
                with open(file_path, "r", encoding="utf-8-sig") as f:
                    data = json.load(f)
            except json.JSONDecodeError as e:
                print(f"Error decoding JSON in file {file_path}: {e}")
                continue
            except UnicodeDecodeError as e:
                # Not valid UTF-8: skip it like a malformed JSON file.
                print(f"Error decoding text in file {file_path}: {e}")
                continue

            file_documents = extract_all_keys(data)
            # Record which file each document came from.
            for doc in file_documents:
                doc.metadata["source"] = file_path
            documents.extend(file_documents)
    return documents

# 4. Load the data
try:
    docs = load_json_documents(FOLDER_PATH)
    print(f"Loaded {len(docs)} documents from JSON files.")
except Exception as e:
    # Report the failure, then re-raise so the run stops here.
    print(f"Error loading documents: {e}")
    raise

# 5. Split the text into chunks
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
all_splits = text_splitter.split_documents(docs)
print(f"Split into {len(all_splits)} text chunks.")

# 6. Keep only chunks that contain non-whitespace text
all_splits = [doc for doc in all_splits if doc.page_content.strip()]
if not all_splits:
    raise ValueError("No valid documents found for vector store.")
print(f"Filtered down to {len(all_splits)} valid text chunks.")

# 7. Store the chunks in the vector store
embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)
vector_store = Chroma(embedding_function=embeddings, persist_directory=VECTOR_STORE_PATH)
# Add the chunks in bounded batches instead of one huge call: chromadb
# enforces a maximum batch size per add/upsert (the exact limit depends on the
# installed version), and batching also bounds how many texts are embedded at
# a time. 5000 is a conservative size - TODO confirm against the deployed
# chromadb's limit.
ADD_BATCH_SIZE = 5000
for batch_start in range(0, len(all_splits), ADD_BATCH_SIZE):
    vector_store.add_documents(all_splits[batch_start:batch_start + ADD_BATCH_SIZE])
print(f"Documents successfully added to vector store at {VECTOR_STORE_PATH}.")


Loaded 1206942 documents from JSON files.
Split into 1152073 text chunks.
Filtered down to 1152073 valid text chunks.


  from .autonotebook import tqdm as notebook_tqdm
