In [1]:
import os
import re
import sqlite3
import uuid
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import List, Tuple

from dotenv import load_dotenv
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_chroma import Chroma
from langchain_community.document_loaders import PyPDFLoader
from langchain_google_genai import GoogleGenerativeAIEmbeddings


In [2]:
# Load variables from a local .env file into the process environment.
# NOTE(review): the Google embedding client used below presumably reads its
# API key from the environment — confirm the expected variable name.
load_dotenv()

True

In [3]:
def is_file_registered(file_path, namespace: str) -> bool:
    """Report whether the Chroma collection ``namespace`` already holds data.

    Chroma's persist directory does not contain a sub-directory named after
    each collection, so testing ``<file_path>/<namespace>`` on disk cannot
    detect an existing collection. The collection is instead opened through
    the Chroma wrapper and checked for at least one stored record.

    Args:
        file_path: Chroma persist directory (the database location, not the
            source document).
        namespace: Name of the collection to look up.

    Returns:
        True if the collection contains at least one record, False otherwise
        (including when the persist directory does not exist).

    Note:
        Opening a collection through the wrapper creates an empty one when it
        does not exist yet; an empty collection is reported as not registered.
    """
    # A database directory that was never created cannot hold any collection;
    # returning early also keeps this check from creating it as a side effect.
    if not os.path.isdir(file_path):
        return False

    store = Chroma(persist_directory=file_path, collection_name=namespace)
    # One record is enough to prove the collection has been populated.
    return bool(store.get(limit=1)["ids"])


def register_file(file_path, namespace: str):
    """Open the collection ``namespace`` in the Chroma store at ``file_path``.

    Only the wrapper object is constructed; it is discarded immediately and
    nothing is returned.
    """
    store_options = {
        "collection_name": namespace,
        "persist_directory": file_path,
    }
    Chroma(**store_options)

In [4]:
def sanitize_namespace(file_path: str) -> Tuple[str, str]:
    """Derive a collection-name-safe namespace from a file path.

    The file's base name (directory and extension removed) is reduced to the
    characters ``[a-zA-Z0-9._-]``; it is then adjusted so that it starts and
    ends with an alphanumeric character and is at least three characters
    long, the minimum length Chroma accepts for a collection name.

    Args:
        file_path: Path of the source file; only its base name is used.

    Returns:
        A ``(namespace, base_name)`` pair: the sanitized name and the
        original, unsanitized base name (useful for display).

    Note:
        The mapping is lossy. Every disallowed character, including all
        non-ASCII text, becomes ``_``, so distinct file names can yield the
        same namespace.
    """
    min_length = 3

    base_name = os.path.splitext(os.path.basename(file_path))[0]
    # Replace every character outside the allowed set with '_'.
    ascii_namespace = re.sub(r"[^a-zA-Z0-9._-]", "_", base_name)
    # Prefix/suffix an 'a' when the name does not start/end with an alphanumeric.
    if not re.match(r"^[a-zA-Z0-9]", ascii_namespace):
        ascii_namespace = "a" + ascii_namespace
    if not re.match(r".*[a-zA-Z0-9]$", ascii_namespace):
        ascii_namespace = ascii_namespace + "a"
    # Pad very short names; '0' keeps the trailing character alphanumeric.
    ascii_namespace = ascii_namespace.ljust(min_length, "0")
    return ascii_namespace, base_name

def load_pdf_chunks(file_path: str, chunk_size: int = 500, chunk_overlap: int = 100) -> List:
    """Load a PDF and split its text into overlapping chunks.

    Args:
        file_path: Path of the PDF to read.
        chunk_size: Maximum size of each chunk, forwarded to
            ``RecursiveCharacterTextSplitter``. Defaults to 500.
        chunk_overlap: Overlap between consecutive chunks, forwarded to the
            splitter. Defaults to 100.

    Returns:
        The list of chunk documents produced by the splitter; empty when the
        loader yields no text.
    """
    pages = PyPDFLoader(file_path).load()
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_documents(pages)

def embed_and_upsert(database_path: str, chunks: List, namespace: str):
    """Embed ``chunks`` and upsert them into the Chroma collection ``namespace``.

    Every chunk is written under a deterministic id derived from the
    namespace, the chunk's position and its text. Ingesting the same chunks
    again therefore overwrites the existing records instead of appending
    duplicates under fresh random ids.

    Args:
        database_path: Chroma persist directory.
        chunks: Documents to store; each must expose ``page_content``.
        namespace: Name of the target collection.
    """
    # Initialise the embedding model.
    embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")

    # Open the persistent collection with the embedding function attached.
    chroma = Chroma(
        persist_directory=database_path,
        collection_name=namespace,
        embedding_function=embeddings
    )

    # Stable ids: the position keeps ids unique even when two chunks share
    # identical text, and the content ties the id to what is actually stored.
    ids = [
        str(uuid.uuid5(uuid.NAMESPACE_URL, f"{namespace}/{position}/{chunk.page_content}"))
        for position, chunk in enumerate(chunks)
    ]

    # Add the documents (text, metadata and vectors are handled internally).
    print(f"📤 Adding {len(chunks)} documents to Chroma collection '{namespace}'...")
    chroma.add_documents(chunks, ids=ids)

    # Report completion.
    print(f"✅ Upsert complete! namespace = '{namespace}'")

def main(file_path: str, database_path: str):
    """Ingest one PDF into the Chroma database at ``database_path``.

    Steps: verify the source path exists, derive the collection namespace
    from the file name, stop if that namespace is already registered, then
    load and chunk the PDF, embed and store the chunks, and register the
    collection. Progress and failures are reported with ``print``; nothing
    is returned.
    """
    source_missing = not os.path.exists(file_path)
    if source_missing:
        print(f"❌ File does not exist: {file_path}")
        return

    # The collection name is generated from the file name.
    collection_name, display_name = sanitize_namespace(file_path)

    already_registered = is_file_registered(
        file_path=database_path,
        namespace=collection_name,
    )
    if already_registered:
        print(f"ℹ️ Collection already exists: {display_name} (namespace: {collection_name})")
        return

    print(f"📄 Loading pdf: {file_path}")
    documents = load_pdf_chunks(file_path)
    print(f"🔗 Number of chunks: {len(documents)}")

    if documents:
        embed_and_upsert(database_path, documents, collection_name)

        # Register the namespace (collection) once its data is stored.
        register_file(file_path=database_path, namespace=collection_name)
        print(f"✅ Registered collection: {display_name} in Chroma.")
    else:
        print("❌ No chunk is extracted from pdf.")


In [5]:
# Entry point: ingest a single PDF into the local Chroma store.
# The two names below are assigned at module level, so they remain available
# as globals to any later notebook cell.
if __name__ == "__main__":
    # Source PDF, relative to the notebook's working directory.
    file_path = "data/2025학년도 전주대학교 정시 모집요강.pdf"
    # Chroma persist directory, relative to the working directory.
    database_path = 'database'
    main(file_path=file_path, database_path=database_path)

📄 Loading pdf: data/2025학년도 전주대학교 정시 모집요강.pdf
🔗 Number of chunks: 236
📤 Adding 236 documents to Chroma collection '2025_________________a'...
✅ Upsert complete! namespace = '2025_________________a'
✅ Registered collection: 2025학년도 전주대학교 정시 모집요강 in Chroma.
