In [1]:
import os, sys, shutil

from dotenv import load_dotenv

load_dotenv("../../_apikeys.env")
api_key = os.getenv("DoogieOpenaiKey")
os.environ['OPENAI_API_KEY'] = api_key

In [None]:
#
# Step 1-1: get test documents
#
import urllib.request

urllib.request.urlretrieve("https://github.com/chatgpt-kr/openai-api-tutorial/raw/main/ch06/2023_%EB%B6%81%ED%95%9C%EC%9D%B8%EA%B6%8C%EB%B3%B4%EA%B3%A0%EC%84%9C.pdf", filename="06_07_test.pdf")

In [2]:
#
# Step 1-2: load and split documents
# 17~20 sec
#
from langchain.document_loaders import PyPDFLoader

loader = PyPDFLoader( "06_07_test.pdf" )
pages = loader.load_and_split()  # about 17s
#pages = loader.load()  # about 16s, include empty page


In [3]:
print( 'type loader,pages:', type(loader), type(pages) )
print( 'type pages[0]s:', type(pages[0]), type(pages[0].page_content))
print( 'size/len loader,pages:', sys.getsizeof(loader), len(pages) )
print( 'size/len pages[0]s:', sys.getsizeof(pages[0]), len(pages[0].page_content) )
print( 'pages Max:', max(len(apage.page_content) for apage in pages ) )
print( 'pages Min:', min(len(apage.page_content) for apage in pages ) )
print( 'pages Sum:', sum(len(apage.page_content) for apage in pages ) )
print( '#of chunks:', len(pages) )
print( 'Chunk Avg:', sum(len(apage.page_content) for apage in pages ) / len(pages) )

type loader,pages: <class 'langchain_community.document_loaders.pdf.PyPDFLoader'> <class 'list'>
type pages[0]s: <class 'langchain_core.documents.base.Document'> <class 'str'>
size/len loader,pages: 48 445
size/len pages[0]s: 72 57
pages Max: 1640
pages Min: 6
pages Sum: 372229
#of chunks: 445
Chunk Avg: 836.4696629213483


In [None]:
#print( pages[0] )
#print( pages[0].page_content)
#print( "example:", pages[6].page_content[:50] )
#print( pages[442].page_content )

In [4]:
# Step 2: case1:
# RecursiveCharacterTextSpilter를 사용함
#
from langchain.text_splitter import RecursiveCharacterTextSplitter

textSplitter = RecursiveCharacterTextSplitter(
    chunk_size= 1000,
    chunk_overlap=0,
    separators=["\n\n", "\n", " ", ""]
)

splitedDocs = textSplitter.split_documents( pages )

In [5]:
print( 'type split,splitDocs:', type(textSplitter), type(splitedDocs) )
print( 'type splitedDocs[0]s:', type(splitedDocs[0]), type(splitedDocs[0].page_content) )
print( 'size/len split,splitDocs:', sys.getsizeof(textSplitter), len(splitedDocs) )
print( 'size/len splitedDocs[0]s:', sys.getsizeof(splitedDocs[0]), len(splitedDocs[0].page_content) )
print( 'splitedDocs Max:', max(len(apage.page_content) for apage in splitedDocs ) )
print( 'splitedDocs Min:', min(len(apage.page_content) for apage in splitedDocs ) )
print( 'splitedDocs Sum:', sum(len(apage.page_content) for apage in splitedDocs ) )
print( '#of chunks:', len(splitedDocs) )
print( 'Chunk Avg:', sum(len(apage.page_content) for apage in splitedDocs ) / len(splitedDocs) )

type split,splitDocs: <class 'langchain_text_splitters.character.RecursiveCharacterTextSplitter'> <class 'list'>
type splitedDocs[0]s: <class 'langchain_core.documents.base.Document'> <class 'str'>
size/len split,splitDocs: 48 496
size/len splitedDocs[0]s: 72 57
splitedDocs Max: 1000
splitedDocs Min: 6
splitedDocs Sum: 372148
#of chunks: 496
Chunk Avg: 750.2983870967741


In [6]:
#
# Step3-1: using packages
# tmp, check OpenAI model
#
#from langchain.embeddings import OpenAIEmbeddings
# 'langchain.embeddings' deprecated, use 'langchain_openai'
from langchain_openai import OpenAIEmbeddings
#from langchain.openai import OpenAIEmbeddings

myEmbeddings = OpenAIEmbeddings()
print("current OpenAIEmbeddings().model=", myEmbeddings.model)
myEmbeddings = OpenAIEmbeddings(model="text-embedding-3-large")
print("current OpenAIEmbeddings().model=", myEmbeddings.model)


current OpenAIEmbeddings().model= text-embedding-ada-002
current OpenAIEmbeddings().model= text-embedding-3-large


In [13]:
#
# DONT USE THIS CODE
# Step3-2: Loading splitedDocs to chroma
# Error case: "Requested 367501 tokens, max 300000 tokens per request"
#
from langchain_chroma import Chroma
#from langchain.vectorstores import Chroma

myEmbeddings = OpenAIEmbeddings(model="text-embedding-3-small")
print("current OpenAIEmbeddings().model=", myEmbeddings.model)

vdb = Chroma.from_documents(splitedDocs, myEmbeddings)
print("vdb적재문서수=", vdb._collection.count())


current OpenAIEmbeddings().model= text-embedding-3-small


APIConnectionError: Connection error.

In [8]:
#
# Step3-2: setup env
#
from chromadb.config import Settings

settings = Settings(
    is_persistent=True,
    allow_reset=True
)

myEmbeddings = OpenAIEmbeddings(model="text-embedding-3-small")
persist_dir = "my_chroma_db11"
myCollectionName = "my_collection"

In [12]:
#
# Step3-2: setup env
#
def process_in_batches(documents, batch_size=100, reset_db=True):
    global vdb

    ### 1. 사용할 DB 디렉토리 및 jupyter chroma 존재 여부 확인 및 재 생성
    vdb = Chroma(
        client_settings=settings,
        collection_name=myCollectionName,
        embedding_function=myEmbeddings,
        persist_directory=persist_dir
    )

    return vdb

print("[START]")
vdb = process_in_batches(splitedDocs, batch_size=100, reset_db=True)

[START]


NameError: name 'Chroma' is not defined

In [7]:
#
# Step3-2: Loading splitedDocs to chroma
# chatgpt guide, use this code
#

def process_in_batches(documents, batch_size=100, reset_db=True):
    global vdb

    try:
        print("#0:vdb._client.delete_collection( name )", vdb._collection.name )
        #vdb._client.delete_collection( myCollectionName )
        vdb._client.delete_collection( vdb._collection.name )
        #print("#0:vdb._client.reset()", vdb)
        #vdb._client.reset()
        #del vdb
    except:
        pass

    #if vdb is not None:
    #    print("#0: vdb Not None, collection count:", vdb._collection.count())
    #else:
    #    print("#0: vdb is None")

    # --- 1. 기존 DB 디렉토리 삭제 (옵션) ---
    #print("#1:check current DB DIR, vdb=", vdb, persist_dir)
    #if reset_db and os.path.exists(persist_dir):
    #    print("#1-1 collection count:", vdb._collection.count())

    # --- 2. 새로운 Chroma DB 객체 준비 ---
    vdb = Chroma(
        client_settings=settings,
        collection_name=myCollectionName,
        embedding_function=myEmbeddings,
        persist_directory=persist_dir
    )
    print("#2: vdb,name,count=", vdb, vdb._collection.name, vdb._collection.count())

    # -- 새로은 Chroma DB라지만, jupyter에 흔적이 있음
    if vdb._collection.count() > 0:
        #vdb._collection.delete(None, collection_name, persist_dir)
        #vdb._client.reset()
        print("#2-1: vdb,name,count=", vdb, vdb._collection.name, vdb._collection.count())
        # delete collection
        #print("#0:vdb._client.delete_collection( name )", vdb._collection.name )
        #vdb._client.delete_collection( myCollectionName )
        #vdb._client.delete_collection( vdb._collection.name )
        # clear collection
        vdb._client.get_collection( myCollectionName ).delete(where={})

    # --- 3. 배치 단위로 문서 추가 ---
    for i in range(0, len(documents), batch_size):
        batch = documents[i:i + batch_size]
        vdb.add_documents(batch)
        print(f"[INFO] 처리된 문서: {i + len(batch)}/{len(documents)} "
              f"(현재 총 {vdb._collection.count()} 개)")

    return vdb

# 사용 예시
vdb = process_in_batches(splitedDocs, batch_size=100, reset_db=True)
print("최종 vdb 문서 수 =", vdb._collection.count())

# Chroma 객체 사용 후 반드시 닫기
if vdb is not None:
    try:
        print("#x_begin vdb contents...",vdb )
        print( "_config_json", vdb._chroma_collection.configuration_json )
        print( "_chroma_collection_count", vdb._chroma_collection.count() )
        print( "_chroma_collection_name", vdb._chroma_collection.name )
        print( "_collection_count", vdb._collection.count() )
        print( "_collection_name", vdb._collection.name )
        print( "_client_list_collect", vdb._client.list_collections() )

        #print("#x: vdb.delete_collection()", vdb)
        #vdb.delete_collection()

        #print("#x: vdb._chroma_collection.delete()", vdb)
        #vdb._chroma_collection.delete()

        #print("#x: vdb._client.reset()", vdb)
        #vdb._client.reset()


        #vdb._client.close()  # langchain_chroma에서 사용 가능
        print("#x_end vdb contents...", vdb)
    except Exception as e:
        print("vdb close 실패:", e)
    vdb = None


NameError: name 'Chroma' is not defined

In [None]:
#
# Step3: too long splitedDocs
# chatgpt guide, use this code
#
import shutil, os
from langchain.embeddings import OpenAIEmbeddings
#from langchain_chroma.vectorstores import Chroma
from chromadb.config import Settings
from langchain_chroma import Chroma

settings = Settings(
    is_persistent=True,
    allow_reset=True
)

myEmbeddings = OpenAIEmbeddings(model="text-embedding-3-small")
persist_dir = "my_chroma_db14"
myCollectionName = "my_collection"

def process_in_batches(documents, batch_size=100, reset_db=True):
    global vdb

    # --- 1. 기존 DB 디렉토리 삭제 (옵션) ---
    print("#1:check current DB DIR, vdb=", vdb, persist_dir)
    if reset_db and os.path.exists(persist_dir):
        print("#1:enter reset_db and recreate DIR")
        shutil.rmtree(persist_dir)
        os.makedirs(persist_dir, exist_ok=True) # 확실히 쓰기 가능하게 생성

        print("#1x: vdb.delete_collection()", vdb)
        vdb.delete_collection()
        #print("#1x: vdb._chroma_collection.delete()", vdb)
        #vdb._chroma_collection.delete()
        print("#1x: vdb._client.reset()", vdb)
        vdb._client.reset()
        #vdb._client.close()  # langchain_chroma에서 사용 가능
        print(f"[INFO] '{persist_dir}' 초기화 완료")

    try:
        print("#0:vdb._client.reset()", vdb)
        vdb._client.reset()
        del vdb
    except:
        pass

    # --- 2. 새로운 Chroma DB 객체 준비 ---
    vdb = Chroma(
        client_settings=settings,
        collection_name=myCollectionName,
        embedding_function=myEmbeddings,
        persist_directory=persist_dir
    )
    print("#2:create current DB DIR, vdb=", vdb)

    # --- 3. 배치 단위로 문서 추가 ---
    for i in range(0, len(documents), batch_size):
        batch = documents[i:i + batch_size]
        vdb.add_documents(batch)
        print(f"[INFO] 처리된 문서: {i + len(batch)}/{len(documents)} "
              f"(현재 총 {vdb._collection.count()} 개)")

    return vdb

# 사용 예시
vdb = process_in_batches(splitedDocs, batch_size=100, reset_db=True)
print("최종 vdb 문서 수 =", vdb._collection.count())

# Chroma 객체 사용 후 반드시 닫기
if vdb is not None:
    try:
        print("#x vdb contents...",vdb )
        print( "_config_json", vdb._chroma_collection.configuration_json )
        print( "_chroma_collection_count", vdb._chroma_collection.count() )
        print( "_chroma_collection_name", vdb._chroma_collection.name )
        print( "_collection_count", vdb._collection.count() )
        print( "_collection_name", vdb._collection.name )
        print( "_client_list_collect", vdb._client.list_collections() )

        print("#x: vdb.delete_collection()", vdb)
        vdb.delete_collection()
        #print("#x: vdb._chroma_collection.delete()", vdb)
        #vdb._chroma_collection.delete()
        print("#x: vdb._client.reset()", vdb)
        vdb._client.reset()
        print("#x: vdb contents...", vdb)
        #vdb._client.close()  # langchain_chroma에서 사용 가능
    except Exception as e:
        print("close 실패:", e)
    vdb = None


In [None]:
#
#
#
print( vdb._chroma_collection.configuration_json )
print( vdb._chroma_collection.count() )
print( vdb._collection.count() )
print( vdb._collection.name )
print( vdb._client.list_collections() )
print( vdb._chroma_collection.name )

In [None]:
#
# Step3: divided chunking, with batch_size=100
#
import shutil
import os
from langchain.embeddings import OpenAIEmbeddings
#from langchain.openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma

myEmbeddings = OpenAIEmbeddings(model="text-embedding-3-small")
print("current OpenAIEmbeddings().model=", myEmbeddings.model)

chroma_persist_dir = "my_chroma_db"

if os.path.exists(chroma_persist_dir):
    shutil.rmtree(chroma_persist_dir)

vdb = Chroma(
    embedding_function=myEmbeddings,
    persist_directory=chroma_persist_dir
)

print("vdb적재문서수a=", vdb._collection.count() )
vdb._client.delete_collection(vdb._collection.name)
#print("vdb적재문서수b=", vdb._collection.count() if vdb is not None else -1 )
vdb = process_in_batches(splitedDocs)
print("vdb적재문서수z=", vdb._collection.count() )

def process_in_batches(documents, batch_size=100):

    print("in func, vdb size:",vdb._collection.count() if vdb is not None else -1)

    for i in range(0, len(documents), batch_size):
        batch = documents[i:i + batch_size]
        vdb.add_documents(batch)
        print(f"처리된 문서: {i + len(batch)}/{len(documents)}", vdb._collection.count())
    return vdb

In [None]:
print( len( vdb ), type( vdb ) )

In [None]:
## exception sample

try:
    import chromadb
except Exception as e:
    print("CHROMADB LOAD ERROR:", repr(e))


try:
    import chromadb
except Exception as e:
    import traceback
    traceback.print_exc()