In [None]:
import os, sys, shutil

from dotenv import load_dotenv

# Read the key stored under "DoogieOpenaiKey" in the shared env file and
# re-export it as OPENAI_API_KEY; the OpenAIEmbeddings() calls further down
# are constructed without an explicit key.
load_dotenv("../../_apikeys.env")
api_key = os.getenv("DoogieOpenaiKey")
if not api_key:
    # os.environ only accepts str values: assigning None raises an opaque
    # "str expected, not NoneType" TypeError, so fail early with a clear hint.
    raise RuntimeError(
        "DoogieOpenaiKey is not set; check ../../_apikeys.env or the process environment"
    )
os.environ['OPENAI_API_KEY'] = api_key

In [None]:
#
# Step 1-1: get test documents
#
import urllib.request

# Source location of the sample PDF; it is saved into the working directory
# under a fixed local file name.
pdf_url = "https://github.com/chatgpt-kr/openai-api-tutorial/raw/main/ch06/2023_%EB%B6%81%ED%95%9C%EC%9D%B8%EA%B6%8C%EB%B3%B4%EA%B3%A0%EC%84%9C.pdf"

urllib.request.urlretrieve(pdf_url, filename="06_07_test.pdf")

In [None]:
#
# Step 1-2: load and split documents
# 17~20 sec
#
from langchain.document_loaders import PyPDFLoader

# Author's timing notes: load_and_split() took about 17s; the alternative
# loader.load() took about 16s but also returns empty pages.
pdf_path = "06_07_test.pdf"
loader = PyPDFLoader(pdf_path)
pages = loader.load_and_split()


In [None]:
# Inspect what the loader produced: object types, container sizes, and the
# spread of page_content lengths over all loaded pages.
print( 'type loader,pages:', type(loader), type(pages) )

first_page = pages[0]
first_page_text = first_page.page_content
print( 'type pages[0]s:', type(first_page), type(first_page_text))
print( 'size/len loader,pages:', sys.getsizeof(loader), len(pages) )
print( 'size/len pages[0]s:', sys.getsizeof(first_page), len(first_page_text) )

# Character count of every page, computed once and reused by the summaries.
page_lengths = [len(doc.page_content) for doc in pages]
total_page_chars = sum(page_lengths)
print( 'pages Max:', max(page_lengths) )
print( 'pages Min:', min(page_lengths) )
print( 'pages Sum:', total_page_chars )
print( '#of chunks:', len(pages) )
print( 'Chunk Avg:', total_page_chars / len(pages) )

In [None]:
#print( pages[0] )
#print( pages[0].page_content)
#print( "example:", pages[6].page_content[:50] )
#print( pages[442].page_content )

In [None]:
# Step 2: case1:
# split the loaded pages with RecursiveCharacterTextSplitter
#
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Splitter settings: 1000-character chunks, no overlap, and a separator list
# that runs from blank lines down to the empty string.
splitter_options = {
    "chunk_size": 1000,
    "chunk_overlap": 0,
    "separators": ["\n\n", "\n", " ", ""],
}
textSplitter = RecursiveCharacterTextSplitter(**splitter_options)

splitedDocs = textSplitter.split_documents( pages )

In [None]:
# Inspect the splitter output: object types, container sizes, and the spread
# of page_content lengths over all chunks.
print( 'type split,splitDocs:', type(textSplitter), type(splitedDocs) )

first_chunk = splitedDocs[0]
first_chunk_text = first_chunk.page_content
print( 'type splitedDocs[0]s:', type(first_chunk), type(first_chunk_text) )
print( 'size/len split,splitDocs:', sys.getsizeof(textSplitter), len(splitedDocs) )
print( 'size/len splitedDocs[0]s:', sys.getsizeof(first_chunk), len(first_chunk_text) )

# Character count of every chunk, computed once and reused by the summaries.
chunk_lengths = [len(doc.page_content) for doc in splitedDocs]
total_chunk_chars = sum(chunk_lengths)
print( 'splitedDocs Max:', max(chunk_lengths) )
print( 'splitedDocs Min:', min(chunk_lengths) )
print( 'splitedDocs Sum:', total_chunk_chars )
print( '#of chunks:', len(splitedDocs) )
print( 'Chunk Avg:', total_chunk_chars / len(splitedDocs) )

In [None]:
#
# DONT USE THIS CODE for loading !!! use the Step3-2 / Step3-3 cells instead.
# Step3-1: load splitedDocs into Chroma with one from_documents() call.
# Kept only as an error demonstration; the author observed:
#   "Requested 367501 tokens, max 300000 tokens per request"
#
from langchain_openai import OpenAIEmbeddings
from langchain_chroma import Chroma
#from langchain.vectorstores import Chroma

# Show the default embedding model first, then switch to the model used by
# the rest of the notebook.
myEmbeddings = OpenAIEmbeddings()
print("current OpenAIEmbeddings().model=", myEmbeddings.model)
myEmbeddings = OpenAIEmbeddings(model="text-embedding-3-large")
print("current OpenAIEmbeddings().model=", myEmbeddings.model)

persist_dir = "../localdb/my_chroma_db_01"
myCollectionName = "my_collection"

try:
    vdb = Chroma.from_documents(
        splitedDocs,
        myEmbeddings
    )
    print("vdb적재문서수=", vdb._collection.count())
except Exception as e:
    # The failure is what this cell demonstrates: display it instead of
    # aborting, so a Restart & Run All can continue to the batched loader.
    print("Step3-1 failed (expected, see header):", type(e).__name__, e)


In [None]:
#
# Step3-2: build the embedding client from the 'langchain_openai' package.
#          The older import paths below are deprecated and kept for reference:
#            from langchain.embeddings.openai import OpenAIEmbeddings
#            from langchain.embeddings import OpenAIEmbeddings
#
from langchain_openai import OpenAIEmbeddings

embedding_model_name = "text-embedding-3-large"
myEmbeddings = OpenAIEmbeddings(model=embedding_model_name)
print("current OpenAIEmbeddings().model=", myEmbeddings.model)

# Collection name and persist directory handed to Chroma in the later steps.
myCollectionName = "my_collection"
persist_dir = "../localdb/my_chroma_db_01"


In [None]:
#
# Step3-3: Loading splitedDocs to chroma
# chatgpt guide, use this code
#
from langchain_chroma import Chroma

def process_in_batches(documents, batch_size=100, reset_db=True):
    """Add ``documents`` to the module-level ``vdb`` store in fixed-size batches.

    Args:
        documents: Iterable of documents to insert. It is materialized into a
            list once, so generators are accepted as well.
        batch_size: Maximum number of documents passed to each
            ``vdb.add_documents`` call. Must be at least 1.
        reset_db: Accepted for backward compatibility; this function does not
            read it.

    Returns:
        int: Number of documents handed to the store (0 for empty input).

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be >= 1")

    # -- check the documents to insert --
    # Use the argument itself; reading the notebook-global splitedDocs here
    # would silently ignore whatever the caller passed in.
    docs = list(documents)
    total = len(docs)
    print(">> ready to insert splitedDocs, len = ", total)

    if total == 0:
        print("splitedDocs가 비어있습니다.")
        return 0

    # -- number of batches, rounding up for a partial last batch --
    batchCount = (total + batch_size - 1) // batch_size
    print(">> batchsize is ", batch_size, "and batchCount:", batchCount)

    # -- slice the documents by batch_size and insert each slice --
    for i, start in enumerate(range(0, total, batch_size)):
        # Clamp the end so the printed range of the last batch is accurate.
        end = min(start + batch_size, total)
        print(">>", i, "th batch, ", start, "..", end - 1, end="")
        batchDocs = docs[start:end]
        vdb.add_documents(batchDocs)
        print(" ++ added batchDocs len:", len(batchDocs))
    print(">> vdb size:", len( vdb.get()["ids"] ))
    return total

def clear_before_addDoc():
    """Delete every entry currently held by the module-level ``vdb`` store.

    All ids are read through ``vdb.get()``. With no ids the function reports
    that and returns without calling delete. Otherwise the ids are passed to
    ``vdb._collection.delete``; a raised error is printed rather than
    re-raised, and the size after the attempt is printed either way.
    """
    existing_ids = vdb.get()["ids"]
    id_count = len(existing_ids)

    if id_count == 0:
        print(">> vdb size = 0 이므로 collection delete 하지 않음.")
        return

    # Delete all data inside the collection in a single call.
    print(f">> vdb size = {id_count}, 벡터 컬렉션 데이터 삭제 시작...", end="")
    try:
        vdb._collection.delete(ids=existing_ids)
    except Exception as err:
        print("++벡터 컬렉션 삭제 실패:", err)
    else:
        print("++벡터 컬렉션 데이터 삭제 완료.")
    print(">> vdb size = ", len( vdb.get()["ids"] ) )

# -- main body


# --- open the existing (or new) collection, passing the embedding function ---
vdb = Chroma(
    collection_name=myCollectionName,
    persist_directory=persist_dir,
    embedding_function=myEmbeddings,
)

print("[START] vdb 크기=", len( vdb.get()["ids"] ) )
print(">>Persist Dir:", persist_dir)
print(">>myCollectionName:", myCollectionName)

# Clear the collection first, then insert the chunks 100 per batch.
clear_before_addDoc()
process_in_batches(splitedDocs, batch_size=100, reset_db=True)

print("[END] vdb 크기=", len( vdb.get()["ids"] ) )

In [None]:
#
# Step3-4: Reuse the splitedDocs already stored in Chroma (nothing is re-loaded)
#     only needs: an OpenAI API connection
#                 myEmbeddings for Chroma
#
from langchain_chroma import Chroma

# Open the collection under persist_dir again; this cell adds no documents.
vdb = Chroma(
    collection_name=myCollectionName,
    persist_directory=persist_dir,
    embedding_function=myEmbeddings,
)

print(">>Persist Dir:", persist_dir)
print(">>myCollectionName:", myCollectionName)
# The size is taken from the number of ids the store returns.
stored_ids = vdb.get()["ids"]
print(">>vdb 크기=", len( stored_ids ) )

In [None]:
#
# 4-1: similarity search through the first handle (vdb)
#      Defines `question` and `resultDocs`, which the next cell prints.
#
question = '북한의 교육과정'
resultDocs = vdb.similarity_search(question)

In [None]:
# For every hit: a divider line, then the page label and the first 100
# characters of the text.
divider = '---' * 10
for resultDoc in resultDocs:
    page_label = resultDoc.metadata.get("page_label")
    preview = resultDoc.page_content[:100]
    print(divider)
    print(page_label, preview )


In [None]:
#
# 4-2: other handle of ChromaDB
#
# Second handle built with the same directory, collection name and embeddings
# as vdb.
vdb2 = Chroma(
    collection_name=myCollectionName,
    embedding_function=myEmbeddings,
    persist_directory=persist_dir,
)
print('문서의 수:', vdb2._collection.count())
print( type(vdb), type(vdb2) )

# Run the query through the second handle and inspect the result types.
question = '북한의 교육과정'
resultDocs = vdb2.similarity_search(question)
print('유사문서수:', len(resultDocs))
top_hit = resultDocs[0]
print( type(resultDocs), type(top_hit), type(top_hit.page_content) )

# Page label plus a 100-character preview for each hit.
divider = '---' * 10
for resultDoc in resultDocs:
    print(divider)
    print(resultDoc.metadata.get("page_label"), resultDoc.page_content[:100] )


In [None]:
# Compare the two Chroma handles created in this notebook (vdb and vdb2):
# object identity first, then the collection and client they wrap.
# The second handle is vdb2; `db_from_file` was never defined in this notebook
# and raised NameError.
print( vdb is vdb2 )
print( vdb == vdb2 )
print( id(vdb) == id(vdb2), id(vdb), id(vdb2) )
print( vdb._collection.id == vdb2._collection.id, vdb._collection.id, vdb2._collection.id )
print('------')
print( type(vdb), type(vdb2) )
print( type(vdb._client), type(vdb2._client))

print('------')
print(vdb._collection.name, vdb2._collection.name)
print(vdb._collection.name == vdb2._collection.name) # True: both use myCollectionName
print(vdb._client.list_collections() )
#print(vdb._client._settings.chroma_db_impl)
#print(vdb2._client._settings.chroma_db_impl)  # both 'duckdb+parquet'
#print(vdb._client._settings.persist_directory == vdb2._client._settings.persist_directory)  # True


In [None]:
#
# 4-3
# find top 3, print similarity score
# List[Document] -> docs[0].metadata, docs[0].page_content
# List[Tuple[Document, float]] -> docs[0][0].metadata, docs[0][1]
question = '북한의 교육과정'
resultDocs = vdb2.similarity_search_with_relevance_scores(question, k=3)
print('유사문서수:', len(resultDocs))

# Each result is indexed as [0] for the document and [1] for its score.
first_pair = resultDocs[0]
print( type(resultDocs), type(first_pair) )
print( type(first_pair[0]), type(first_pair[0].page_content) )
print( type(first_pair[1]), first_pair[1] )

divider = '---' * 10

# Pass 1: score plus a 100-character preview, using index access.
for scored_pair in resultDocs:
    print(divider)
    print(scored_pair[1], scored_pair[0].page_content[:100])

# Pass 2: score plus text length, using tuple unpacking.
for resultDoc, score in resultDocs:
    print(divider)
    print(score, len(resultDoc.page_content) )