In [3]:
from llama_index import Document, VectorStoreIndex, SimpleDirectoryReader, ServiceContext
from llama_index.vector_stores import ChromaVectorStore
from llama_index.readers.chroma import ChromaReader
from llama_index.storage.storage_context import StorageContext
# from transformers import AutoTokenizer, AutoModel
from llama_index.embeddings import HuggingFaceEmbedding
from llama_index.schema import MetadataMode
from IPython.display import Markdown, display
import chromadb
import pandas as pd 
import openai
import os
import getpass

In [4]:
# Base locations used throughout the notebook: raw input files and persisted indexes.
# (A single-argument os.path.join returns its argument unchanged, so plain literals suffice.)
data_path = '/workspace/data/'
index_path = '/workspace/db/local'

In [6]:
# Ask for the key interactively (input is not echoed) and store the same value
# in the process environment and on the openai module.
os.environ["OPENAI_API_KEY"] = openai.api_key = getpass.getpass("OpenAI API Key:")

OpenAI API Key: ········


In [7]:
from llama_index import Document, VectorStoreIndex 
from llama_index.node_parser import SentenceSplitter

In [8]:
# Load the scraped loan-product table and preview its first row.
loan_csv_path = os.path.join(
    data_path,
    'financial',
    'SHINHAN BANK_Financial_Product_Scraping_Result_20240104_LOAN_vec.csv',
)
data = pd.read_csv(loan_csv_path)
data.head(1)

Unnamed: 0,대출_고유키,상품특징,대출신청자격,상품설명
0,대출_0,9대 선도산업 등 정부발표 혁신성장 분야에 대한 효율적 금융지원,○ (지원대상) 다음 중 하나에 해당하는 기업 혁신성장 정책금융협의회 선정 「혁신성...,9대 선도산업 등 정부발표 혁신성장 분야에 대한 효율적 금융지원


In [9]:
# Every document built from this table is tagged with the same product category.
category = 'loan'

# Pull the unique loan key and the product-feature text out as plain Python lists.
id_list = data['대출_고유키'].tolist()
desc_list = data['상품특징'].tolist()

In [10]:
# One Document per row: the feature text is the body and the loan key is the doc id.
# 'category' is kept in the metadata but excluded from what the LLM is shown.
documents = [
    Document(
        text=feature_text,
        doc_id=loan_key,
        metadata={"category": category},
        excluded_llm_metadata_keys=['category'],
    )
    for loan_key, feature_text in zip(id_list, desc_list)
]

In [11]:
# Compare how one sample document is rendered for the embedding model versus the LLM.
sample_doc = documents[2]
embed_view = sample_doc.get_content(metadata_mode=MetadataMode.EMBED)
llm_view = sample_doc.get_content(metadata_mode=MetadataMode.LLM)

print(f'embedding model see this: {embed_view}', end='\n\n')
print(f'LLM see this: {llm_view}')

embedding model see this: category: loan

동산·채권 담보대출 활성화 지원

LLM see this: 동산·채권 담보대출 활성화 지원


In [12]:
from llama_index.node_parser import SentenceSplitter 

In [13]:
# Sentence-aware chunking: 512-sized chunks with an overlap of 30.
# (An alternative considered earlier was chunk_size=1024, chunk_overlap=20.)
parser = SentenceSplitter(
    chunk_size=512,
    chunk_overlap=30,
)

# Split the documents into nodes with the parser configured above.
nodes = parser.get_nodes_from_documents(documents)

In [14]:
# Hugging Face model used to embed the documents, batching 32 inputs at a time.
model_name = 'kakaobank/kf-deberta-base'
embed_model = HuggingFaceEmbedding(
    model_name=model_name,
    embed_batch_size=32,
)

In [17]:
from llama_index import ServiceContext
from llama_index.storage.docstore import SimpleDocumentStore
from llama_index.vector_stores import SimpleVectorStore
from llama_index.storage.index_store import SimpleIndexStore

In [18]:
# Target folder for persisting this index: <index_path>/features.
persist_dir=os.path.join(index_path, 'features')

In [15]:
# Bundle the sentence splitter and the Hugging Face embedding model into one
# service context that is handed to the index builder.
service_context = ServiceContext.from_defaults(
    node_parser=parser,
    embed_model=embed_model,
)
# Not used yet: storage_context = StorageContext.from_defaults(persist_dir=persist_dir)

In [19]:
# Build the vector index from the documents using the service context above.
# No storage_context is passed here (it was left commented out in the original call).
index = VectorStoreIndex.from_documents(
    documents,
    service_context=service_context,
)

In [20]:
# Show the id currently assigned to the freshly built index.
index.index_id

'6b59cf13-6f4c-4420-b175-83c1bee5f32b'

In [None]:
# Smoke-test retrieval: send one query ('보증 자격', i.e. guarantee eligibility) through the
# index's query engine and display the source chunks that backed the answer.
# The query response exposes `get_formatted_sources()` for this; it has no
# `get_context()` method, so the previous call would raise AttributeError.
index.as_query_engine().query('보증 자격').get_formatted_sources()

In [None]:
# Set an explicit index_id so that multiple indexes can be saved to the same folder.
# Earlier problem: persisting overwrote the existing index instead of adding to it.
# Resolved: that happened because no storage_context had been specified.
# StorageContext.from_defaults(persist_dir=...) can only be used once a docstore.json
# file exists, so on the very first index creation skip it and persist first so that
# docstore.json gets created.
index.set_index_id('loan_tmp')

In [None]:
# Display the index id again to confirm the value set via set_index_id.
index.index_id

### Index data insert

### Save Index 

In [None]:
# Write the index's storage context to disk under persist_dir.
index.storage_context.persist(persist_dir=persist_dir)

### Index Load 

In [None]:
# Display the base directory that the per-field index folders are resolved against.
index_path

In [None]:
from llama_index import StorageContext, load_index_from_storage, load_indices_from_storage

# One storage context per persisted index folder under index_path,
# created in the order: desc, features, qualification.
desc_storage_context, features_storage_context, qualification_storage_context = (
    StorageContext.from_defaults(persist_dir=os.path.join(index_path, folder))
    for folder in ('desc', 'features', 'qualification')
)

In [None]:
# Load the single 'loan_tmp' index from the features store.
features_idx = load_index_from_storage(
    storage_context=features_storage_context,
    index_id='loan_tmp',
)

# Load two indexes by id from the desc store.
# NOTE(review): load_indices_from_storage presumably returns a list of indexes, so
# desc_idx would be a collection rather than a single index — confirm before use.
desc_idx = load_indices_from_storage(
    storage_context=desc_storage_context,
    index_ids=['card_tmp', 'deposit_tmp'],
)

# No index_id is given here.
# NOTE(review): this presumably only works if the qualification store holds
# exactly one index — confirm.
qualification_idx = load_index_from_storage(storage_context=qualification_storage_context)