In [2]:
import getpass
import json
import os

from llama_index import Document, VectorStoreIndex, SimpleDirectoryReader, ServiceContext
from llama_index.vector_stores import ChromaVectorStore
from llama_index.readers.chroma import ChromaReader
from llama_index.storage.storage_context import StorageContext
# from transformers import AutoTokenizer, AutoModel
from llama_index.embeddings import HuggingFaceEmbedding
from llama_index.schema import MetadataMode
from IPython.display import Markdown, display
import chromadb
import pandas as pd
import openai

In [32]:
# Fixed absolute locations for the raw data, the persisted Chroma index, and the config folder.
# (os.path.join with a single argument returns it unchanged, so plain literals are equivalent.)
data_path = '/workspace/data/'
index_path = '/workspace/db/chroma-llama'
config_path = '/workspace/config'

In [39]:
# Scratch cell: reads one line from the user; input() blocks until a line is entered.
# `query` is not referenced again in the visible part of this notebook.
query = input("User: ")

User:  hey


In [38]:
# Echo the value captured by the input() cell.
query

'hey'

In [33]:
# Load the JSON configuration from the config folder.
# `json` must be imported in the notebook's top import cell; without it this cell
# raises NameError on a fresh kernel.
config_file = os.path.join(config_path, 'config.json')
with open(config_file, "r", encoding="utf-8") as f:
    config = json.load(f)

In [36]:
# Inspect the index location stored in the loaded config.
# NOTE(review): the displayed value differs from the hard-coded index_path ('/workspace/db/chroma-llama')
# and spells the folder "chorma" — confirm which path is intended.
config['index_path']

'/rag/db/chorma-llama/'

In [4]:
# Open (or create) the on-disk Chroma database and fetch one collection per text field.
chroma_client = chromadb.PersistentClient(path=index_path)
desc_collection, feature_collection, qualification_collection = (
    chroma_client.get_or_create_collection(collection_name)
    for collection_name in ("desc", "feature", "qualification")
)

In [5]:
# Number of entries currently stored in the "desc" collection.
# NOTE(review): the inline 2566 does not match the 0 displayed below — presumably noted from a populated run; confirm.
desc_collection.count()   # 2566

0

In [6]:
# Wrap each Chroma collection in a LlamaIndex vector store.
desc_store, feature_store, qualification_store = (
    ChromaVectorStore(chroma_collection=collection)
    for collection in (desc_collection, feature_collection, qualification_collection)
)

In [7]:
# One storage context per vector store, so each index writes to its own collection.
desc_storage, feature_storage, qualification_storage = (
    StorageContext.from_defaults(vector_store=store)
    for store in (desc_store, feature_store, qualification_store)
)

In [8]:
from llama_index import Document, VectorStoreIndex 
from llama_index.node_parser import SentenceSplitter

In [9]:
# Load the scraped card products and preview the first row.
card_csv_path = os.path.join(
    data_path,
    'financial',
    'SHINHAN BANK_Financial_Product_Scraping_Result_20240104_CARD_vec.csv',
)
data = pd.read_csv(card_csv_path)
data.head(1)

Unnamed: 0,카드_고유키,상품명,카드설명
0,카드_1,휴먼라이프 S1 수협카드,바다위새로운세상휴먼라이프크루즈여행을위한맞춤혜택


In [10]:
# Count the rows that actually have a card description.
int(data['카드설명'].notna().sum())

1023

In [11]:
# Work on a copy so the original frame is not modified by the row filtering done in a later cell.
nona = data.copy()

In [12]:
# Total row count before any filtering.
len(data)

1129

In [13]:
# Peek at the first three row labels whose card description is missing.
data[data.카드설명.isna()].index[:3]

Index([129, 130, 131], dtype='int64')

In [14]:
# Keep only rows that have a description. Rebinding the result instead of using
# inplace=True avoids mutating the frame in place, and the list form of `subset`
# is also accepted by older pandas releases that reject a bare string.
nona = nona.dropna(subset=['카드설명'])
len(nona)

1023

In [15]:
category = 'card'

# Pull ids, product names and description texts out of the filtered frame as plain lists.
id_list, name_list, text_list = (
    nona[column].values.tolist() for column in ('카드_고유키', '상품명', '카드설명')
)

In [16]:
# Spot-check one description string.
text_list[65]

'에코라서 가능한 혜택, 에코로 쌓는 더 많은 포인트'

In [17]:
# Sanity check: the id and text lists should have the same length.
len(id_list), len(text_list)

(1023, 1023)

In [18]:
# One Document per card description. Category and product name travel as metadata
# but are listed in excluded_llm_metadata_keys.
documents = [
    Document(
        text=card_text,
        doc_id=card_id,
        metadata={"category": category, "name": card_name},
        excluded_llm_metadata_keys=['category', 'name'],
    )
    for card_id, card_name, card_text in zip(id_list, name_list, text_list)
]

In [19]:
# Compare the content of one sample document under the EMBED and LLM metadata modes.
sample_doc = documents[2]
embed_view = sample_doc.get_content(metadata_mode=MetadataMode.EMBED)
llm_view = sample_doc.get_content(metadata_mode=MetadataMode.LLM)
print(f'embedding model see this: {embed_view}', end='\n\n')
print(f'LLM see this: {llm_view}')

embedding model see this: category: card
name: biz top기업 신용카드

법인/개인사업자에최적화된혜택을드리는카드

LLM see this: 법인/개인사업자에최적화된혜택을드리는카드


In [20]:
from llama_index.node_parser import SentenceSplitter 

In [21]:
# Chunking configuration for the sentence splitter
# (alternative noted by the author: chunk_size=1024, chunk_overlap=20).
CHUNK_SIZE = 512
CHUNK_OVERLAP = 30
parser = SentenceSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)

In [22]:
# Hugging Face model used for embeddings, configured with an embedding batch size of 32.
model_name = 'kakaobank/kf-deberta-base'
embed_model = HuggingFaceEmbedding(model_name=model_name, embed_batch_size=32)

In [23]:
# Bundle the splitter and the embedding model. llm=None disables the LLM
# (the message printed below reports that a MockLLM is used instead).
service_context = ServiceContext.from_defaults(node_parser=parser, embed_model=embed_model, llm=None)
# storage_context = StorageContext.from_defaults(persist_dir=persist_dir)

LLM is explicitly disabled. Using MockLLM.


In [24]:
# Number of card documents about to be indexed.
len(documents)

1023

In [25]:
# build index 
index = VectorStoreIndex.from_documents(
    documents, service_context=service_context, storage_context=desc_storage
) 

In [26]:
# Show the id the freshly built index currently carries (a UUID-style value in the output below).
index.index_id

'd842e738-3ab6-41e5-a9ca-9f637d5b7c2a'

In [452]:
# Smoke-test retrieval. The service context was created with llm=None, and the response
# shown below is the filled-in prompt (retrieved context + query) rather than a generated answer.
# NOTE(review): execution count 452 is out of sequence; the context shown looks like loan
# descriptions, so this presumably ran after the loan insert further down — confirm.
index.as_query_engine().query('렌탈 요금 청구').response

'Context information is below.\n---------------------\nIATA 정산시스템을 이용하는 여행사 및 화물대리점에게 항공운임결제를 원활하게 할 수 있도록 지원하는 지급보증상품\n\n개인사업자대출119제도중하나로써채무상환에어려움을겪고있는개인사업자대출채무자에대해장기간분할상환할수있도록지원하는상품※개인사업자119제도:일시적자금사정악화등으로채무상환에어려움을겪고있는개인사업자대출채무자에대해만기연장등을통해채무상환부담을경감해줌으로써연체발생을예방하는제도\n---------------------\nGiven the context information and not prior knowledge, answer the query.\nQuery: 렌탈 요금 청구\nAnswer: '

In [27]:
# Give the index a stable id so that several indexes can be saved to the same folder.
# Earlier, persisting overwrote the existing data instead of adding to it; that is resolved —
# the cause was that no storage_context had been specified.
# StorageContext.from_defaults(persist_dir=...) can only be used once a docstore.json file exists,
# so on the very first index build leave that argument out and let docstore.json be created first.
index.set_index_id('scrapping_desc_chroma')

In [28]:
# Confirm the index id was updated.
index.index_id

'scrapping_desc_chroma'

### Insert loan data into the index

In [444]:
# Load the scraped loan products and preview the first row.
loan_csv_path = os.path.join(
    data_path,
    'financial',
    'SHINHAN BANK_Financial_Product_Scraping_Result_20240104_LOAN_vec.csv',
)
data = pd.read_csv(loan_csv_path)
data.head(1)

Unnamed: 0,대출_고유키,금융상품명,상품특징,대출신청자격,상품설명
0,대출_0,혁신성장산업 지원자금,9대 선도산업 등 정부발표 혁신성장 분야에 대한 효율적 금융지원,○ (지원대상) 다음 중 하나에 해당하는 기업 혁신성장 정책금융협의회 선정 「혁신성...,9대 선도산업 등 정부발표 혁신성장 분야에 대한 효율적 금융지원


In [445]:
# Keep only loan rows that have a product description. dropna returns a new frame,
# so no explicit copy or inplace=True is needed and `data` is left unmodified.
nona = data.dropna(subset=['상품설명'])
len(nona)

1343

In [446]:
category = 'loan'

# Pull ids, product names and description texts out of the filtered frame as plain lists.
id_list, name_list, desc_list = (
    nona[column].values.tolist() for column in ('대출_고유키', '금융상품명', '상품설명')
)

In [447]:
# One Document per loan description. Category and product name travel as metadata
# but are listed in excluded_llm_metadata_keys.
documents = [
    Document(
        text=loan_text,
        doc_id=loan_id,
        metadata={"category": category, "name": loan_name},
        excluded_llm_metadata_keys=['category', 'name'],
    )
    for loan_id, loan_name, loan_text in zip(id_list, name_list, desc_list)
]

In [448]:
# Add each loan document to the existing index, one insert call per document.
# NOTE(review): re-running this cell presumably inserts the same documents again — confirm how duplicates are handled.
for doc in documents:
    index.insert(doc)

### Save Index 

In [450]:
# Persist the index's storage context into a 'desc' subfolder of the index directory.
persist_dir = os.path.join(index_path, 'desc')
index.storage_context.persist(persist_dir=persist_dir)

### Index Load 