In [2]:
import chromadb
import os 
import torch
import openai
import getpass
import pandas as pd 
import numpy as np 
from sentence_transformers import SentenceTransformer, util
from transformers import AutoTokenizer, AutoModel
from llama_index.embeddings import HuggingFaceEmbedding

In [3]:
data_path = os.path.join('/workspace/data/')

In [5]:
os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API Key:")
openai.api_key = os.environ["OPENAI_API_KEY"]

OpenAI API Key: ········


In [6]:
card = pd.read_csv(os.path.join(data_path, 'financial', 'SHINHAN BANK_Financial_Product_Scraping_Result_20240104_CARD_vec.csv'))
deposit = pd.read_csv(os.path.join(data_path, 'financial', 'SHINHAN BANK_Financial_Product_Scraping_Result_20240104_DEPOSIT_vec.csv'))
loan = pd.read_csv(os.path.join(data_path, 'financial', 'SHINHAN BANK_Financial_Product_Scraping_Result_20240104_LOAN_vec.csv'))

In [7]:
print(f'card table columns: {card.columns}', end='\n\n')
print(f'deposit table columns: {deposit.columns}', end='\n\n')
print(f'loan table columns: {loan.columns}', end='\n\n')

card table columns: Index(['카드_고유키', '카드설명'], dtype='object')

deposit table columns: Index(['보증_고유키', '상품설명'], dtype='object')

loan table columns: Index(['대출_고유키', '상품특징', '대출신청자격', '상품설명'], dtype='object')



In [8]:
card.isna().sum()

카드_고유키      0
카드설명      110
dtype: int64

In [9]:
model_name = 'kakaobank/kf-deberta-base'

In [10]:
# Load model from HuggingFace Hub
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

In [11]:
hugging_face_ef = embedding_functions.HuggingFaceEmbeddingFunction(
	api_key='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',
	model_name=model_name
)

##### HuggingFaceEmbedding Func <-> AutoTokenizer Embedding 값 일치 여부 확인 

In [12]:
tmp_emb = hugging_face_ef(['kakaobank/kf-deberta-base'])
np.mean(np.squeeze(tmp_emb), axis=0)[:3]

array([0.71556843, 0.85814735, 0.33519265])

In [17]:
def get_emb(tokenizer, model, doc):
    encoded_input = tokenizer(doc, padding=True, truncation=True, return_tensors='pt')
    with torch.no_grad():
        model_output = model(**encoded_input)
    
    sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
    sentence_embeddings= list(map(float, sentence_embeddings[0]))
    return sentence_embeddings

In [18]:
def get_metadata(columns, data, idx):
    meta_dict = dict()
    for col in columns:
        try:
            assert meta_dict[col]
        except:
            meta_dict[col] = data[col][idx]
    return meta_dict

In [19]:
# Mean Pooling - Take attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0] # First element of model_output contains all token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)

In [20]:
get_emb(tokenizer, model, '새로운 세상')[:3]

[-0.5291522741317749, 0.8925473690032959, -1.7507115602493286]

In [21]:
hugging_face_ef

<chromadb.utils.embedding_functions.HuggingFaceEmbeddingFunction at 0x7fa180dff1c0>

### ChromaDB 

In [32]:
from chromadb.utils import embedding_functions

In [109]:
chroma_client = chromadb.HttpClient(host='172.19.240.1', port=8000)

In [112]:
chroma_client.list_collections()

[Collection(name=features),
 Collection(name=qualification),
 Collection(name=desc)]

In [115]:
chroma_feat = chroma_client.get_collection(name='features')
chroma_qualification = chroma_client.get_collection(name='qualification')
chroma_desc = chroma_client.get_collection(name='desc')

In [24]:
db.list_collections()

[]

In [None]:
loan.columns

In [None]:
meta_col = ['대출_고유키']
vector_col = ['상품특징']

In [None]:
start_idx = 0 

In [None]:
for col in vector_col: 
    # tb_docs = deposit.dropna(subset=col, axis=0)
    loan.reset_index(inplace=True, drop=True)
    documents = loan[col].values.tolist()
    
    ids = []; docs = []; embeddings = []; metadatas = []
    
    for idx in range(len(loan)):
        try:
            emb = get_emb(tokenizer, model, documents[idx])
        except:
            continue 
        start_idx += 1
        ids.append(str(start_idx))
        docs.append(documents[idx])
        
        # print(np.shape(emb))
        embeddings.append(emb)
        meta_dict = get_metadata(meta_col, loan, idx)
        metadatas.append(meta_dict)

    chroma_features.add(
    ids=ids,
    documents=docs, 
    embeddings=embeddings,
    metadatas=metadatas
    )
    print(f'column: {col}완료, chroma_feat 데이터 개수: {chroma_features.count()}')

In [141]:
txt = '떠나자 부산으로'
emb = get_emb(tokenizer, model, txt)

results = chroma_desc.query(
    query_embeddings = emb,
    n_results = 1
)

In [142]:
results

{'ids': [['1016']],
 'distances': [[295.2231750488281]],
 'embeddings': None,
 'metadatas': [[{'보증_고유키': '보증_1025'}]],
 'documents': [['월급의일부를모아목돈을만들어재테크의기반을만들고싶으신가요?그러나,많은투자상품가운데어떤상품을해야될지난감할때가있으시죠?망설이지마세요.재테크와목돈마련의시작은바로정기적금입니다.비과세종합저축으로도가능한웰컴저축은행의정기적금!재테크의시작!웰컴저축은행과시작하세요.예금자보호영업점']],
 'uris': None,
 'data': None}

In [143]:
txt = ' 대출'
emb = get_emb(tokenizer, model, txt)

results = chroma_desc.query(
    query_embeddings = emb,
    n_results = 1
)
results

{'ids': [['47c36ce8-c0f0-4736-9b03-8b4e06d3865b']],
 'distances': [[603.967529296875]],
 'embeddings': None,
 'metadatas': [[{'_node_content': '{"id_": "47c36ce8-c0f0-4736-9b03-8b4e06d3865b", "embedding": null, "metadata": {"category": "card"}, "excluded_embed_metadata_keys": [], "excluded_llm_metadata_keys": ["category"], "relationships": {"1": {"node_id": "\\uce74\\ub4dc_322", "node_type": "4", "metadata": {"category": "card"}, "hash": "46ff272690ff62091e2fd8a24e10ad4cb078ac8077aa01c672bdfdd006699c62", "class_name": "RelatedNodeInfo"}, "2": {"node_id": "3af27b94-8b76-4a4b-98ca-a6a8f1d6a9fe", "node_type": "1", "metadata": {"category": "card"}, "hash": "e9fde142fae42945cd30091350eb7e4c61f8c8a2348624d0952d53c9ca0e56d4", "class_name": "RelatedNodeInfo"}, "3": {"node_id": "070c4640-8a1e-4ac3-a8dc-fb900c1c04ef", "node_type": "1", "metadata": {}, "hash": "185587989fa8ed42ffcb8c5c3a33ebc3a8c78c4d83b5f25b638ff7793ddb064a", "class_name": "RelatedNodeInfo"}}, "hash": "c9b157bd5c1b82983e332a2064