In [2]:
import chromadb
import os 
import torch
import openai
import getpass
import pandas as pd 
import numpy as np 
from sentence_transformers import SentenceTransformer, util
from transformers import AutoTokenizer, AutoModel
from llama_index.embeddings import HuggingFaceEmbedding

In [3]:
# Root directory that holds every input file read by this notebook.
data_path = '/workspace/data/'

In [5]:
# Use the OpenAI key from the environment when it is already set and only fall
# back to an interactive prompt when it is missing, so an unattended
# "Restart & Run All" does not block waiting for input.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API Key:")
openai.api_key = os.environ["OPENAI_API_KEY"]

OpenAI API Key: ········


In [6]:
# Load the three scraped product tables (card, deposit, loan) from the
# `financial` sub-directory of `data_path` into separate DataFrames.
card = pd.read_csv(os.path.join(data_path, 'financial', 'SHINHAN BANK_Financial_Product_Scraping_Result_20240104_CARD_vec.csv'))
deposit = pd.read_csv(os.path.join(data_path, 'financial', 'SHINHAN BANK_Financial_Product_Scraping_Result_20240104_DEPOSIT_vec.csv'))
loan = pd.read_csv(os.path.join(data_path, 'financial', 'SHINHAN BANK_Financial_Product_Scraping_Result_20240104_LOAN_vec.csv'))

In [7]:
# Print the column index of each table, separated by a blank line.
print(f'card table columns: {card.columns}', end='\n\n')
print(f'deposit table columns: {deposit.columns}', end='\n\n')
print(f'loan table columns: {loan.columns}', end='\n\n')

card table columns: Index(['카드_고유키', '카드설명'], dtype='object')

deposit table columns: Index(['보증_고유키', '상품설명'], dtype='object')

loan table columns: Index(['대출_고유키', '상품특징', '대출신청자격', '상품설명'], dtype='object')



In [8]:
# Count missing values per column of the card table.
card.isna().sum()

카드_고유키      0
카드설명      110
dtype: int64

In [9]:
# Hugging Face model id shared by the tokenizer, the encoder and both
# embedding wrappers created in this notebook.
model_name = 'kakaobank/kf-deberta-base'

In [10]:
# Load the tokenizer and the encoder for `model_name` from the Hugging Face Hub.
# These two objects are what `get_emb` is called with later in the notebook.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

In [11]:
# `embedding_functions` has to be in scope before this cell runs; importing it
# here keeps the cell working on a fresh kernel executed top to bottom.
from chromadb.utils import embedding_functions

# Never keep an access token in the notebook source: read it from the
# environment and prompt only when it is missing.
# NOTE(review): the token that used to be hardcoded here has been exposed in
# the file history and should be revoked and replaced.
if not os.environ.get("HUGGINGFACE_API_KEY"):
    os.environ["HUGGINGFACE_API_KEY"] = getpass.getpass("Hugging Face API Key:")

# Chroma embedding function for `model_name`, authenticated with the token above.
hugging_face_ef = embedding_functions.HuggingFaceEmbeddingFunction(
    api_key=os.environ["HUGGINGFACE_API_KEY"],
    model_name=model_name
)

##### HuggingFaceEmbedding Func <-> AutoTokenizer Embedding 값 일치 여부 확인 

In [12]:
# Embed one string through the Chroma Hugging Face embedding function and show
# the first three values after averaging over the first axis of the squeezed
# result (presumably the token axis — TODO confirm the returned layout).
# NOTE(review): the text embedded here is the model id string itself, while
# the local `get_emb` check in this notebook embeds '새로운 세상'. The two
# printed vectors therefore come from different inputs and cannot be compared
# directly — confirm which input was intended.
tmp_emb = hugging_face_ef(['kakaobank/kf-deberta-base'])
np.mean(np.squeeze(tmp_emb), axis=0)[:3]

array([0.71556843, 0.85814735, 0.33519265])

In [17]:
def get_emb(tokenizer, model, doc):
    """Embed `doc` and return the first pooled vector as a list of floats.

    The input is tokenized with padding and truncation into PyTorch tensors,
    passed through `model` with gradient tracking disabled, and reduced with
    `mean_pooling` using the tokenizer's attention mask. Only row 0 of the
    pooled result is returned, so a batch input yields just its first vector.
    """
    tokens = tokenizer(doc, padding=True, truncation=True, return_tensors='pt')

    with torch.no_grad():
        outputs = model(**tokens)

    pooled = mean_pooling(outputs, tokens['attention_mask'])
    return [float(value) for value in pooled[0]]

In [18]:
def get_metadata(columns, data, idx):
    """Build the metadata dict for one row.

    Args:
        columns: iterable of column names to include.
        data: mapping-like table indexed as ``data[col][idx]``.
        idx: row key passed to the second index lookup.

    Returns:
        A dict mapping each name in `columns` to ``data[col][idx]``.

    Raises:
        Whatever ``data[col][idx]`` raises for an unknown column or row.
    """
    # A plain lookup per column. The previous version used `assert` inside a
    # bare `except` as control flow, which returns an empty dict when asserts
    # are stripped (`python -O`).
    return {col: data[col][idx] for col in columns}

In [19]:
def mean_pooling(model_output, attention_mask):
    """Average token embeddings along dim 1, weighting each token by the mask.

    `model_output[0]` is taken as the token embeddings. The attention mask is
    expanded to the same size and used as a weight, so positions with mask 0
    contribute nothing. The divisor is clamped to at least 1e-9 so an all-zero
    mask does not divide by zero.
    """
    hidden = model_output[0]
    mask = attention_mask.unsqueeze(-1).expand(hidden.size()).float()

    weighted_sum = (hidden * mask).sum(dim=1)
    token_count = mask.sum(dim=1).clamp(min=1e-9)
    return weighted_sum / token_count

In [20]:
# Show the first three values of the locally computed embedding for a sample phrase.
get_emb(tokenizer, model, '새로운 세상')[:3]

[-0.5291522741317749, 0.8925473690032959, -1.7507115602493286]

In [21]:
# Display the embedding-function object to confirm it is defined.
hugging_face_ef

<chromadb.utils.embedding_functions.HuggingFaceEmbeddingFunction at 0x7fa180dff1c0>

### ChromaDB 

In [32]:
from chromadb.utils import embedding_functions

In [None]:
# Connect to a Chroma server over HTTP at a fixed host and port.
chroma_client = chromadb.HttpClient(host='172.19.240.1', port=8000)

In [None]:
# Fetch the three collections named after the product tables from the HTTP client.
chroma_card = chroma_client.get_collection(name='card')
chroma_loan = chroma_client.get_collection(name='loan')
chroma_deposit = chroma_client.get_collection(name='deposit')

In [24]:
# List the collections known to `db`.
# NOTE(review): `db` is first assigned in a later cell
# (`db = chromadb.PersistentClient(...)`), so this cell fails on a fresh
# kernel run top to bottom.
db.list_collections()

[]

In [None]:
# Show the loan table's columns before choosing which ones to index.
loan.columns

In [None]:
# Columns copied into each record's metadata by `get_metadata`.
meta_col = ['대출_고유키']
# Columns whose text is embedded and stored by the ingestion loop.
vector_col = ['상품특징']

In [None]:
# Running counter that the ingestion loop increments to build string record ids.
start_idx = 0 

In [None]:
# `get_metadata` looks rows up by label, so the labels must equal the row
# positions used below. This does not depend on the column, so it is done once.
loan.reset_index(inplace=True, drop=True)

for col in vector_col:
    documents = loan[col].values.tolist()

    ids, docs, embeddings, metadatas = [], [], [], []
    skipped = 0

    for idx, doc in enumerate(documents):
        # Non-string cells (e.g. NaN for a missing value) cannot be embedded
        # or stored as documents, so skip them explicitly instead of relying
        # on an exception.
        if not isinstance(doc, str):
            skipped += 1
            continue

        try:
            emb = get_emb(tokenizer, model, doc)
        except Exception as exc:
            # Stay best-effort, but report what was dropped and why. A bare
            # `except` would also swallow KeyboardInterrupt.
            skipped += 1
            print(f'row {idx} skipped: {exc!r}')
            continue

        # Ids keep counting across columns (and across re-runs unless
        # `start_idx` is reset), so they stay unique within the collection.
        start_idx += 1
        ids.append(str(start_idx))
        docs.append(doc)
        embeddings.append(emb)
        metadatas.append(get_metadata(meta_col, loan, idx))

    # Avoid sending an empty batch when every row was skipped.
    if ids:
        chroma_features.add(
            ids=ids,
            documents=docs,
            embeddings=embeddings,
            metadatas=metadatas
        )
    if skipped:
        print(f'column: {col} skipped rows: {skipped}')
    print(f'column: {col}완료, chroma_feat 데이터 개수: {chroma_features.count()}')

In [None]:
# Embed a short query string locally and ask the collection for the three
# nearest records.
# NOTE(review): `chroma_desc` is not assigned anywhere in the visible notebook;
# confirm which collection this query is meant to target.
txt = ' 대출'
emb = get_emb(tokenizer, model, txt)

results = chroma_desc.query(
    query_embeddings = emb,
    n_results = 3
)
results

### Persistent Directory 

In [57]:
from llama_index import VectorStoreIndex, SimpleDirectoryReader, ServiceContext 
from llama_index.vector_stores import ChromaVectorStore 
from llama_index.storage.storage_context import StorageContext 
from pathlib import Path
from llama_index import download_loader

In [26]:
# Open an on-disk Chroma client rooted at the given directory.
db = chromadb.PersistentClient(path='/workspace/db/chroma')

In [27]:
# llama_index embedding wrapper built for the same `model_name`.
embed_model = HuggingFaceEmbedding(model_name=model_name)

In [29]:
# List the collections stored in the persistent client.
db.list_collections()

[Collection(name=features)]

In [28]:
# Open the 'features' collection, creating it only on first use.
# `create_collection` raises once the collection already exists, which makes
# the cell fail on every re-run against the same persistent directory.
chroma_features = db.get_or_create_collection(name='features', embedding_function=hugging_face_ef)

In [105]:
# Read the card CSV with SimpleDirectoryReader and show the text of the first
# document it returns.
reader = SimpleDirectoryReader(input_files=['/workspace/data/financial/SHINHAN BANK_Financial_Product_Scraping_Result_20240104_CARD_vec.csv'])
reader.load_data()[0].text

'카드_0, nan\n카드_1, 바다위새로운세상휴먼라이프크루즈여행을위한맞춤혜택\n카드_2, 7개업종중선택한1개업종가맹점에서카드사용시7%청구할인\n카드_3, 법인/개인사업자에최적화된혜택을드리는카드\n카드_4, 국고보조금집행시사용되는카드\n카드_5, 국고보조금집행시사용되는카드로써국고보조금을교부받는보조사업자에게발급해드리는카드\n카드_6, 국가바우처통합카드\n카드_7, 특별한1%의당신을위해남다른품격과가치의시작\n카드_8, 특별한1%의당신을위해남다른품격과가치의시작\n카드_9, 성공비즈니스를위한든든한파트너\n카드_10, ILoveDokdo\n카드_11, 대한민국우리의땅독도를지킵시다\n카드_12, 신용카드고유기능을이용하는법인신용카드로포인트약정기업회원연안여객포인트적립,바다마트청구할인\n카드_13, 행복한생활,쇼핑의즐거움\n카드_14, 생활도Green,카드도Green일상속에서손쉽게녹색생활을실천하고그린카드로우리가족생활비절약은물론다양한혜택까지~\n카드_15, 투명하고효율적인정부예산집행정부관서모든예산과목의운영경비를정부구매카드로지출하세요.\n카드_16, 회원의카드이용금액에따라제휴업체인대한항공의스카이패스마일리지를적립해주는상품\n카드_17, 수협은행제휴학교전용체크카드\n카드_18, 국고보조금집행시사용되는카드로써국고보조금을교부받는보조사업자및대상자에게발급해드리는체크카드\n카드_19, 국고보조금집행시사용되는카드로써국고보조금을교부받는보조사업자및대상자에게발급해드리는체크카드\n카드_20, 국가바우처통합카드\n카드_21, 생활도Green,카드도Green일상속에서손쉽게녹색생활을실천하고그린카드로우리가족생활비절약은물론다양한혜택까지~\n카드_22, 제주도청에서민간(개인및단체)에지원되는보조금예산에대한집행을수협은행에서발행된보조금전용카드를통하여집행하도록발행된카드\n카드_23, 코웨이 렌탈요금 청구할인!\n카드_24, SK매직 렌탈요금 청구할인!\n카드_25, LG전자 구독요금 청구할인과 장기할부 서비스!\n카드_26, 바디프랜드 렌탈요금 청구할인과 장기할부 서비스!\n카드_27, 넥센타이어 렌탈요금 청구할인!\

In [95]:
# Download the SimpleCSVReader loader class and bind it to the name used on
# the next line. It was previously bound to `PandasCSVReader`, which left
# `SimpleCSVReader` undefined and made the instantiation raise NameError.
SimpleCSVReader = download_loader("SimpleCSVReader")

# Parse the card CSV into documents with the downloaded loader.
loader = SimpleCSVReader(encoding='utf-8')
documents = loader.load_data(file=Path('/workspace/data/financial/SHINHAN BANK_Financial_Product_Scraping_Result_20240104_CARD_vec.csv'))

In [100]:
# Download the PandasCSVReader loader class and parse the card CSV with it.
# This rebinds both `loader` and `documents`.
PandasCSVReader = download_loader("PandasCSVReader")

loader = PandasCSVReader()
documents = loader.load_data(file=Path('/workspace/data/financial/SHINHAN BANK_Financial_Product_Scraping_Result_20240104_CARD_vec.csv'))

In [101]:
# Display the loaded documents. This prints every document in full.
documents

[Document(id_='e117e2fe-466a-4817-9f7c-1e6e98929190', embedding=None, metadata={}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, hash='dbdc7054fdaf98e91be0f7bdd125389080e0d815a1cb1af76d3a89a928b4f6c6', text='카드_0, nan\n카드_1, 바다위새로운세상휴먼라이프크루즈여행을위한맞춤혜택\n카드_2, 7개업종중선택한1개업종가맹점에서카드사용시7%청구할인\n카드_3, 법인/개인사업자에최적화된혜택을드리는카드\n카드_4, 국고보조금집행시사용되는카드\n카드_5, 국고보조금집행시사용되는카드로써국고보조금을교부받는보조사업자에게발급해드리는카드\n카드_6, 국가바우처통합카드\n카드_7, 특별한1%의당신을위해남다른품격과가치의시작\n카드_8, 특별한1%의당신을위해남다른품격과가치의시작\n카드_9, 성공비즈니스를위한든든한파트너\n카드_10, ILoveDokdo\n카드_11, 대한민국우리의땅독도를지킵시다\n카드_12, 신용카드고유기능을이용하는법인신용카드로포인트약정기업회원연안여객포인트적립,바다마트청구할인\n카드_13, 행복한생활,쇼핑의즐거움\n카드_14, 생활도Green,카드도Green일상속에서손쉽게녹색생활을실천하고그린카드로우리가족생활비절약은물론다양한혜택까지~\n카드_15, 투명하고효율적인정부예산집행정부관서모든예산과목의운영경비를정부구매카드로지출하세요.\n카드_16, 회원의카드이용금액에따라제휴업체인대한항공의스카이패스마일리지를적립해주는상품\n카드_17, 수협은행제휴학교전용체크카드\n카드_18, 국고보조금집행시사용되는카드로써국고보조금을교부받는보조사업자및대상자에게발급해드리는체크카드\n카드_19, 국고보조금집행시사용되는카드로써국고보조금을교부받는보조사업자및대상자에게발급해드리는체크카드\n카드_20, 국가바우처통합카드\n카드_21, 생활도Green,카드도Green일상속에서손쉽게녹색생

In [99]:
# Inspect the metadata attached to the first loaded document.
documents[0].metadata

{}

In [81]:
# Re-read the card table, keep only the rows that have a card description, and
# collect those descriptions as a plain Python list.
data = pd.read_csv(
    os.path.join(
        data_path,
        'financial',
        'SHINHAN BANK_Financial_Product_Scraping_Result_20240104_CARD_vec.csv',
    )
).dropna(subset=['카드설명'])
documents = data['카드설명'].values.tolist()

In [83]:
# Write only the description column to a scratch CSV, without the index, so
# the CSV loaders below can be tried on a single-column file.
tmp = data[['카드설명']]
tmp.to_csv(os.path.join(data_path, 'tmp.csv'), index=False)

In [None]:
# Display the filtered card table.
data

In [88]:
from langchain.document_loaders.csv_loader import CSVLoader

In [107]:
# Load the scratch CSV with whatever `loader` currently refers to.
# NOTE(review): read top to bottom, `loader` is the PandasCSVReader instance
# created earlier; the CSVLoader is only bound in the following cell. Confirm
# the intended loader.
documents = loader.load_data(file=Path('/workspace/data/tmp.csv'))

In [90]:
# Load the scratch CSV with langchain's CSVLoader. This rebinds `loader` and
# `documents`.
loader = CSVLoader(file_path='/workspace/data/tmp.csv')
documents = loader.load()

In [91]:
# Display the documents produced by the CSV loader. This prints every row.
documents

[Document(page_content='카드설명: 바다위새로운세상휴먼라이프크루즈여행을위한맞춤혜택', metadata={'source': '/workspace/data/tmp.csv', 'row': 0}),
 Document(page_content='카드설명: 7개업종중선택한1개업종가맹점에서카드사용시7%청구할인', metadata={'source': '/workspace/data/tmp.csv', 'row': 1}),
 Document(page_content='카드설명: 법인/개인사업자에최적화된혜택을드리는카드', metadata={'source': '/workspace/data/tmp.csv', 'row': 2}),
 Document(page_content='카드설명: 국고보조금집행시사용되는카드', metadata={'source': '/workspace/data/tmp.csv', 'row': 3}),
 Document(page_content='카드설명: 국고보조금집행시사용되는카드로써국고보조금을교부받는보조사업자에게발급해드리는카드', metadata={'source': '/workspace/data/tmp.csv', 'row': 4}),
 Document(page_content='카드설명: 국가바우처통합카드', metadata={'source': '/workspace/data/tmp.csv', 'row': 5}),
 Document(page_content='카드설명: 특별한1%의당신을위해남다른품격과가치의시작', metadata={'source': '/workspace/data/tmp.csv', 'row': 6}),
 Document(page_content='카드설명: 특별한1%의당신을위해남다른품격과가치의시작', metadata={'source': '/workspace/data/tmp.csv', 'row': 7}),
 Document(page_content='카드설명: 성공비즈니스를위한든든한파트너', metadata={'source': '/workspace/da