In [3]:
from IPython.display import display, HTML
# Notebook-wide CSS: widens the layout and sets font sizes for code, markdown and outputs.
# NOTE(review): the selectors look like classic Jupyter Notebook class names — confirm they
# still match in the front end you are using.
notebook_css = """
<style>
div.container{width:99% !important;}
div.cell.code_cell.rendered{width:100%;}
div.input_prompt{padding:0px;}
div.CodeMirror {font-family:Consolas; font-size:12pt;}
div.text_cell_render.rendered_html{font-size:12pt;}
div.text_cell_render ul li, div.text_cell_render ol li p, code{font-size:12pt; line-height:30px;}
div.output {font-size:12pt; font-weight:bold;}
div.input {font-family:Consolas; font-size:12pt;}
div.prompt {min-width:70px;}
div#toc-wrapper{padding-top:120px;}
div.text_cell_render ul li{font-size:12pt;padding:5px;}
table.dataframe{font-size:12px;}
</style>
"""
# Rendering the HTML object is what actually applies the stylesheet to the page.
display(HTML(notebook_css))

# 문장 -> Embedding Vector(1차원 숫자 배열)
- OpenAI API 키를 OPENAI_API_KEY라는 이름으로 .env에 추가 (사용 모델: text-embedding-3-large)
- upstage(https://console.upstage.ai/docs/getting-started)의 API 키를 UPSTAGE_API_KEY라는 이름으로 .env에 추가
# 1. 환경변수 load

In [46]:
from dotenv import load_dotenv
import os
# Load the variables defined in the .env file into the process environment.
# No dotenv_path is passed, so python-dotenv's default lookup is used.
load_dotenv()
# Upstage API key; this is None when UPSTAGE_API_KEY is not defined.
upstage_key = os.environ.get('UPSTAGE_API_KEY')

# 2. 유사도 계산하는 방법
- 1. 유클리드 거리 : 두 벡터간 거리가 가까운지
- 2. cos 유사도 : 두 벡터간 방향이 유사한지
- 3. dot product(내적) : 두 벡터의 내적을 사용하여 크기와 방향을 모두 고려

In [5]:
import numpy as np
def cosine_similarity(vec1, vec2):
    """Return the cosine similarity between two vectors.

    Both inputs are converted to float64 arrays before any arithmetic, so
    plain Python lists are accepted and narrow integer dtypes (e.g. int8)
    cannot silently overflow inside the dot product.

    Args:
        vec1: Array-like of numbers (intended for 1-D vectors).
        vec2: Array-like of numbers with the same length as ``vec1``.

    Returns:
        The dot product divided by the product of the two Euclidean norms,
        or ``0.0`` when either vector has zero norm (the cosine is undefined
        there, so the division is skipped).
    """
    a = np.asarray(vec1, dtype=np.float64)
    b = np.asarray(vec2, dtype=np.float64)
    norm_a = np.linalg.norm(a)
    norm_b = np.linalg.norm(b)
    # Guard against division by zero for all-zero vectors.
    if norm_a == 0 or norm_b == 0:
        return 0.0
    return np.dot(a, b) / (norm_a * norm_b)

# 3. openAI API의 embedding model 사용
- text-embedding-3-large

In [6]:
from openai import OpenAI
# Client for the OpenAI API. No key is passed explicitly, so the SDK is expected to pick up
# OPENAI_API_KEY from the environment populated by load_dotenv() — TODO confirm.
openai_client = OpenAI()

In [21]:
# Embed the English "king" sentence with text-embedding-3-large (this API call is billed).
response = openai_client.embeddings.create(
    model='text-embedding-3-large',
    input="The king is the prince's father",
)

In [22]:
# Show the token counts reported for the embedding request.
response.usage

Usage(prompt_tokens=7, total_tokens=7)

In [23]:
# Take the first embedding in the response and wrap it in a NumPy array for vector math.
# NOTE: `response` is reassigned by later cells, so run this right after the "king" request.
king_embedding = response.data[0].embedding
king_vector = np.array(king_embedding)
king_vector

array([ 0.03364377,  0.03378009, -0.00439973, ...,  0.01654925,
        0.01048982, -0.00589925], shape=(3072,))

In [14]:
# Embed the English "queen" sentence with the same OpenAI model.
queen_response = openai_client.embeddings.create(
    model='text-embedding-3-large',
    input="The queen is the prince's mother",
)

In [15]:
# Show the token counts reported for the "queen" embedding request.
queen_response.usage

Usage(prompt_tokens=7, total_tokens=7)

In [17]:
# Take the first embedding in the "queen" response and wrap it in a NumPy array.
queen_embedding = queen_response.data[0].embedding
queen_vector = np.array(queen_embedding)
queen_vector

array([ 0.01908419,  0.00117847, -0.00398764, ...,  0.00576628,
        0.00822409,  0.01082648], shape=(3072,))

In [24]:
# Cosine similarity between the English "king" and "queen" sentence embeddings.
k_q_similarity = cosine_similarity(vec1=king_vector, vec2=queen_vector)
print(k_q_similarity)

0.6953682106144796


In [25]:
# Embed an unrelated English sentence to serve as a low-similarity comparison.
slave_response = openai_client.embeddings.create(
    model='text-embedding-3-large',
    input='The slave begs',
)

In [26]:
# Take the first embedding in the "slave" response and wrap it in a NumPy array.
# NOTE: `slave_response` is reassigned by a later Korean cell; run this right after the
# English request, otherwise it picks up the Korean sentence's embedding.
slave_embedding = slave_response.data[0].embedding
slave_vector = np.array(slave_embedding)
slave_vector

array([-0.00868506,  0.00686   ,  0.00754387, ..., -0.00823628,
       -0.0112752 ,  0.00194153], shape=(3072,))

In [28]:
# Cosine similarity between the "king" sentence and the unrelated "slave" sentence.
king_slave_similarity = cosine_similarity(vec1=king_vector, vec2=slave_vector)
print(king_slave_similarity)

0.18329285510786322


In [29]:
# Embed the Korean "king" sentence with the same OpenAI model.
# NOTE: this overwrites `response`, which earlier held the English "king" request.
response = openai_client.embeddings.create(
    model='text-embedding-3-large',
    input="왕은 왕자의 아버지다",
)

In [30]:
# Take the first embedding in the Korean "king" response and wrap it in a NumPy array.
kor_king_embedding = response.data[0].embedding
kor_king_vector = np.array(kor_king_embedding)
kor_king_vector

array([0.0146395 , 0.00812012, 0.0030026 , ..., 0.00722758, 0.01146712,
       0.00845967], shape=(3072,))

In [31]:
# Embed the Korean "queen" sentence with the same OpenAI model.
kor_queen_response = openai_client.embeddings.create(
    model='text-embedding-3-large',
    input="여왕은 왕자의 어머니다",
)

In [33]:
# Take the first embedding in the Korean "queen" response and wrap it in a NumPy array.
kor_queen_embedding = kor_queen_response.data[0].embedding
kor_queen_vector = np.array(kor_queen_embedding)
kor_queen_vector

array([-0.00633463,  0.01295298,  0.00534164, ..., -0.00638844,
       -0.00050108,  0.0337521 ], shape=(3072,))

In [36]:
# Cosine similarity between the Korean "king" and "queen" sentence embeddings.
kor_similarity = cosine_similarity(vec1=kor_king_vector, vec2=kor_queen_vector)
print(kor_similarity)

0.5743527165183386


In [38]:
# Embed the Korean "slave" sentence with the same OpenAI model.
# NOTE: this reuses the name `slave_response` from the English example, so the English
# response is no longer reachable after this cell runs.
slave_response = openai_client.embeddings.create(
    model='text-embedding-3-large',
    input='노예가 구걸한다',
)

In [39]:
# Take the first embedding in the Korean "slave" response and wrap it in a NumPy array.
kor_slave_embedding = slave_response.data[0].embedding
kor_slave_vector = np.array(kor_slave_embedding)
kor_slave_vector

array([ 0.00524604,  0.0030029 ,  0.00265426, ..., -0.01131434,
       -0.00262631,  0.01126829], shape=(3072,))

In [40]:
# Cosine similarity between the Korean "king" sentence and the unrelated Korean "slave" sentence.
print(cosine_similarity(vec1=kor_king_vector, vec2=kor_slave_vector))

0.13567145325103566


In [42]:
# Cross-lingual check: English vs. Korean "king" sentence, both embedded by the OpenAI model.
print(cosine_similarity(vec1=king_vector, vec2=kor_king_vector))

0.6247880288755789


# 4. upstage의 embedding model 사용
- 한국어 문장에서는 openai보다 성능이 좋음

In [47]:
# Upstage is reached through the OpenAI SDK by pointing base_url at the Upstage endpoint.
if not upstage_key:
    # With api_key=None the OpenAI SDK falls back to the OPENAI_API_KEY environment
    # variable, which would then be sent to the Upstage endpoint. Stop early instead
    # of leaking that key and failing later with a confusing authentication error.
    raise RuntimeError(
        "UPSTAGE_API_KEY is not set; add it to .env and re-run the environment cell."
    )
upstage_client = OpenAI(
    api_key=upstage_key,
    base_url='https://api.upstage.ai/v1'
)

In [52]:
# Embed the English "king" sentence with Upstage's 'embedding-query' model.
# NOTE: this overwrites `response` again. The input text also omits "the" before "prince's",
# unlike the sentence used in the OpenAI example.
response = upstage_client.embeddings.create(
    model='embedding-query',
    input="The king is prince's father",
)

In [53]:
# Take the first embedding in the Upstage "king" response and wrap it in a NumPy array.
up_king_embedding = response.data[0].embedding
up_king_vector = np.array(up_king_embedding)
up_king_vector

array([ 6.99263182e-05, -3.04292757e-02, -2.85108201e-03, ...,
       -7.22163310e-03,  1.04733631e-02,  3.70790288e-02], shape=(4096,))

In [54]:
# Embed the English "queen" sentence with Upstage and convert the result to a NumPy array.
response_q = upstage_client.embeddings.create(
    model='embedding-query',
    input="The queen is prince's mother",
)
up_queen_embedding = response_q.data[0].embedding
up_queen_vector = np.array(up_queen_embedding)
up_queen_vector

array([-0.00610672, -0.01780602,  0.00733072, ...,  0.00306   ,
       -0.00264868,  0.04816383], shape=(4096,))

In [55]:
# Cosine similarity between the English "king" and "queen" sentences under the Upstage model.
cosine_similarity(vec1=up_king_vector, vec2=up_queen_vector)

np.float64(0.6695324547493329)

In [56]:
# Embed the Korean "king" sentence with Upstage and convert the result to a NumPy array.
response_kor_k = upstage_client.embeddings.create(
    model='embedding-query',
    input="왕은 왕자의 아버지다",
)
up_kor_king_embedding = response_kor_k.data[0].embedding
up_kor_king_vector = np.array(up_kor_king_embedding)
up_kor_king_vector

array([-0.00043774, -0.02250671, -0.01112366, ..., -0.00112629,
        0.01818848,  0.0147934 ], shape=(4096,))

In [57]:
# Embed the Korean "queen" sentence with Upstage and convert the result to a NumPy array.
response_kor_q = upstage_client.embeddings.create(
    model='embedding-query',
    input="여왕은 왕자의 어머니다",
)
up_kor_queen_embedding = response_kor_q.data[0].embedding
up_kor_queen_vector = np.array(up_kor_queen_embedding)
up_kor_queen_vector

array([-0.01259613, -0.00148392, -0.0004282 , ..., -0.00082159,
       -0.00722885,  0.0259552 ], shape=(4096,))

In [58]:
# Cosine similarity between the Korean "king" and "queen" sentences under the Upstage model.
cosine_similarity(vec1=up_kor_king_vector, vec2=up_kor_queen_vector)

np.float64(0.6502499821716275)

In [59]:
# Cross-lingual check: English vs. Korean "king" sentence under the Upstage model.
cosine_similarity(vec1=up_king_vector, vec2=up_kor_king_vector)

np.float64(0.8495946530745893)

In [60]:
# Cross-lingual check: English vs. Korean "queen" sentence under the Upstage model.
cosine_similarity(vec1=up_queen_vector, vec2=up_kor_queen_vector)

np.float64(0.8024567619934905)