In [None]:
from IPython.display import display, HTML
# Custom CSS for the notebook UI: widens the container to 90%, sets the
# code/output/markdown font sizes, and shrinks DataFrame tables.
_notebook_css = """
<style>
div.container{width:90% !important;}
div.cell.code_cell.rendered{width:100%;}
div.input_prompt{padding:2px;}
div.CodeMirror {font-family:Consolas; font-size:10pt;}
div.text_cell_render.rendered_html{font-size:10pt;}
div.output {font-size:10pt; font-weight:bold;}
div.input {font-family:Consolas; font-size:10pt;}
div.prompt {min-width:70px;}
div#toc-wrapper{padding-top:120px;}
div.text_cell_render ul li{font-size:11pt;padding:4px;}
table.dataframe{font-size:10px;}
</style>
"""
display(HTML(_notebook_css))

# 문장 -> 벡터(1차원 숫자 배열[8.1,9.1,2,5,4,3,2...])
- OpenAI API : https://platform.openai.com/ 의 키(OPENAI_API_KEY)를 .env에 등록
- Upstage : https://console.upstage.ai/ 의 키(UPSTAGE_API_KEY)를 .env에 등록
# 1. 환경 변수 load

In [1]:
from dotenv import load_dotenv
# Load the variables from the .env file into the process environment;
# the captured output below shows the call returned True.
load_dotenv()

True

# 2. 유사도 계산하는 방법 : https://www.pinecone.io/learn/vector-similarity/
    1. 유클리드 거리 : 두 벡터간의 거리가 가까운지
    2. 코사인 유사도 : 두 벡터간 방향이 유사한지
    3. dot product(내적) : 두 벡터간의 곱을 사용하여 크기와 방향을 모두 고려


In [2]:
import numpy as np
def cosine_similarity(vec1, vec2):
    """Return the cosine similarity between two vectors.

    Args:
        vec1: First vector (any array-like of numbers).
        vec2: Second vector, with the same length as ``vec1``.

    Returns:
        The dot product divided by the product of the two vector norms.
        Returns ``0.0`` when either vector has zero norm, because the
        angle between the vectors is undefined in that case.
    """
    # Cast to float64 up front: np.dot computes in the input dtype, so
    # small-integer arrays (e.g. int8) would overflow and silently yield a
    # wrong similarity, while np.linalg.norm already works in floating point.
    vec1 = np.asarray(vec1, dtype=np.float64)
    vec2 = np.asarray(vec2, dtype=np.float64)
    norm_vec1 = np.linalg.norm(vec1)  # length (L2 norm) of the vector
    norm_vec2 = np.linalg.norm(vec2)
    if norm_vec1 == 0 or norm_vec2 == 0:
        return 0.0
    return np.dot(vec1, vec2) / (norm_vec1 * norm_vec2)


# 3. openAI API 의 embedding model 사용

In [3]:
from openai import OpenAI
# Client for the OpenAI API. No key is passed explicitly, so the client is
# expected to pick up OPENAI_API_KEY from the environment loaded from .env.
openai_client = OpenAI()

In [4]:
# Request the embedding of the word "king" from text-embedding-3-large.
response = openai_client.embeddings.create(
    model="text-embedding-3-large",
    input="king",
)

In [9]:
import numpy as np
# Turn the returned embedding into a NumPy array, then show its shape and values.
king_vecter = np.array(response.data[0].embedding)
print(king_vecter.shape, king_vecter, sep="\n")

(3072,)
[ 0.01040417  0.02499519 -0.0014776  ...  0.00835009  0.01049861
 -0.00254005]


In [10]:
# Request the embedding of the word "queen" from the same OpenAI model.
queen_response = openai_client.embeddings.create(
    model="text-embedding-3-large",
    input="queen",
)

In [12]:
# Turn the "queen" embedding into a NumPy array, then show its shape and values.
queen_vector = np.array(queen_response.data[0].embedding)
print(queen_vector.shape, queen_vector, sep="\n")

(3072,)
[-0.01385735  0.0008602  -0.0167823  ...  0.00017693  0.01159847
  0.00638929]


In [16]:
# Cosine similarity between the OpenAI embeddings of "king" and "queen".
king_queen_similarity = cosine_similarity(king_vecter, queen_vector)
print('king과 queen의 유사도 :',king_queen_similarity)

king과 queen의 유사도 : 0.5552268369726675


In [14]:
# Request the embedding of the word "slave" from the same OpenAI model.
slave_response = openai_client.embeddings.create(
    model="text-embedding-3-large",
    input="slave",
)

In [15]:
# Turn the "slave" embedding into a NumPy array, then show its shape and values.
slave_vecter = np.array(slave_response.data[0].embedding)
print(slave_vecter.shape, slave_vecter, sep="\n")

(3072,)
[-0.01999537  0.00620363  0.01191717 ...  0.00094749 -0.02679118
 -0.0058524 ]


In [17]:
# Cosine similarity between the OpenAI embeddings of "king" and "slave".
king_slave_similarity = cosine_similarity(king_vecter, slave_vecter)
print('king과 slave의 유사도 :', king_slave_similarity)

king과 slave의 유사도 : 0.2947745074537996


In [40]:
# 한국어 문장을 벡터로 바꿔도 유사도는 비슷해야 할 듯

In [18]:
# Request the embedding of the Korean word "왕" (king) from the OpenAI model.
kor_king_response = openai_client.embeddings.create(
    model="text-embedding-3-large",
    input="왕",
)

In [20]:
# Convert the "왕" embedding to a NumPy array and print its shape only.
kor_king_vecter = np.array(kor_king_response.data[0].embedding)
print(kor_king_vecter.shape)

(3072,)


In [22]:
# Request the embedding of the Korean word "여왕" (queen) from the OpenAI model.
kor_queen_response = openai_client.embeddings.create(
    model="text-embedding-3-large",
    input="여왕",
)

In [23]:
# Convert the "여왕" embedding to a NumPy array and print its shape only.
kor_queen_vecter = np.array(kor_queen_response.data[0].embedding)
print(kor_queen_vecter.shape)

(3072,)


In [None]:
# Similarity between "왕" (king) and "여왕" (queen); the bare expression is
# shown as the cell output.
cosine_similarity(kor_king_vecter, kor_queen_vecter)

np.float64(0.48733449549538954)

In [25]:
# Request the embedding of the Korean word "거지" from the OpenAI model.
# NOTE(review): the variable is named "slave", but "거지" means "beggar"; the
# English counterpart above embeds "slave" ("노예"). Confirm this is intended,
# since the Korean and English comparisons are otherwise not like-for-like.
kor_slave_response = openai_client.embeddings.create(
    input="거지",
    model="text-embedding-3-large"
)

In [26]:
# Convert the "거지" embedding to a NumPy array and print its shape only.
kor_slave_vecter = np.array(kor_slave_response.data[0].embedding)
print(kor_slave_vecter.shape)

(3072,)


In [None]:
# Similarity between "왕" (king) and "거지" (beggar); the bare expression is
# shown as the cell output.
cosine_similarity(kor_king_vecter, kor_slave_vecter)

np.float64(0.2552452064791607)

In [28]:
# Cross-language similarity between English "king" and Korean "왕",
# both embedded with the OpenAI model.
cosine_similarity(king_vecter, kor_king_vecter)

np.float64(0.5474873912140233)

# 4. upstage의 embedding model 사용
- 한국어 embedding에서는 openai보다 성능이 훨씬 좋다

In [29]:
import os
# Build a client for Upstage, whose API is reached through the OpenAI SDK by
# pointing base_url at Upstage's endpoint.
upstage_api_key = os.getenv("UPSTAGE_API_KEY")
if not upstage_api_key:
    # Fail fast: with api_key=None the OpenAI client falls back to the
    # OPENAI_API_KEY environment variable, which would send the OpenAI key to
    # Upstage's endpoint and surface only as a confusing authentication error.
    raise RuntimeError(
        "UPSTAGE_API_KEY is not set; add it to .env before creating the Upstage client."
    )
upstage_client = OpenAI(
    api_key=upstage_api_key,
    base_url="https://api.upstage.ai/v1"
)

In [30]:
# Request the embedding of "king" from Upstage's "embedding-query" model.
up_king_response = upstage_client.embeddings.create(
    model="embedding-query",
    input="king",
)

In [33]:
# Turn the Upstage "king" embedding into a NumPy array, then show its shape and values.
up_king_vecter = np.array(up_king_response.data[0].embedding)
print(up_king_vecter.shape, up_king_vecter, sep="\n")

(4096,)
[-0.01187134 -0.02058411 -0.00674438 ... -0.01082611  0.00244713
  0.01517487]


In [34]:
# Request the embedding of "queen" from Upstage's "embedding-query" model.
up_queen_response = upstage_client.embeddings.create(
    model="embedding-query",
    input="queen",
)

In [35]:
# Turn the Upstage "queen" embedding into a NumPy array, then show its shape and values.
up_queen_vecter = np.array(up_queen_response.data[0].embedding)
print(up_queen_vecter.shape, up_queen_vecter, sep="\n")

(4096,)
[-0.0016222  -0.00952148 -0.00471878 ...  0.00985718 -0.00732803
  0.0259552 ]


In [36]:
# Similarity between the Upstage embeddings of "king" and "queen"; the bare
# expression is shown as the cell output.
cosine_similarity(up_king_vecter, up_queen_vecter)

np.float64(0.6277983746920601)

In [37]:
# Request the embedding of the Korean word "왕" (king) from Upstage's
# "embedding-query" model.
up_kor_king_response = upstage_client.embeddings.create(
    model="embedding-query",
    input="왕",
)

In [38]:
# Turn the Upstage "왕" embedding into a NumPy array, then show its shape and values.
up_kor_king_vecter = np.array(up_kor_king_response.data[0].embedding)
print(up_kor_king_vecter.shape, up_kor_king_vecter, sep="\n")

(4096,)
[-0.01210022 -0.02249146 -0.01314545 ... -0.00024557  0.00358391
  0.01416779]


In [39]:
# Cross-language similarity between English "king" and Korean "왕" using the
# Upstage embeddings; the bare expression is shown as the cell output.
cosine_similarity(up_king_vecter, up_kor_king_vecter)

np.float64(0.8521901935963604)