In [27]:
import numpy as np
from numpy import dot
from numpy.linalg import norm

import pandas as pd
from langchain.embeddings import OpenAIEmbeddings

import os
from dotenv import load_dotenv

load_dotenv("../../_apikeys.env")
api_key = os.getenv("Doogie.2ndKey")
os.environ['OPENAI_API_KEY'] = api_key

embeddings = OpenAIEmbeddings( model="text-embedding-ada-002" )
query_result = embeddings.embed_query( "나는 배가 아주 많이 고픈것 같다. 그렇기 때문에 나는 식사를 할 것인지, 아니면 굶을 것인지 고민이다.")

print(query_result);


[-0.0016603805322588795, -0.041218784197535475, 0.015363365594808095, -0.016526278305263795, -0.01843862257515151, 0.0031172514798863994, -0.019265583014553705, -0.005733804379919784, -0.024330713144755014, -0.010052732311755031, -0.0069064079675411096, 0.019769510489811383, -0.00991059851533661, -0.029202023868998648, -0.005019905571100432, 0.015079098001971254, 0.028659331022433298, 0.0009997816642657884, 0.0047162557582710474, -0.02338746031672015, -0.012307490368795927, -0.010227169032058869, 0.02097118670892958, -0.01849030725336818, -0.002543870855629904, 0.015453813781687262, 0.018748732507096688, -0.011474069344616653, -0.011693730158360074, -0.004981141596776639, 0.047627725274595994, -0.01345102039359779, 0.0005507682282907341, 0.008011174483680474, 0.011758336937453494, 0.0043318486357281204, 0.003472585738101805, 0.004994062766330806, -0.015440892612133095, -0.01533752325569976, -0.013244280749408536, -0.00796595039024089, 0.009697397355047686, -0.013851579443744719, 0.0175

In [2]:
print(query_result.__sizeof__())
print(len(query_result))

12328
1536


In [28]:
somedata = [ '저는 배가 고파요',
             '저기 배가 지나 가네요',
             '굶어서 허기가 지네요',
             '허기 워리라는 게임이 즐거워요',
             '스팀에서 재미있는 것 해야지',
             '스팀으로 연어구이 해 먹을 거야',
             ]
#'저기 있는 배가 맛있을 것 같아요'
df = pd.DataFrame(somedata, columns=['SampleText'])
df

Unnamed: 0,SampleText
0,저는 배가 고파요
1,저기 배가 지나 가네요
2,굶어서 허기가 지네요
3,허기 워리라는 게임이 즐거워요
4,스팀에서 재미있는 것 해야지
5,스팀으로 연어구이 해 먹을 거야


In [29]:
def get_embedding(text):
    return embeddings.embed_query(text)

df['EmbedVec'] = df.apply(lambda row: get_embedding(row.SampleText), axis=1)
df


Unnamed: 0,SampleText,EmbedVec
0,저는 배가 고파요,"[-0.016637360179694062, -0.021788898203996463,..."
1,저기 배가 지나 가네요,"[-0.0052657917147770515, -0.024830033460230574..."
2,굶어서 허기가 지네요,"[-0.006198894665266342, -0.006981533449335017,..."
3,허기 워리라는 게임이 즐거워요,"[-0.013712686737581248, -0.00977461896104763, ..."
4,스팀에서 재미있는 것 해야지,"[-0.0070508659839374314, -0.01768647064774106,..."
5,스팀으로 연어구이 해 먹을 거야,"[-0.0020257684433236355, -0.028546121671408266..."


In [30]:
for embedVec in df.EmbedVec:
    print( len(embedVec), embedVec[:2] )

1536 [-0.016637360179694062, -0.021788898203996463]
1536 [-0.0052657917147770515, -0.024830033460230574]
1536 [-0.006198894665266342, -0.006981533449335017]
1536 [-0.013712686737581248, -0.00977461896104763]
1536 [-0.0070508659839374314, -0.01768647064774106]
1536 [-0.0020257684433236355, -0.028546121671408266]


In [33]:
def cos_sim(a, b):
    return dot(a, b) / (norm(a) * norm(b))

def calc_sim(df, queryValue):
    queryValue_embedding = get_embedding(queryValue)
    #df['Similarity'] = df.EmbedVec.apply( cos_sim, args=(queryValue_embedding,))
    df['Similarity'] = df.EmbedVec.apply( lambda row: cos_sim(row, queryValue_embedding) )
    #df['Similarity'] = df.EmbedVec.apply( lambda row: cos_sim(np.array(row), np.array(queryValue_embedding) ) )
    #df['Similarity'] = df.apply( lambda row: cos_sim(np.array(row.EmbedVec), np.array(queryValue_embedding) ), axis=1)
    #df['Similarity'] = df.apply( lambda row: cos_sim(row.EmbedVec, queryValue_embedding), axis=1)
    #df['Similarity'] = df.apply( lambda row: cos_sim(row.EmbedVec, queryValue_embedding), axis=1)

calc_sim(df, "아무것도 안 먹었더니 꼬르륵 소리가 나네")
df


Unnamed: 0,SampleText,EmbedVec,Similarity
0,저는 배가 고파요,"[-0.016637360179694062, -0.021788898203996463,...",0.812184
1,저기 배가 지나 가네요,"[-0.0052657917147770515, -0.024830033460230574...",0.809394
2,굶어서 허기가 지네요,"[-0.006198894665266342, -0.006981533449335017,...",0.836796
3,허기 워리라는 게임이 즐거워요,"[-0.013712686737581248, -0.00977461896104763, ...",0.785841
4,스팀에서 재미있는 것 해야지,"[-0.0070508659839374314, -0.01768647064774106,...",0.788797
5,스팀으로 연어구이 해 먹을 거야,"[-0.0020257684433236355, -0.028546121671408266...",0.826971


In [34]:
def get_top3(df):
    df_top3 = df.sort_values(by=['Similarity'], ascending=False).head(3)
    return df_top3

result_top3 = get_top3(df)
result_top3

Unnamed: 0,SampleText,EmbedVec,Similarity
2,굶어서 허기가 지네요,"[-0.006198894665266342, -0.006981533449335017,...",0.836796
5,스팀으로 연어구이 해 먹을 거야,"[-0.0020257684433236355, -0.028546121671408266...",0.826971
0,저는 배가 고파요,"[-0.016637360179694062, -0.021788898203996463,...",0.812184
