### 임베딩

### 학습 내용
 * 텍스트 임베딩(Text Embedding) 이해하기
 * 여러 입력에 대한 임베딩
 * 시멘틱 탐색
 * 코사인 유사도 예제

### 사전 준비
 * 구글 코랩 환경은 일정 시간이후에 초기화가 되기 때문에 두가지 작업을 매번 수행해야 함.
   * chatgpt.env 파일 생성이 필요.
     * 준비된 chatgpt.env를 내용을 변경하여 업로드 하거나 또는 API_KEY와 ORG_ID를 확인하여 생성한다.
   * pip install openai 설치

In [1]:
!pip install openai

Collecting openai
  Downloading openai-1.3.7-py3-none-any.whl (221 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/221.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━[0m [32m215.0/221.4 kB[0m [31m7.0 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m221.4/221.4 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
Collecting httpx<1,>=0.23.0 (from openai)
  Downloading httpx-0.25.2-py3-none-any.whl (74 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.0/75.0 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
Collecting httpcore==1.* (from httpx<1,>=0.23.0->openai)
  Downloading httpcore-1.0.2-py3-none-any.whl (76 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.9/76.9 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting h11<0.15,>=0.13 (from httpcore==1.*->httpx<1,>=0.23.0->openai)
  Downloading h11-0.14.0-py

In [2]:
import matplotlib
import plotly
import scipy
import sklearn

### 텍스트 임베딩 이해하기

In [6]:
import os
from openai import OpenAI

def init_api():
    with open("chatgpt.env") as env:
       for line in env:
           key, value = line.strip().split("=")
           os.environ[key] = value

init_api()

client = OpenAI(api_key  = os.environ.get("API_KEY"),
                organization  = os.environ.get("ORG_ID"))


response = client.embeddings.create(
  model="text-embedding-ada-002",
  input="I am a programmer"
)

print(response)


CreateEmbeddingResponse(data=[Embedding(embedding=[-0.016873637214303017, -0.019692588597536087, -0.011235730722546577, -0.016432758420705795, -0.01112885121256113, 0.023152820765972137, -0.02554425410926342, -0.009338616393506527, -0.010273813270032406, -0.024194898083806038, 0.021135466173291206, 0.0031446018256247044, 0.005230426322668791, -0.01664651744067669, -0.009051376953721046, -0.012965846806764603, 0.019879627972841263, -0.0010036673629656434, -0.00904469657689333, -0.0035136709921061993, 0.004235109314322472, 0.01567124016582966, 0.021295785903930664, -0.0122377285733819, -0.013046006672084332, 0.0064628832042217255, -0.004739447962492704, -0.020267067477107048, 0.005140246823430061, -0.015150200575590134, 0.0337739922106266, -0.005577785428613424, 0.0023563639260828495, -0.01963914930820465, -0.014295163564383984, -0.016539636999368668, -0.0017735353903844953, 0.011322570964694023, 0.0017451455350965261, -0.0028523525688797235, 0.029953042045235634, 0.029498804360628128, -

In [7]:
response

CreateEmbeddingResponse(data=[Embedding(embedding=[-0.016873637214303017, -0.019692588597536087, -0.011235730722546577, -0.016432758420705795, -0.01112885121256113, 0.023152820765972137, -0.02554425410926342, -0.009338616393506527, -0.010273813270032406, -0.024194898083806038, 0.021135466173291206, 0.0031446018256247044, 0.005230426322668791, -0.01664651744067669, -0.009051376953721046, -0.012965846806764603, 0.019879627972841263, -0.0010036673629656434, -0.00904469657689333, -0.0035136709921061993, 0.004235109314322472, 0.01567124016582966, 0.021295785903930664, -0.0122377285733819, -0.013046006672084332, 0.0064628832042217255, -0.004739447962492704, -0.020267067477107048, 0.005140246823430061, -0.015150200575590134, 0.0337739922106266, -0.005577785428613424, 0.0023563639260828495, -0.01963914930820465, -0.014295163564383984, -0.016539636999368668, -0.0017735353903844953, 0.011322570964694023, 0.0017451455350965261, -0.0028523525688797235, 0.029953042045235634, 0.029498804360628128, -

In [8]:
print(response.data[0].embedding)

[-0.016873637214303017, -0.019692588597536087, -0.011235730722546577, -0.016432758420705795, -0.01112885121256113, 0.023152820765972137, -0.02554425410926342, -0.009338616393506527, -0.010273813270032406, -0.024194898083806038, 0.021135466173291206, 0.0031446018256247044, 0.005230426322668791, -0.01664651744067669, -0.009051376953721046, -0.012965846806764603, 0.019879627972841263, -0.0010036673629656434, -0.00904469657689333, -0.0035136709921061993, 0.004235109314322472, 0.01567124016582966, 0.021295785903930664, -0.0122377285733819, -0.013046006672084332, 0.0064628832042217255, -0.004739447962492704, -0.020267067477107048, 0.005140246823430061, -0.015150200575590134, 0.0337739922106266, -0.005577785428613424, 0.0023563639260828495, -0.01963914930820465, -0.014295163564383984, -0.016539636999368668, -0.0017735353903844953, 0.011322570964694023, 0.0017451455350965261, -0.0028523525688797235, 0.029953042045235634, 0.029498804360628128, -0.008984576910734177, -0.0010095123434439301, -0.0

#### 여러 입력에 대한 임베딩

In [9]:
import os
from openai import OpenAI

def init_api():
    with open("chatgpt.env") as env:
       for line in env:
           key, value = line.strip().split("=")
           os.environ[key] = value

init_api()

client = OpenAI(api_key  = os.environ.get("API_KEY"),
                organization  = os.environ.get("ORG_ID"))

response = client.embeddings.create(
	model="text-embedding-ada-002",
	input=["I am a programmer", "I am a writer"]
)

for data in response.data:
  print(data.embedding)


[-0.016873637214303017, -0.019692588597536087, -0.011235730722546577, -0.016432758420705795, -0.01112885121256113, 0.023152820765972137, -0.02554425410926342, -0.009338616393506527, -0.010273813270032406, -0.024194898083806038, 0.021135466173291206, 0.0031446018256247044, 0.005230426322668791, -0.01664651744067669, -0.009051376953721046, -0.012965846806764603, 0.019879627972841263, -0.0010036673629656434, -0.00904469657689333, -0.0035136709921061993, 0.004235109314322472, 0.01567124016582966, 0.021295785903930664, -0.0122377285733819, -0.013046006672084332, 0.0064628832042217255, -0.004739447962492704, -0.020267067477107048, 0.005140246823430061, -0.015150200575590134, 0.0337739922106266, -0.005577785428613424, 0.0023563639260828495, -0.01963914930820465, -0.014295163564383984, -0.016539636999368668, -0.0017735353903844953, 0.011322570964694023, 0.0017451455350965261, -0.0028523525688797235, 0.029953042045235634, 0.029498804360628128, -0.008984576910734177, -0.0010095123434439301, -0.0

### 시맨틱 검색(Semantic Search) 탐색

#### 인증

In [10]:
from openai import OpenAI
import os
import pandas as pd
import numpy as np

def init_api():
    with open("chatgpt.env") as env:
       for line in env:
           key, value = line.strip().split("=")
           os.environ[key] = value

init_api()

client = OpenAI(api_key  = os.environ.get("API_KEY"),
                organization  = os.environ.get("ORG_ID"))


#### words.csv를 만들고, 이를 불러오기
 * words.csv파일의 내용은 책을 보고 생성을 해 주세요.구글 코랩에 업로드를 하거나 직접 words.csv 파일을 생성합니다.

In [11]:
df = pd.read_csv('words.csv')
print(df)

          text
0        apple
1       banana
2       cherry
3          dog
4          cat
5        house
6          car
7         tree
8        phone
9     computer
10  television
11        book
12       music
13        food
14       water
15         sky
16         air
17         sun
18        moon
19        star
20       ocean
21        desk
22         bed
23        sofa
24        lamp
25      carpet
26      window
27        door
28       floor
29     ceiling
30        wall
31       clock
32       watch
33     jewelry
34        ring
35    necklace
36    bracelet
37     earring
38      wallet
39         key
40       photo


In [13]:
# 데이터프레임의 각 단어에 대한 임베딩 가져오기
# 데이터프레임의 'text' 열에서 텍스트 리스트를 생성
text_list = df['text'].tolist()

print(text_list)

['apple',
 'banana',
 'cherry',
 'dog',
 'cat',
 'house',
 'car',
 'tree',
 'phone',
 'computer',
 'television',
 'book',
 'music',
 'food',
 'water',
 'sky',
 'air',
 'sun',
 'moon',
 'star',
 'ocean',
 'desk',
 'bed',
 'sofa',
 'lamp',
 'carpet',
 'window',
 'door',
 'floor',
 'ceiling',
 'wall',
 'clock',
 'watch',
 'jewelry',
 'ring',
 'necklace',
 'bracelet',
 'earring',
 'wallet',
 'key',
 'photo']

### 각 단어별 임베딩

In [16]:
response = client.embeddings.create(
	model="text-embedding-ada-002",
	input= text_list
)

embedding_all = []
for data in response.data:
  print(data.embedding)
  embedding_all.append(data.embedding)

[0.00777884665876627, -0.023069249466061592, -0.007360776886343956, -0.02774341218173504, -0.00457478454336524, 0.012891639955341816, -0.021863015368580818, -0.00858757272362709, 0.01892966963350773, -0.029854323714971542, -0.002796272747218609, 0.02010849118232727, -0.004530236124992371, 0.009129008278250694, -0.02145179733633995, 0.0020303819328546524, 0.030813828110694885, 9.7449759778101e-05, 0.0019172972533851862, -0.025687329471111298, -0.02098575234413147, -0.008066698908805847, 0.021342139691114426, -0.012226839549839497, 0.0009980568429455161, 0.005105939228087664, 0.009999416768550873, -0.00010740891593741253, 0.015845544636249542, -0.012980736792087555, 0.02057453617453575, -0.016160812228918076, -0.01851845346391201, 0.005263572093099356, -0.019286056980490685, -0.009293494746088982, -0.012096621096134186, -0.008854863233864307, -0.005753605160862207, -0.006157968193292618, 0.010540851391851902, 0.007724017836153507, -0.006555477622896433, 0.0005294413422234356, -0.02345305

In [18]:
dat = pd.Series( embedding_all )
dat

0     [0.00777884665876627, -0.023069249466061592, -...
1     [-0.013926557265222073, -0.03288617357611656, ...
2     [0.006517840549349785, -0.019012855365872383, ...
3     [-0.00337603478692472, -0.017694612964987755, ...
4     [-0.007100660353899002, -0.017430506646633148,...
5     [-0.007176146376878023, 0.007186704780906439, ...
6     [-0.007485327776521444, -0.021592551842331886,...
7     [-0.00465209037065506, -0.013106998056173325, ...
8     [-0.0014444905100390315, -0.02283545397222042,...
9     [-0.0030717267654836178, -0.014183077961206436...
10    [-0.004756690934300423, -0.019938383251428604,...
11    [-0.006829570047557354, -0.019116051495075226,...
12    [-0.0018608466489240527, -0.023303421214222908...
13    [0.022320540621876717, -0.026822732761502266, ...
14    [0.019045095890760422, -0.012522426433861256, ...
15    [0.0049745566211640835, -0.0014098514802753925...
16    [0.008967065252363682, -0.023472610861063004, ...
17    [0.02472955547273159, -0.00248753814958035

In [20]:

df['embedding'] = dat
print(df.shape)
df

(41, 2)


Unnamed: 0,text,embedding
0,apple,"[0.00777884665876627, -0.023069249466061592, -..."
1,banana,"[-0.013926557265222073, -0.03288617357611656, ..."
2,cherry,"[0.006517840549349785, -0.019012855365872383, ..."
3,dog,"[-0.00337603478692472, -0.017694612964987755, ..."
4,cat,"[-0.007100660353899002, -0.017430506646633148,..."
5,house,"[-0.007176146376878023, 0.007186704780906439, ..."
6,car,"[-0.007485327776521444, -0.021592551842331886,..."
7,tree,"[-0.00465209037065506, -0.013106998056173325, ..."
8,phone,"[-0.0014444905100390315, -0.02283545397222042,..."
9,computer,"[-0.0030717267654836178, -0.014183077961206436..."


In [22]:
# df의 내용을 csv파일로 저장하기
df.to_csv("embeddings.csv")

#### 시멘틱 탐색 코드

In [32]:
# csv 파일 읽기
df = pd.read_csv('embeddings.csv')

# embedding 열의 값을 문자열에서 Numpy 배열로 변환하여 저장
df['embedding'] = df['embedding'].apply(eval).apply(np.array)

# 사용자로부터 검색 용어를 입력 받는다.
user_search = input('Enter a search term: ')

# 검색 용어에 대한 임베딩을 생성
response = client.embeddings.create(model="text-embedding-ada-002", input= user_search)



Enter a search term: office
<class 'list'>


In [37]:
search_term_embedding = np.array( response.data[0].embedding )
print(search_term_embedding)

[-0.00371421 -0.01933329 -0.0152726  ... -0.00363798  0.00375579
 -0.01516173]


In [38]:
search_term_embedding.shape

(1536,)

In [40]:
df['embedding'][0].shape

(1536,)

In [41]:
# search_term_embedding을 2차원 배열로 변환 (이미 2차원 배열이라면 이 단계는 생략)
search_term_embedding_2d = search_term_embedding.reshape(1, -1)

# 코사인 유사도 계산
df['similarity'] = df['embedding'].apply(lambda x: cosine_similarity([x], search_term_embedding_2d)[0][0])

print(df)

    Unnamed: 0        text                                          embedding  \
0            0       apple  [0.00777884665876627, -0.023069249466061592, -...   
1            1      banana  [-0.013926557265222073, -0.03288617357611656, ...   
2            2      cherry  [0.006517840549349785, -0.019012855365872383, ...   
3            3         dog  [-0.00337603478692472, -0.017694612964987755, ...   
4            4         cat  [-0.007100660353899002, -0.017430506646633148,...   
5            5       house  [-0.007176146376878023, 0.007186704780906439, ...   
6            6         car  [-0.007485327776521444, -0.021592551842331886,...   
7            7        tree  [-0.00465209037065506, -0.013106998056173325, ...   
8            8       phone  [-0.0014444905100390315, -0.02283545397222042,...   
9            9    computer  [-0.0030717267654836178, -0.014183077961206436...   
10          10  television  [-0.004756690934300423, -0.019938383251428604,...   
11          11        book  

In [42]:
df

Unnamed: 0.1,Unnamed: 0,text,embedding,similarity
0,0,apple,"[0.00777884665876627, -0.023069249466061592, -...",0.83024
1,1,banana,"[-0.013926557265222073, -0.03288617357611656, ...",0.805654
2,2,cherry,"[0.006517840549349785, -0.019012855365872383, ...",0.792175
3,3,dog,"[-0.00337603478692472, -0.017694612964987755, ...",0.828722
4,4,cat,"[-0.007100660353899002, -0.017430506646633148,...",0.802095
5,5,house,"[-0.007176146376878023, 0.007186704780906439, ...",0.874353
6,6,car,"[-0.007485327776521444, -0.021592551842331886,...",0.821579
7,7,tree,"[-0.00465209037065506, -0.013106998056173325, ...",0.825751
8,8,phone,"[-0.0014444905100390315, -0.02283545397222042,...",0.853143
9,9,computer,"[-0.0030717267654836178, -0.014183077961206436...",0.86156


#### 시멘틱 탐색 전체 코드

In [46]:
from openai import OpenAI
import os
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def init_api():
    with open("chatgpt.env") as env:
       for line in env:
           key, value = line.strip().split("=")
           os.environ[key] = value

init_api()

client = OpenAI(api_key  = os.environ.get("API_KEY"),
                organization  = os.environ.get("ORG_ID"))

# words.csv 파일 불러오기
df = pd.read_csv('words.csv')


# 데이터프레임의 'text' 열에서 텍스트 리스트를 생성
text_list = df['text'].tolist()

response = client.embeddings.create(
	model="text-embedding-ada-002",
	input= text_list
)

embedding_all = []
for data in response.data:
  print(data.embedding)
  embedding_all.append(data.embedding)

# 임베딩 컬럼 생성
dat = pd.Series( embedding_all )
df['embedding'] = dat
print(df.shape)
print(df)

# 임베딩된 데이터 프레임 csv파일로 저장
df.to_csv("embeddings.csv", index=False)

# 임베딩 파일을 불러오기 및 사용자로부터 입력단어 임베딩 생성
# csv 파일 읽기
df = pd.read_csv('embeddings.csv')

# embedding 열의 값을 문자열에서 Numpy 배열로 변환하여 저장
df['embedding'] = df['embedding'].apply(eval).apply(np.array)

# 사용자로부터 검색 용어를 입력 받는다.
user_search = input('Enter a search term: ')

# 검색 용어에 대한 임베딩을 생성
response = client.embeddings.create(model="text-embedding-ada-002", input= user_search)

# search_term_embedding을 2차원 배열로 변환 (이미 2차원 배열이라면 이 단계는 생략)
search_term_embedding_2d = search_term_embedding.reshape(1, -1)

# 코사인 유사도 계산
df['similarity'] = df['embedding'].apply(lambda x: cosine_similarity([x], search_term_embedding_2d)[0][0])

print(df)

[0.00777884665876627, -0.023069249466061592, -0.007360776886343956, -0.02774341218173504, -0.00457478454336524, 0.012891639955341816, -0.021863015368580818, -0.00858757272362709, 0.01892966963350773, -0.029854323714971542, -0.002796272747218609, 0.02010849118232727, -0.004530236124992371, 0.009129008278250694, -0.02145179733633995, 0.0020303819328546524, 0.030813828110694885, 9.7449759778101e-05, 0.0019172972533851862, -0.025687329471111298, -0.02098575234413147, -0.008066698908805847, 0.021342139691114426, -0.012226839549839497, 0.0009980568429455161, 0.005105939228087664, 0.009999416768550873, -0.00010740891593741253, 0.015845544636249542, -0.012980736792087555, 0.02057453617453575, -0.016160812228918076, -0.01851845346391201, 0.005263572093099356, -0.019286056980490685, -0.009293494746088982, -0.012096621096134186, -0.008854863233864307, -0.005753605160862207, -0.006157968193292618, 0.010540851391851902, 0.007724017836153507, -0.006555477622896433, 0.0005294413422234356, -0.02345305

In [47]:
df

Unnamed: 0,text,embedding,similarity
0,apple,"[0.00777884665876627, -0.023069249466061592, -...",0.83024
1,banana,"[-0.013926557265222073, -0.03288617357611656, ...",0.805654
2,cherry,"[0.006485373713076115, -0.018984820693731308, ...",0.792088
3,dog,"[-0.003338826121762395, -0.017762014642357826,...",0.828595
4,cat,"[-0.007100660353899002, -0.017430506646633148,...",0.802095
5,house,"[-0.007176146376878023, 0.007186704780906439, ...",0.874353
6,car,"[-0.007485327776521444, -0.021592551842331886,...",0.821579
7,tree,"[-0.00465209037065506, -0.013106998056173325, ...",0.825751
8,phone,"[-0.001412850571796298, -0.022908246144652367,...",0.853158
9,computer,"[-0.003079379675909877, -0.014275545254349709,...",0.861701


#### 검색 단어에 대한 상위 유사도 10개 출력하기

In [None]:
# 임베딩 파일을 불러오기 및 사용자로부터 입력단어 임베딩 생성
# csv 파일 읽기
df = pd.read_csv('embeddings.csv')

# embedding 열의 값을 문자열에서 Numpy 배열로 변환하여 저장
df['embedding'] = df['embedding'].apply(eval).apply(np.array)

# 사용자로부터 검색 용어를 입력 받는다.
user_search = input('Enter a search term: ')

# 검색 용어에 대한 임베딩을 생성
response = client.embeddings.create(model="text-embedding-ada-002", input= user_search)

# search_term_embedding을 2차원 배열로 변환 (이미 2차원 배열이라면 이 단계는 생략)
search_term_embedding_2d = search_term_embedding.reshape(1, -1)

# 코사인 유사도 계산
df['similarity'] = df['embedding'].apply(lambda x: cosine_similarity([x], search_term_embedding_2d)[0][0])

# 'similarity'을 기준으로 데이터 프레임 정렬
df = df.sort_values(by='similarity', ascending=False)

# 유사도 10개 확인
print(df.head(10))

In [49]:
df.head(10)

Unnamed: 0,text,embedding,similarity
0,apple,"[0.00777884665876627, -0.023069249466061592, -...",0.83024
1,banana,"[-0.013926557265222073, -0.03288617357611656, ...",0.805654
2,cherry,"[0.006485373713076115, -0.018984820693731308, ...",0.792088
3,dog,"[-0.003338826121762395, -0.017762014642357826,...",0.828595
4,cat,"[-0.007100660353899002, -0.017430506646633148,...",0.802095
5,house,"[-0.007176146376878023, 0.007186704780906439, ...",0.874353
6,car,"[-0.007485327776521444, -0.021592551842331886,...",0.821579
7,tree,"[-0.00465209037065506, -0.013106998056173325, ...",0.825751
8,phone,"[-0.001412850571796298, -0.022908246144652367,...",0.853158
9,computer,"[-0.003079379675909877, -0.014275545254349709,...",0.861701


#### 시멘틱 탐색 - 한글 단어 입력 후, 코사인 유사도 확인해 보기

In [50]:
# 임베딩 파일을 불러오기 및 사용자로부터 입력단어 임베딩 생성
# csv 파일 읽기
df = pd.read_csv('embeddings.csv')

# embedding 열의 값을 문자열에서 Numpy 배열로 변환하여 저장
df['embedding'] = df['embedding'].apply(eval).apply(np.array)

# 사용자로부터 검색 용어를 입력 받는다.
user_search = input('Enter a search term: ')

# 검색 용어에 대한 임베딩을 생성
response = client.embeddings.create(model="text-embedding-ada-002", input= user_search)

# search_term_embedding을 2차원 배열로 변환 (이미 2차원 배열이라면 이 단계는 생략)
search_term_embedding_2d = search_term_embedding.reshape(1, -1)

# 코사인 유사도 계산
df['similarity'] = df['embedding'].apply(lambda x: cosine_similarity([x], search_term_embedding_2d)[0][0])

# 'similarity'을 기준으로 데이터 프레임 정렬
df = df.sort_values(by='similarity', ascending=False)

# 유사도 10개 확인
print(df.head(10))

Enter a search term: 오피스
        text                                          embedding  similarity
21      desk  [0.012808148749172688, -0.020803043618798256, ...    0.890042
5      house  [-0.007176146376878023, 0.007186704780906439, ...    0.874353
9   computer  [-0.003079379675909877, -0.014275545254349709,...    0.861701
28     floor  [0.018671365454792976, -0.021179255098104477, ...    0.861466
8      phone  [-0.001412850571796298, -0.022908246144652367,...    0.853158
11      book  [-0.0068267760798335075, -0.019158145412802696...    0.838070
24      lamp  [0.006781177595257759, -0.008739246986806393, ...    0.833910
27      door  [-0.004832038655877113, -0.02689017727971077, ...    0.833217
40     photo  [0.004301978275179863, -0.031353630125522614, ...    0.833180
13      food  [0.022320540621876717, -0.026822732761502266, ...    0.831016


### 코사인 유사도 예제

#### numpy 라이브러리 이용한 코사인 유사도 예제

In [51]:
# numpy와 numpy.linalg에서 norm을 가져옵니다.
import numpy as np
from numpy.linalg import norm

# 두 벡터를 정의합니다.
A = np.array([2,3,5,2,6,7,9,2,3,4])
B = np.array([3,6,3,1,0,9,2,3,4,5])

# 벡터들을 출력합니다.
print("Vector A: {}".format(A))
print("Vector B: {}".format(B))

# 코사인 유사도를 계산합니다.
cosine = np.dot(A,B)/(norm(A)*norm(B))

# 코사인 유사도를 출력합니다.
print("Cosine Similarity between A and B: {}".format(cosine))


Vector A: [2 3 5 2 6 7 9 2 3 4]
Vector B: [3 6 3 1 0 9 2 3 4 5]
Cosine Similarity between A and B: 0.7539959431593041


#### Scipy 라이브러리를 활용한 예제

In [52]:
# 필요 라이브러리를 불러옵니다.
import numpy as np
from scipy import spatial

# 두 벡터들을 정의합니다.
A = np.array([2,3,5,2,6,7,9,2,3,4])
B = np.array([3,6,3,1,0,9,2,3,4,5])

# 벡터들을 출력합니다.
print("Vector A: {}".format(A))
print("Vector B: {}".format(B))

# 코사인 유사도를 계산합니다.
cosine = 1 - spatial.distance.cosine(A, B)

# 코사인 유사도를 출력합니다.
print("Cosine Similarity between A and B: {}".format(cosine))


Vector A: [2 3 5 2 6 7 9 2 3 4]
Vector B: [3 6 3 1 0 9 2 3 4 5]
Cosine Similarity between A and B: 0.753995943159304


#### 사이킷 런을 이용한 코사인 유사도 출력

In [53]:
# 필요 라이브러리를 불러옵니다.
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# 두 벡터를 정의합니다.
A = np.array([2,3,5,2,6,7,9,2,3,4])
B = np.array([3,6,3,1,0,9,2,3,4,5])

# 벡터들을 출력합니다.
print("Vector A: {}".format(A))
print("Vector B: {}".format(B))

# 코사인 유사도를 계산합니다.
cosine = cosine_similarity([A],[B])

# 코사인 유사도를 출력합니다.
print("Cosine Similarity: {}".format(cosine[0][0]))


Vector A: [2 3 5 2 6 7 9 2 3 4]
Vector B: [3 6 3 1 0 9 2 3 4 5]
Cosine Similarity: 0.7539959431593041
