### Embedding Dataset for Similarity Searches

In [21]:
# Explicit %pip magic: works even when IPython automagic is disabled and
# installs into the environment of the running kernel.
%pip install pandas numpy -q

Note: you may need to restart the kernel to use updated packages.


In [22]:
import os

import openai
from dotenv import find_dotenv, load_dotenv
from openai import OpenAI

# Load variables from the .env file located by find_dotenv(); override=True is
# passed so values from the file win over variables already in the environment.
load_dotenv(find_dotenv(), override=True)

# Hand the key to the openai module (os.getenv yields None when it is unset).
openai.api_key = os.getenv('OPENAI_API_KEY')

In [23]:
import pandas as pd

# Fixed seed so the shuffle below produces the same row order on every run.
SHUFFLE_SEED = 42

df = pd.read_csv('words.csv')
# frac=1 samples every row, i.e. a full shuffle; random_state makes it repeatable.
df = df.sample(frac=1, random_state=SHUFFLE_SEED)
df

Unnamed: 0,text
2,Of the Ordering of the Shire
1,Concerning Pipeweed
24,The Great River
13,Strider
12,At the Sign of the Prancing Pony
25,The Breaking of the Fellowship
0,Concerning Hobbits
23,Farewell to Lórien
10,In the House of Tom Bombadil
4,A Long-expected Party


In [24]:
from functools import lru_cache


@lru_cache(maxsize=1)
def _default_client():
    """Create the shared OpenAI client on first use and reuse it afterwards."""
    return OpenAI()


def get_embedding(text, model = 'text-embedding-3-small', client = None):
    """Return the embedding of `text` produced by an OpenAI embedding model.

    Args:
        text: String to embed. Newline characters are replaced with spaces
            before the request is sent.
        model: Name of the embedding model passed to the API.
        client: Optional object exposing `embeddings.create(input=..., model=...)`.
            When omitted, a single OpenAI client is created on the first call
            and reused on every later call, instead of building a new client
            per call (this function is applied once per DataFrame row).
            The reused client keeps whatever configuration it was built with.

    Returns:
        The `embedding` attribute of the first item in the response data.
    """
    if client is None:
        client = _default_client()

    text = text.replace('\n', ' ')

    response = client.embeddings.create(
        input = text,
        model = model
    )
    return response.data[0].embedding

In [25]:
# Embed every text; get_embedding is invoked once per row (one API request each).
df['embedding'] = df['text'].apply(get_embedding)

In [26]:
# Show the frame again to confirm the new 'embedding' column is populated.
df

Unnamed: 0,text,embedding
2,Of the Ordering of the Shire,"[-0.001108536496758461, -0.0020498200319707394..."
1,Concerning Pipeweed,"[0.04896913468837738, -0.014637514017522335, 0..."
24,The Great River,"[0.032057587057352066, 0.02481781505048275, 0...."
13,Strider,"[0.02960376814007759, -0.004861738532781601, -..."
12,At the Sign of the Prancing Pony,"[-0.00913687888532877, 0.02207891084253788, 0...."
25,The Breaking of the Fellowship,"[0.0021339815575629473, 0.021011510863900185, ..."
0,Concerning Hobbits,"[-0.0024255753960460424, -0.003355183871462941..."
23,Farewell to Lórien,"[0.0322844572365284, -0.027852751314640045, -0..."
10,In the House of Tom Bombadil,"[0.03586048632860184, -0.008986899629235268, -..."
4,A Long-expected Party,"[0.034196652472019196, -0.021441977471113205, ..."


In [27]:
# Persist texts and embeddings without the (shuffled) index. The embedding
# lists are written as text, so they must be parsed again after reloading.
df.to_csv('words-embedding.csv', index = False)

### Estimating Embedding Costs with tiktoken

In [31]:
# Explicit %pip magic: works even when IPython automagic is disabled and
# installs into the environment of the running kernel.
%pip install tiktoken -q

Note: you may need to restart the kernel to use updated packages.


In [32]:
import tiktoken
import pandas as pd

# Reload the file saved earlier; the token count below only uses its 'text' column.
df = pd.read_csv('words-embedding.csv')

In [35]:
# Tokenize each text with the encoding tiktoken maps to the embedding model
# and add up the per-text token counts.
words = df['text'].tolist()
enc = tiktoken.encoding_for_model('text-embedding-3-small')
total_tokens = sum(len(enc.encode(word)) for word in words)
print(f'Total Tokens:  {total_tokens}')

Total Tokens:  135


In [36]:
# Price in USD per one million tokens.
# NOTE(review): hard-coded value; confirm it against the current price list.
usd_per_million_tokens = 0.02

cost_per_token = usd_per_million_tokens / 1_000_000
estimated_cost = cost_per_token * total_tokens
print(f'Estimated cost in USD: {estimated_cost:.10f}')

Estimated cost in USD: 0.0000027000


### Performing Semantic Searches

In [41]:
import pandas as pd
import numpy as np

# Load the saved texts and embeddings. After the CSV round trip each embedding
# is a string and still has to be converted back to numbers.
df = pd.read_csv('words-embedding.csv')


In [43]:
import ast


def _parse_embedding(value):
    """Convert one stored embedding into a NumPy array.

    Strings are parsed with ast.literal_eval, which accepts only Python
    literals and so cannot run code hidden in the CSV (eval would). Values
    that are not strings are handed to np.array unchanged, so running this
    cell a second time no longer fails.
    """
    if isinstance(value, str):
        value = ast.literal_eval(value)
    return np.array(value)


df['embedding'] = df['embedding'].apply(_parse_embedding)

In [46]:
# Query text for the semantic search. It is embedded with get_embedding (and
# its default model), so the cell defining that function must have run first.
search_term = 'strider'
search_term_vector = get_embedding(search_term)

In [47]:
import numpy as np 

def cosine_similarity(vector_x, vector_y):
    """Return the cosine of the angle between two 1-D vectors.

    Args:
        vector_x: First vector (anything np.array can convert).
        vector_y: Second vector, with the same length as `vector_x`.

    Returns:
        The dot product of the vectors divided by the product of their norms.

    Raises:
        ValueError: If either input is not one-dimensional, if the lengths
            differ, or if either vector has zero magnitude.
    """
    a, b = np.array(vector_x), np.array(vector_y)

    # Guard clauses: reject anything that is not a pair of equal-length 1-D vectors.
    if not (a.ndim == 1 and b.ndim == 1):
        raise ValueError("Vectors must be one-dimensional.")
    if len(a) != len(b):
        raise ValueError("Vectors must be of the same dimensions")

    inner = np.dot(a, b)
    magnitude_a = np.linalg.norm(a)
    magnitude_b = np.linalg.norm(b)

    # A zero-length vector has no direction, so the ratio below would be undefined.
    if not (magnitude_a != 0 and magnitude_b != 0):
        raise ValueError("One of the vectors is zero; cosine similarity is not defined.")

    return inner / (magnitude_a * magnitude_b)

In [48]:
# Score every stored embedding against the query vector (passed as the second
# argument of cosine_similarity), then list the ten highest-scoring rows.
df['similarities'] = df['embedding'].apply(cosine_similarity, args=(search_term_vector,))
df.sort_values('similarities', ascending=False).head(10)

Unnamed: 0,text,embedding,similarities
3,Strider,"[0.02960376814007759, -0.004861738532781601, -...",0.894663
21,Flight to the Ford,"[-0.03537376597523689, -0.018602756783366203, ...",0.255203
4,At the Sign of the Prancing Pony,"[-0.00913687888532877, 0.02207891084253788, 0....",0.253852
10,The Bridge of Khazad-dûm,"[0.011364811100065708, 0.03440088778734207, 0....",0.225724
6,Concerning Hobbits,"[-0.0024255753960460424, -0.003355183871462941...",0.216443
18,The Old Forest,"[0.005150707438588142, 0.0065145897679030895, ...",0.213756
19,The Shadow of the Past,"[0.01627415604889393, -0.002935184631496668, -...",0.213362
24,Fog on the Barrow-downs,"[-0.012674693949520588, -0.0029922001995146275...",0.210501
22,Lothlórien,"[-0.004141297657042742, -0.009118582122027874,...",0.207601
7,Farewell to Lórien,"[0.0322844572365284, -0.027852751314640045, -0...",0.207517


In [49]:
# Look the two texts up by title instead of by row position: the row order of
# the CSV comes from a shuffle, so fixed positions such as 7 and 10 are fragile.
# A missing title raises KeyError. Assumes titles are unique in the file.
embedding_by_title = df.set_index('text')['embedding']
v1 = embedding_by_title['Farewell to Lórien']
v2 = embedding_by_title['The Bridge of Khazad-dûm']

# Element-wise sum of the two embeddings, used as a single combined query vector.
v = v1 + v2

df['similarities'] = df['embedding'].apply(lambda x: cosine_similarity(x, v))
df.sort_values('similarities', ascending=False).head(10)

Unnamed: 0,text,embedding,similarities
10,The Bridge of Khazad-dûm,"[0.011364811100065708, 0.03440088778734207, 0....",0.825474
7,Farewell to Lórien,"[0.0322844572365284, -0.027852751314640045, -0...",0.825474
22,Lothlórien,"[-0.004141297657042742, -0.009118582122027874,...",0.628402
5,The Breaking of the Fellowship,"[0.0021339815575629473, 0.021011510863900185, ...",0.561789
11,The Mirror of Galadriel,"[0.0217824075371027, -0.0005199048900976777, -...",0.543882
6,Concerning Hobbits,"[-0.0024255753960460424, -0.003355183871462941...",0.537397
25,The Council of Elrond,"[0.018467994406819344, -0.03211245685815811, 0...",0.530359
8,In the House of Tom Bombadil,"[0.03586048632860184, -0.008986899629235268, -...",0.507728
18,The Old Forest,"[0.005150707438588142, 0.0065145897679030895, ...",0.438066
17,A Journey in the Dark,"[-0.017823899164795876, 0.006862201262265444, ...",0.416773
