### Embedding Dataset for Similarity Searches

In [None]:
pip install pandas numpy -q

In [None]:
import os 
import openai

from dotenv import load_dotenv, find_dotenv

# Load variables from the .env file located by find_dotenv() into the
# process environment; override=True asks python-dotenv to replace values
# that are already set.
load_dotenv(find_dotenv(), override=True)
# Copy the key onto the openai module's global setting.
# NOTE(review): the OpenAI() client created later presumably reads
# OPENAI_API_KEY from the environment itself — confirm this is still needed.
openai.api_key = os.getenv('OPENAI_API_KEY')
from openai import OpenAI

In [None]:
import pandas as pd
# Read the word list and shuffle all rows in one step: frac=1 samples the
# whole frame in random order. No random_state is given, so the order
# differs from run to run.
df = pd.read_csv('words.csv').sample(frac=1)
df

In [None]:
def get_embedding(text, model = 'text-embedding-3-small'):
    """Return the embedding the OpenAI embeddings endpoint yields for ``text``.

    Args:
        text: String to embed; newline characters are replaced with spaces
            before the request is made.
        model: Name of the embedding model passed to the API.

    Returns:
        The ``embedding`` attribute of the first item in the response data.
    """
    # A new client is constructed on each call; OpenAI() receives no
    # arguments here.
    api_client = OpenAI()
    flattened = text.replace('\n', ' ')
    result = api_client.embeddings.create(input=flattened, model=model)
    return result.data[0].embedding

In [None]:
# Embed each row's text: one get_embedding call per row, using the
# function's default model.
df['embedding'] = df['text'].apply(get_embedding)

In [None]:
# Display the DataFrame, including the 'embedding' column assigned above.
df

In [None]:
# Write the frame to CSV; index=False keeps the row index out of the file.
df.to_csv('words-embedding.csv', index = False)

### Estimating Embedding Costs with tiktoken

In [None]:
pip install tiktoken -q

In [None]:
import tiktoken
import pandas as pd

# Load the words-and-embeddings CSV (the file name matches the one written
# by the to_csv call in the previous section).
df = pd.read_csv('words-embedding.csv')

In [None]:
# Tokenize every word with the encoding tiktoken associates with the
# embedding model and add up the per-word token counts.
words = df['text'].tolist()
enc = tiktoken.encoding_for_model('text-embedding-3-small')
total_tokens = sum(len(enc.encode(word)) for word in words)
print(f'Total Tokens:  {total_tokens}')

In [None]:
# Price used for the estimate: USD 0.02 per one million tokens.
# NOTE(review): confirm this matches the provider's current pricing.
usd_per_million_tokens = 0.02
cost_per_token = usd_per_million_tokens / 1_000_000
estimated_cost = cost_per_token * total_tokens
print(f'Estimated cost in USD: {estimated_cost:.10f}')

### Performing Semantic Searches

In [None]:
import pandas as pd
import numpy as np

# Load the words-and-embeddings CSV; the 'embedding' column arrives as
# text and is parsed in the next cell.
df = pd.read_csv('words-embedding.csv')


In [None]:
import ast

# Each CSV cell presumably holds the text of a Python list of floats.
# ast.literal_eval accepts only Python literals, so a tampered or corrupted
# file cannot execute arbitrary code the way eval() would. The parsed lists
# are then converted to NumPy arrays for vector arithmetic.
df['embedding'] = df['embedding'].apply(ast.literal_eval).apply(np.array)

In [None]:
# Query text for the semantic search.
search_term = 'strider'
# Embed the query with get_embedding (default model) so it can be compared
# against the stored embeddings.
search_term_vector = get_embedding(search_term)

In [None]:
import numpy as np 

def cosine_similarity(vector_x, vector_y):
    """Return the cosine of the angle between two one-dimensional vectors.

    Args:
        vector_x: First vector; anything ``np.array`` can convert.
        vector_y: Second vector, with the same length as ``vector_x``.

    Returns:
        The dot product of the vectors divided by the product of their
        Euclidean norms.

    Raises:
        ValueError: If either input is not one-dimensional, if the lengths
            differ, or if either vector has zero magnitude.
    """
    a, b = np.array(vector_x), np.array(vector_y)

    # Reject scalars, matrices and anything else that is not a flat vector.
    if not (a.ndim == 1 and b.ndim == 1):
        raise ValueError("Vectors must be one-dimensional.")

    if len(a) != len(b):
        raise ValueError("Vectors must be of the same dimensions")

    numerator = np.dot(a, b)
    magnitude_a = np.linalg.norm(a)
    magnitude_b = np.linalg.norm(b)

    # A zero vector has no direction, so the ratio below would be 0/0.
    if 0 in (magnitude_a, magnitude_b):
        raise ValueError("One of the vectors is zero; cosine similarity is not defined.")

    return numerator / (magnitude_a * magnitude_b)

In [None]:
# Score every stored embedding against the query vector, then show the ten
# rows with the highest cosine similarity.
df['similarities'] = df['embedding'].apply(
    lambda emb: cosine_similarity(emb, search_term_vector)
)
df.sort_values(by='similarities', ascending=False).head(10)

In [None]:
# Take two embeddings by row position (7 and 10) and add them element-wise;
# which words these are depends on the row order of the loaded CSV.
v1, v2 = (df['embedding'].iloc[position] for position in (7, 10))
v = v1 + v2

# Re-score every row against the combined vector and list the ten closest.
df['similarities'] = df['embedding'].apply(
    lambda emb: cosine_similarity(emb, v)
)
df.sort_values(by='similarities', ascending=False).head(10)