# Embeddings demo for the Data & AI Training day, November 2024

# Import libraries, mount google drive, cofnigure the api key

In [61]:
# import libraries
# for loading the parameters:
import os
from dotenv import load_dotenv

# for parsing API requests and responses:
import json

# for vector similarity metrics:
import numpy as np
import scipy

# for printing tables:
from tabulate import tabulate
# for HTML formatting in print():
from IPython.display import display, HTML

In [20]:
#############################################################################################################################
# Note: Because this script was written before openai 1.0.0, use the openai library 0.28.1.                                 #
#       Alternatively, use openai migrate script to update the code to use hte latest OpenAI library                        #
#        in which case refer to https://github.com/openai/openai-python/discussions/742 for the code upgrade instructions   #
#############################################################################################################################
%pip install openai==0.28.1 -q 
import openai
print (f'Openai version:{openai.__version__}')

Note: you may need to restart the kernel to use updated packages.
Openai version:0.28.1


In [72]:
# Load the environment variables from .env file in the notebook home folder and assigne the environment variables
# in this case .env file contains the key generated at https://platform.openai.com/api-keys as a single row OPENAI_API_KEY=sk-...
dotenv_path = r'C:\Users\sergey.lyubarskiy\OneDrive - Accenture\jupyter_notebooks'
load_dotenv(dotenv_path)

# Use os.environ to access environment variables
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY')

# Eye-ball if the key seem to have loaded correctly
print(f'OpenAI API Key loaded: {OPENAI_API_KEY[:7]}...{OPENAI_API_KEY[-4:]}')  

# Now initialise openai library with your key
openai.api_key = OPENAI_API_KEY

OpenAI API Key loaded: sk-9e80...Oz50


# Create embedding and calculate the distance between them

In [37]:
# Define input texts and get their embeddings

input_texts = ['Cat is chasing a mouse','Kitten is catching rodents', 'Cats purr when happy']
model_name = 'text-embedding-ada-002'

# Call OpenAI to get embeddings for the input list of text strings

# Docs: https://platform.openai.com/docs/api-reference/embeddings/create
embeddings = openai.Embedding.create(input=input_texts, model=model_name)

# Output object will have the keys 'object', 'data', 'model', 'usage', where the len(data) = len(input)

# check the tokens used by the embeddings creation request
print(f'tokens usage: {embeddings.usage}')

# demonstrate that check the embeddings length is 1536
print('Note a list of 3 objectcts returned, each containing a 1536-dimentional vector (as a list of float) ')
for i in range(len(embeddings.data)):
    # print first 3 and last 3 values of 1536 in each embedding
    print(f'Embedding {i}: {embeddings.data[i].embedding[:3]} ... ({len(embeddings.data[i].embedding)} values) ... {embeddings.data[i].embedding[-3:]}')


tokens usage: {
  "prompt_tokens": 16,
  "total_tokens": 16
}
Note a list of 3 objectcts returned, each containing a 1536-dimentional vector (as a list of float) 
Embedding 0: [-0.01904132030904293, 0.003049201099202037, -0.008756048046052456] ... (1536 values) ... [-0.03105313889682293, -0.00011234101839363575, -0.016920138150453568]
Embedding 1: [-0.003592884400859475, 0.014437522739171982, 0.008472477085888386] ... (1536 values) ... [-0.013434549793601036, 0.0011926792794838548, -0.019096065312623978]
Embedding 2: [0.012431960552930832, -0.00036493566585704684, 0.015292245894670486] ... (1536 values) ... [-0.0022573822643607855, 0.013759282417595387, -0.014868499711155891]


In [60]:
# For illustration print cosine and euclidean distances between embeddings.
# note that distane to itself is minimum for identical sentence
# note that either metric finds the closest sentences are:
#    'Cat is chasing a mouse' and 'Kitten is catching rodents' 

# Calculate distances and store in a list
table_data = []
for i in range(len(embeddings.data)):
    for j in range(i, len(embeddings.data)):
        cosine_distance = scipy.spatial.distance.cosine(embeddings.data[i].embedding, embeddings.data[j].embedding)
        euclidean_distance = scipy.spatial.distance.euclidean(embeddings.data[i].embedding, embeddings.data[j].embedding)
        table_data.append([input_texts[i], input_texts[j], f"{cosine_distance:.4f}", f"{euclidean_distance:.4f}"])


# pring, using HTML tags <B> for bold and <BR> for new line
display(HTML('''
The cosine similarity measures the cosine of the angle between two vectors, yielding a value between -1 and 1, where <B> 1 indicates identical vectors</B>. 
<BR>However <B>scipy</B>.spatial.distance.cosine function <B>outputs = 1 - cosine_similarity</B>.
<BR>Thus, with <B>scipy</B> the cosine distance ranges from 0 to 2, where <B>0 indicates identical</B> vectors. 
<BR>This means, for identical vectors, the scipy.spatial.distance.cosine function will return 0, indicating no distance (or perfect similarity).'''
))
# Print as a table with headers
headers = ["Text 1", "Text 2", "Cosine Distance", "Euclidean Distance"]
print(tabulate(table_data, headers=headers, tablefmt="grid"))


+----------------------------+----------------------------+-------------------+----------------------+
| Text 1                     | Text 2                     |   Cosine Distance |   Euclidean Distance |
| Cat is chasing a mouse     | Cat is chasing a mouse     |            0      |               0      |
+----------------------------+----------------------------+-------------------+----------------------+
| Cat is chasing a mouse     | Kitten is catching rodents |            0.0754 |               0.3884 |
+----------------------------+----------------------------+-------------------+----------------------+
| Cat is chasing a mouse     | Cats purr when happy       |            0.1518 |               0.551  |
+----------------------------+----------------------------+-------------------+----------------------+
| Kitten is catching rodents | Kitten is catching rodents |            0      |               0      |
+----------------------------+----------------------------+--------------

# Create and search a vector database

### Generate embeddings, initialise in-memory vector db and load the sentences,  initialise the index and load embeddings into the inedex 

In [41]:
# Sometimes faiss fails, but impirically found that installing mkl before faiss 
%pip install mkl -q
%pip install faiss-cpu -q
# import mkl # importing mkl is not required, just installing it before faiss may help to avoid errors
import faiss

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [42]:
# Now let's load the embeddings into a vector databse and search for the "neighbors" of the embedding included in the  search query
# Generate embeddings (again to for illustrative purposes, so that the whole loading is self-contained)
input_texts = ['Cat is chasing a mouse','Kitten is catching rodents', 'Cats purr when happy']
model_name = 'text-embedding-ada-002'

# Get embeddings for a list of text strings
embeddings = openai.Embedding.create(input=input_texts, model=model_name)

# create a numpy array of embeddings
embeddings_list = []

for emb in embeddings.data:
  embeddings_list.append(emb.embedding)
embeddings_np = np.array(embeddings_list)


# Create a FAISS index for cosine similarity
cossim_idx = faiss.IndexFlatIP(1536)
cossim_idx.add (embeddings_np)

In [65]:
headers = ["Query Text", "Database Index", "Database Text", "Cosine Distance"]

# create a list of queries
query_texts = ['Cat is chasing a mouse', 'Dogs bark when angry']

def get_embedding(input_text, model_name):
    """Get embedding for a single text using a specified model."""
    return openai.Embedding.create(input=input_text, model=model_name).data[0].embedding

def search_in_index(index, query_embedding):
    """Search a query embedding in the FAISS index."""
    query_emb_arr = np.array(query_embedding).reshape(1, -1)
    return index.search(query_emb_arr, 3)

# Iteratively search database for each query and print the results
display(HTML('''
<P>The <B> cosine </B> similarity measures the cosine of the angle between two vectors, yielding a value between -1 and 1, where <B> 1 indicates identical vectors</B>. 
<BR> <B>However </B>, FAISS's index.search function, when using a cosine similarity metric, actually returns the similarity score, not the distance. 
<BR> In many implementations, including FAISS, the similarity score is often scaled or shifted so that higher values indicate more similarity. 
<BR> Specifically, FAISS computes the inner product (not exactly the cosine similarity) when configured for maximum inner product search (MIPS). 
<BR> Threfore, with <B>FAISS perfect similarity = 1 or higher </B> (depending on the scaling)
'''
))
for query_text in query_texts:
    # EMBED the query
    query_embedding = get_embedding(query_text, model_name) # get the query's embedding for OpenAI

    # SEARCH database for the query string, returning the matches and the corresponding distance 
    similarities, indices = search_in_index(cossim_idx, query_embedding) #search FAISS index for query's embedding

    # PRINT search results as a table
    table_data = []
    for i in range(len(indices[0])):
        table_data.append([query_text, f"{indices[0][i]:.4f}", input_texts[i], f"{similarities[0][i]:.4f}"])
    print(tabulate(table_data, headers=headers, tablefmt="grid"))

+------------------------+------------------+----------------------------+-------------------+
| Query Text             |   Database Index | Database Text              |   Cosine Distance |
| Cat is chasing a mouse |                0 | Cat is chasing a mouse     |            1      |
+------------------------+------------------+----------------------------+-------------------+
| Cat is chasing a mouse |                1 | Kitten is catching rodents |            0.9246 |
+------------------------+------------------+----------------------------+-------------------+
| Cat is chasing a mouse |                2 | Cats purr when happy       |            0.8482 |
+------------------------+------------------+----------------------------+-------------------+
+----------------------+------------------+----------------------------+-------------------+
| Query Text           |   Database Index | Database Text              |   Cosine Distance |
| Dogs bark when angry |                2 | Cat is cha

# Backup

In [66]:
# placeholder