# Open AI Embeddings

In [1]:
from langchain_openai import OpenAIEmbeddings
import os
from dotenv import load_dotenv
load_dotenv()

  from .autonotebook import tqdm as notebook_tqdm


True

In [2]:
os.environ['OPENAI_API_KEY']=os.getenv("OPENAI_API_KEY")

In [3]:
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

In [4]:
embeddings

OpenAIEmbeddings(client=<openai.resources.embeddings.Embeddings object at 0x13572bf10>, async_client=<openai.resources.embeddings.AsyncEmbeddings object at 0x139a73150>, model='text-embedding-3-small', dimensions=None, deployment='text-embedding-ada-002', openai_api_version=None, openai_api_base=None, openai_api_type=None, openai_proxy=None, embedding_ctx_length=8191, openai_api_key=SecretStr('**********'), openai_organization=None, allowed_special=None, disallowed_special=None, chunk_size=1000, max_retries=2, request_timeout=None, headers=None, tiktoken_enabled=True, tiktoken_model_name=None, show_progress_bar=False, model_kwargs={}, skip_empty=False, default_headers=None, default_query=None, retry_min_seconds=4, retry_max_seconds=20, http_client=None, http_async_client=None, check_embedding_ctx_length=True)

In [5]:
# Single Text Embeddings
single_text="Langchain and RAG amazing frameworks and projects to work on"
single_embeddings=embeddings.embed_query(single_text)
print(f"Length of embeddings: {len(single_embeddings)}")

Length of embeddings: 1536


In [6]:
print("Single Text Embeddings:\n")
print(f"Input Text: {single_text}")
print(f"Output vector length: {len(single_embeddings)} dimensions")
print(f"Sample output embeddings: {single_embeddings[:5]}")


Single Text Embeddings:

Input Text: Langchain and RAG amazing frameworks and projects to work on
Output vector length: 1536 dimensions
Sample output embeddings: [-0.0456421822309494, -0.022558461874723434, 0.01890929788351059, 0.0020543818827718496, 0.0226275734603405]


In [8]:
multiple_texts = [
    'Python is a programming language',
    'Langchain is a framework of LLM applications',
    'Embeddings convert text to vectors',
    'Langchain is a framework of LLM applications',
    'Vectors can be compared for similarity'
]
multiple_embeddings = embeddings.embed_documents(multiple_texts)
print("Multiple text embeddings:\n")
print(f"No of documents: {len(multiple_embeddings)}")
for i in range(len(multiple_embeddings)):
    print(f"Input text: {multiple_texts[i]}")
    print(f"Length of embedded vector: {len(multiple_embeddings[i])} dimensions")
    print(f"Output embedded vector sample: {multiple_embeddings[i][:3]}")

Multiple text embeddings:

No of documents: 5
Input text: Python is a programming language
Length of embedded vector: 1536 dimensions
Output embedded vector sample: [-0.011004673317074776, -0.020408110693097115, 0.018817074596881866]
Input text: Langchain is a framework of LLM applications
Length of embedded vector: 1536 dimensions
Output embedded vector sample: [-0.04455633834004402, -0.022108323872089386, 0.049057263880968094]
Input text: Embeddings convert text to vectors
Length of embedded vector: 1536 dimensions
Output embedded vector sample: [-0.027083737775683403, 0.01129983738064766, 0.011602511629462242]
Input text: Langchain is a framework of LLM applications
Length of embedded vector: 1536 dimensions
Output embedded vector sample: [-0.04455633834004402, -0.022108323872089386, 0.049057263880968094]
Input text: Vectors can be compared for similarity
Length of embedded vector: 1536 dimensions
Output embedded vector sample: [-0.036534711718559265, -0.020247383043169975, -0.04649

## Cosine Similarity with OpenAI Embeddings

In [9]:
# Example 1: Finding similar sentences

sentences = [
    "The cat sat on the mat",
    "The feline rested on the rug",
    "They dog played on the yard",
    "I love programming in Python",
    "Python is my favorite programming language"
]

In [10]:
import numpy as np

In [11]:
def cosine_similarity(vec1, vec2):
    """ 
    Cosine similarity measures angle between two vectors.
    - Results closer to 1: Very similar
    - Results closer to 0: Not similar
    - Results close to -1: Opposite meaning
    """

    dot_product = np.dot(vec1, vec2)
    norm_a = np.linalg.norm(vec1)
    norm_b = np.linalg.norm(vec2)

    return dot_product/(norm_a * norm_b)

In [12]:
embeddings=OpenAIEmbeddings(model="text-embedding-3-small")
embeddings

OpenAIEmbeddings(client=<openai.resources.embeddings.Embeddings object at 0x13afed390>, async_client=<openai.resources.embeddings.AsyncEmbeddings object at 0x13b9205d0>, model='text-embedding-3-small', dimensions=None, deployment='text-embedding-ada-002', openai_api_version=None, openai_api_base=None, openai_api_type=None, openai_proxy=None, embedding_ctx_length=8191, openai_api_key=SecretStr('**********'), openai_organization=None, allowed_special=None, disallowed_special=None, chunk_size=1000, max_retries=2, request_timeout=None, headers=None, tiktoken_enabled=True, tiktoken_model_name=None, show_progress_bar=False, model_kwargs={}, skip_empty=False, default_headers=None, default_query=None, retry_min_seconds=4, retry_max_seconds=20, http_client=None, http_async_client=None, check_embedding_ctx_length=True)

In [13]:
sentence_embeddings = embeddings.embed_documents(sentences)
sentence_embeddings

[[-0.03074316307902336,
  -0.04954070597887039,
  -0.005032286047935486,
  -0.0014980505220592022,
  0.036250557750463486,
  -0.0020749696996062994,
  -0.008868717588484287,
  0.027200847864151,
  0.007110487669706345,
  -0.011906835250556469,
  0.04160281643271446,
  -0.0013857370940968394,
  0.0451192781329155,
  0.05274689197540283,
  0.03206183388829231,
  0.03244968131184578,
  -0.012417497113347054,
  0.003046197583898902,
  -0.06603703647851944,
  0.047446344047784805,
  0.025869246572256088,
  -0.04540369659662247,
  -0.003451818600296974,
  0.014621748588979244,
  0.009101424366235733,
  0.01482859905809164,
  -0.011208713985979557,
  -0.012049044482409954,
  0.010762692429125309,
  0.01282473374158144,
  0.012288215570151806,
  -0.036069564521312714,
  -0.02650272659957409,
  -0.04535198211669922,
  -0.034595753997564316,
  0.00479957927018404,
  -0.019844723865389824,
  -0.011745233088731766,
  -0.0420682318508625,
  -0.02290869876742363,
  -0.03637984022498131,
  -0.0050581

In [14]:
## Calculate the similarity between all the pairs

for i in range(len(sentences)):
    for j in range(i+1, len(sentences)):
        similarity=cosine_similarity(sentence_embeddings[i],sentence_embeddings[j])

        print(f"{sentences[i]} vs {sentences[j]}")
        print(f"Similarity: {similarity:.3f}\n")

The cat sat on the mat vs The feline rested on the rug
Similarity: 0.670

The cat sat on the mat vs They dog played on the yard
Similarity: 0.360

The cat sat on the mat vs I love programming in Python
Similarity: 0.090

The cat sat on the mat vs Python is my favorite programming language
Similarity: 0.120

The feline rested on the rug vs They dog played on the yard
Similarity: 0.346

The feline rested on the rug vs I love programming in Python
Similarity: 0.049

The feline rested on the rug vs Python is my favorite programming language
Similarity: 0.093

They dog played on the yard vs I love programming in Python
Similarity: 0.061

They dog played on the yard vs Python is my favorite programming language
Similarity: 0.067

I love programming in Python vs Python is my favorite programming language
Similarity: 0.708



In [15]:
## Example 2: Semantic search - Retrieve similar sentences

documents = [
    "Langchain is a framework for developing applications powered by language models.",
    "Python is a high-level programming language.",
    "Machine learning is a subset of Artificial Intelligence",
    "Embeddings convert text into numerical vectors.",
    "The weather today is sunny and warm"
]
query = "What is langchain?"

In [16]:
def semantic_search(query, documents, embedding_models, top_k=3):
    """ Simple Semantic search implementation. """

    ## Embed query and the documents
    query_embeddings=embedding_models.embed_query(query)
    document_embeddings=embedding_models.embed_documents(documents)

    ## Calculate the similarity score
    
    similarities=[]
    for i, doc_emb in enumerate(document_embeddings):
        similarity=cosine_similarity(query_embeddings, doc_emb)
        similarities.append((similarity,documents[i]))
    
    # Sort by similarity
    similarities.sort(reverse=True)
    return similarities[:top_k]


In [23]:
query="What is Embeddings?"

In [24]:
results=semantic_search(query=query,documents=documents,embedding_models=embeddings, top_k=3)
print(f"Results of the semantic search by order of similarity:\n\n")
print(f"Input Query: {query}\n")
print(f"Rank 1: \nSimilarity_score: {results[0][0]}\nSimilar document:{results[0][1]}\n")
print(f"Rank 2: \nSimilarity_score: {results[1][0]}\nSimilar document:{results[1][1]}\n")
print(f"Rank 3: \nSimilarity_score: {results[2][0]}\nSimilar document:{results[2][1]}\n")

Results of the semantic search by order of similarity:


Input Query: What is Embeddings?

Rank 1: 
Similarity_score: 0.6194201209524666
Similar document:Embeddings convert text into numerical vectors.

Rank 2: 
Similarity_score: 0.24521473062509536
Similar document:Machine learning is a subset of Artificial Intelligence

Rank 3: 
Similarity_score: 0.229172520496685
Similar document:Langchain is a framework for developing applications powered by language models.

