In [2]:
import os
from dotenv import load_dotenv

load_dotenv()

# load_dotenv() copies the entries of a local .env file into os.environ, and
# os.getenv() reads from os.environ, so writing the value straight back is a
# no-op when the key exists and an opaque "TypeError: str expected, not
# NoneType" when it does not. Fail early with an actionable message instead.
if not os.getenv("OPENAI_API_KEY"):
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Add it to your .env file or export it "
        "in your shell before running this notebook."
    )

In [3]:
## Import the OpenAI embeddings wrapper from LangChain
from langchain_openai import OpenAIEmbeddings

# Shared embedding client configured for the "text-embedding-3-small" model.
# Later cells call embed_query / embed_documents on this object.
# NOTE(review): presumably it picks up OPENAI_API_KEY from the environment
# prepared in the first cell — confirm against the langchain_openai docs.
embeddings=OpenAIEmbeddings(model="text-embedding-3-small")

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
## Embed one query string and inspect the resulting vector
single_text = "Langchain and Rag are amazing frameworks and projects to work on"
single_embeddings = embeddings.embed_query(single_text)

# Header, then the vector length and the full vector, one item per line.
# NOTE(review): the second item prints every component of the vector; the
# "Sample values" line below already shows the first five.
print("📝 Single Text Embedding:")
print(len(single_embeddings), single_embeddings, sep="\n")

# Human-readable recap of the input and the shape of the output.
print(
    f"Input: {single_text}",
    f"Output: Vector of {len(single_embeddings)} dimensions",
    f"Sample values: {single_embeddings[:5]}",
    sep="\n",
)

📝 Single Text Embedding:
1536
[-0.050007786601781845, -0.031081510707736015, -0.0034399048890918493, -0.003286944702267647, 0.03265869989991188, -0.03132624924182892, -0.014466634951531887, 0.001493061427026987, -0.01057804748415947, -0.033909574151039124, 0.018178468570113182, 0.004568411037325859, -0.03817886486649513, 0.05003497749567032, 0.005989241413772106, 0.01439865306019783, -8.14194354461506e-05, -0.06227179616689682, 0.04057184234261513, 0.06520862877368927, -0.001464169006794691, -0.006070820149034262, -0.018246451392769814, 0.03295782208442688, -0.006907002534717321, -0.008334631100296974, -0.00758002744987607, 0.06330512464046478, 0.01789294369518757, -0.019796447828412056, 8.497788803651929e-05, -0.033936768770217896, -0.006420928984880447, 0.033719226717948914, 0.014942510984838009, 0.021835917606949806, -0.00574450520798564, 0.004095933865755796, -0.008096693083643913, 0.022556530311703682, 0.009626294486224651, 0.03804289922118187, 0.004422249272465706, -0.00751884328

In [6]:
# Example 2: Multiple texts at once
# Four short sentences that the next cell embeds together with a single
# embed_documents call.
multiple_texts = [
    "Python is a programming language",
    "LangChain is a framework for LLM applications",
    "Embeddings convert text to numbers",
    "Vectors can be compared for similarity"
]

In [9]:
# Embed the whole batch with a single call.
multiple_embeddings = embeddings.embed_documents(multiple_texts)

# Report how many texts went in, how many vectors came back, and the length
# of the first vector.
print(
    "\n📚 Multiple Text Embeddings:",
    f"Number of texts: {len(multiple_texts)}",
    f"Number of embeddings: {len(multiple_embeddings)}",
    f"Each embedding size: {len(multiple_embeddings[0])}",
    sep="\n",
)


📚 Multiple Text Embeddings:
Number of texts: 4
Number of embeddings: 4
Each embedding size: 1536


In [11]:
# Different OpenAI embedding models

# Reference table for the models compared in this notebook, one row per model:
# (model name, dimensions, description, cost per 1M tokens, use case).
# The rows are expanded into a dict keyed by model name.
models_comparison = {
    name: {
        "dimensions": dimensions,
        "description": description,
        "cost_per_1m_tokens": cost,
        "use_case": use_case,
    }
    for name, dimensions, description, cost, use_case in (
        (
            "text-embedding-3-small",
            1536,
            "Good balance of performance and cost",
            0.02,
            "General purpose, cost-effective",
        ),
        (
            "text-embedding-3-large",
            3072,
            "Highest quality embeddings",
            0.13,
            "When accuracy is critical",
        ),
        (
            "text-embedding-ada-002",
            1536,
            "Previous generation model",
            0.10,
            "Legacy applications",
        ),
    )
}

# Display comparison: one print call per model, one field per line, and a
# blank line after each model.
print("📊 OpenAI Embedding Models Comparison:\n")
for model_name, details in models_comparison.items():
    print(
        f"Model: {model_name}",
        f"  📏 Dimensions: {details['dimensions']}",
        f"  💰 Cost: ${details['cost_per_1m_tokens']}/1M tokens",
        f"  📝 Description: {details['description']}",
        f"  🎯 Use case: {details['use_case']}\n",
        sep="\n",
    )

📊 OpenAI Embedding Models Comparison:

Model: text-embedding-3-small
  📏 Dimensions: 1536
  💰 Cost: $0.02/1M tokens
  📝 Description: Good balance of performance and cost
  🎯 Use case: General purpose, cost-effective

Model: text-embedding-3-large
  📏 Dimensions: 3072
  💰 Cost: $0.13/1M tokens
  📝 Description: Highest quality embeddings
  🎯 Use case: When accuracy is critical

Model: text-embedding-ada-002
  📏 Dimensions: 1536
  💰 Cost: $0.1/1M tokens
  📝 Description: Previous generation model
  🎯 Use case: Legacy applications



In [13]:
### Cosine Similarity With OpenAI Embeddings

import numpy as np
def cosine_similarity(vec1, vec2):
    """
    Cosine similarity measures the angle between two vectors.
    - Result close to 1: Very similar
    - Result close to 0: Not related
    - Result close to -1: Opposite meanings

    Args:
        vec1: First vector (a 1-D sequence or array of numbers).
        vec2: Second vector, with the same length as ``vec1``.

    Returns:
        dot(vec1, vec2) / (|vec1| * |vec2|). When either vector has norm 0
        the angle is undefined; 0.0 is returned in that case instead of the
        ``nan`` (and RuntimeWarning) that the bare division would produce.
    """

    dot_product = np.dot(vec1, vec2)
    denominator = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    if denominator == 0:
        # An all-zero vector has no direction; report it as "not related".
        return np.float64(0.0)
    return dot_product / denominator

In [14]:
# Example 1: Finding similar sentences
# The list mixes paraphrases (cat/feline, and two sentences about Python)
# with a sentence on a different subject (dog), so the pairwise scores
# computed below can be compared against each other.
sentences = [
    "The cat sat on the mat",
    "A feline rested on the rug",
    "The dog played in the yard",
    "I love programming in Python",
    "Python is my favorite programming language"
]

In [15]:
# Embed all sentences in one batch; the pairwise loop below uses
# sentence_embeddings[i] as the vector for sentences[i].
sentence_embeddings=embeddings.embed_documents(sentences)

In [16]:
## Calculate the similarity between all pairs

# Visit every unordered pair (i, j) with i < j exactly once.
for i in range(len(sentences)):
    for j in range(i + 1, len(sentences)):
        similarity = cosine_similarity(sentence_embeddings[i], sentence_embeddings[j])

        # Two output lines per pair, followed by a blank separator line.
        print(
            f"'{sentences[i]}' vs '{sentences[j]}'",
            f"Similarity: {similarity:.3f}\n",
            sep="\n",
        )


'The cat sat on the mat' vs 'A feline rested on the rug'
Similarity: 0.655

'The cat sat on the mat' vs 'The dog played in the yard'
Similarity: 0.324

'The cat sat on the mat' vs 'I love programming in Python'
Similarity: 0.089

'The cat sat on the mat' vs 'Python is my favorite programming language'
Similarity: 0.120

'A feline rested on the rug' vs 'The dog played in the yard'
Similarity: 0.296

'A feline rested on the rug' vs 'I love programming in Python'
Similarity: 0.055

'A feline rested on the rug' vs 'Python is my favorite programming language'
Similarity: 0.103

'The dog played in the yard' vs 'I love programming in Python'
Similarity: 0.126

'The dog played in the yard' vs 'Python is my favorite programming language'
Similarity: 0.085

'I love programming in Python' vs 'Python is my favorite programming language'
Similarity: 0.708



In [17]:
### Example - Semantic Search - Retrieve the most similar sentences
def semantic_search(query,documents,embeddings_models,top_k=3):
    """Rank ``documents`` by cosine similarity to ``query``.

    Args:
        query: Text to search for.
        documents: Iterable of documents to rank. Each one is passed to
            ``embeddings_models.embed_documents`` and returned unchanged.
        embeddings_models: Object exposing ``embed_query(text)`` and
            ``embed_documents(list_of_texts)``.
        top_k: Maximum number of results to return. ``None`` returns every
            document; zero or a negative value returns an empty list.

    Returns:
        A list of ``(similarity, document)`` tuples, highest similarity
        first, at most ``top_k`` long. Documents with equal scores keep
        their input order.
    """

    documents = list(documents)

    # Nothing to rank or nothing requested: skip the embedding calls entirely.
    if not documents or (top_k is not None and top_k <= 0):
        return []

    ## Embed the query and the documents
    query_embedding = embeddings_models.embed_query(query)
    doc_embeddings = embeddings_models.embed_documents(documents)

    ## Calculate the similarity score of each document against the query
    similarities = [
        (cosine_similarity(query_embedding, doc_emb), doc)
        for doc, doc_emb in zip(documents, doc_embeddings)
    ]

    ## Sort by similarity only. Sorting the whole tuple would fall back to
    ## comparing the documents themselves on a score tie, which raises
    ## TypeError for documents that are not orderable.
    similarities.sort(key=lambda scored: scored[0], reverse=True)
    return similarities[:top_k]

In [18]:
# Test semantic search
# Small corpus spanning several topics, including one sentence about the
# weather, plus the first query to run against it.
documents = [
    "LangChain is a framework for developing applications powered by language models",
    "Python is a high-level programming language",
    "Machine learning is a subset of artificial intelligence",
    "Embeddings convert text into numerical vectors",
    "The weather today is sunny and warm"
]
query="What is Langchain?"

In [19]:
# Rank the corpus against the query; top_k is left at its default of 3.
results=semantic_search(query,documents,embeddings)

In [20]:
# Collect the header and one line per (score, document) result, then emit
# the whole report with a single print call.
report_lines = [f"\n🔎 Semantic Search Results for: '{query}'"]
for score, doc in results:
    report_lines.append(f"Score: {score:.3f} | {doc}")
print("\n".join(report_lines))


🔎 Semantic Search Results for: 'What is Langchain?'
Score: 0.676 | LangChain is a framework for developing applications powered by language models
Score: 0.130 | Python is a high-level programming language
Score: 0.101 | Embeddings convert text into numerical vectors


In [21]:
# Second query against the same corpus. This rebinds the `query` and
# `results` names used by the previous cells.
query="What is Embeddings?"
results=semantic_search(query,documents,embeddings)
# Bare last expression: the notebook displays the (score, document) tuples.
results

[(np.float64(0.6227387139613365),
  'Embeddings convert text into numerical vectors'),
 (np.float64(0.25206899523723963),
  'Machine learning is a subset of artificial intelligence'),
 (np.float64(0.2291701911027054),
  'LangChain is a framework for developing applications powered by language models')]

| **Aspect**               | **Semantic Search**                                                      | **Cosine Similarity**                                |
| ------------------------ | ------------------------------------------------------------------------ | ---------------------------------------------------- |
| **Definition**           | Method to **find meaning-based matches**                                 | Formula to **measure closeness** between two vectors |
| **Scope**                | Broad concept involving embeddings, ranking, context understanding       | Narrow concept, just a similarity metric             |
| **Uses Embeddings?**     | ✅ Yes, always                                                            | ✅ Works **on** embeddings                            |
| **Understands Meaning?** | ✅ Yes (via embeddings)                                                   | ❌ No, it only compares numbers                       |
| **Output**               | A **list of ranked documents**                                           | A **single numeric score**                           |
| **Example**              | “Best phones 2025” → fetches articles about “Top Androids” & “iPhone 16” | Returns **0.95** similarity score between vectors    |
