In [1]:
from dotenv import load_dotenv
from langchain_openai import OpenAIEmbeddings
import os
from pprint import pprint

  from .autonotebook import tqdm as notebook_tqdm


# Embedding models


### OpenAI Embedding Models

OpenAI offers two powerful third-generation embedding models (denoted by `-3` in the model ID). Read the embedding v3 announcement blog post for more details.

Usage is priced per input token. The table below expresses the pricing as pages of text per US dollar (assuming ~800 tokens per page):

| Model | ~ Pages per dollar | Performance on MTEB eval | Max input |
| --- | --- | --- | --- |
| text-embedding-3-small | 62,500 | 62.3% | 8192 |
| text-embedding-3-large | 9,615 | 64.6% | 8192 |
| text-embedding-ada-002 | 12,500 | 61.0% | 8192 |

> https://platform.openai.com/docs/guides/embeddings?lang=python


In [None]:
# Load variables from a local .env file (if any) into the environment,
# then look up the OpenAI key; the result is None when the variable is unset.
load_dotenv()
openai_api_key = os.environ.get("OPENAI_API_KEY")

In [None]:
# OpenAI embedding client using the larger third-generation model.
# The key is the value read from the environment in the previous cell.
embeddings = OpenAIEmbeddings(
    model="text-embedding-3-large",
    openai_api_key=openai_api_key,
)

In [None]:
# Embed a single query string with the OpenAI client.
# Only the dimensionality and a short preview are printed: dumping the whole
# vector floods the cell output without adding information.
query = "What is the capital of France?"
vector = embeddings.embed_query(query)
print(f"Embedding vector for the query '{query}':")
print(f"  dimensions: {len(vector)}")
print(f"  first 5 values: {vector[:5]}")

- ##### embed_documents
- ##### embed_query


In [2]:
from langchain_community.document_loaders import TextLoader

# Load the speech transcript as a list of LangChain documents.
# The encoding is pinned because the text contains non-ASCII punctuation
# (e.g. an em dash); without it the file is decoded with the platform default,
# which can fail or garble the text on some systems.
# NOTE(review): assumes speech.txt is saved as UTF-8 — confirm.
textloader = TextLoader("../1.data-ingestion/speech.txt", encoding="utf-8")
speech = textloader.load()

In [3]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Split the loaded speech into small overlapping chunks
# (chunk_size=200, chunk_overlap=15) so each piece can be embedded on its own.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=200,
    chunk_overlap=15,
)
speech_chunks = splitter.split_documents(speech)

# Report how many chunks were produced, then display them as the cell result.
print(f"Number of chunks created: {len(speech_chunks)}")
speech_chunks

Number of chunks created: 23


[Document(metadata={'source': '../1.data-ingestion/speech.txt'}, page_content='The world must be made safe for democracy. Its peace must be planted upon the tested foundations of political liberty. We have no selfish ends to serve. We desire no conquest, no dominion. We seek no'),
 Document(metadata={'source': '../1.data-ingestion/speech.txt'}, page_content='We seek no indemnities for ourselves, no material compensation for the sacrifices we shall freely make. We are but one of the champions of the rights of mankind. We shall be satisfied when those'),
 Document(metadata={'source': '../1.data-ingestion/speech.txt'}, page_content='when those rights have been made as secure as the faith and the freedom of nations can make them.'),
 Document(metadata={'source': '../1.data-ingestion/speech.txt'}, page_content='Just because we fight without rancor and without selfish object, seeking nothing for ourselves but what we shall wish to share with all free peoples, we shall, I feel confident, cond

# Vector Store and Embedding at Once


In [None]:
from langchain_community.vectorstores import Chroma

# Embed every speech chunk with the OpenAI model and index the vectors in Chroma.
# A dedicated collection name is used on purpose: without one, every
# Chroma.from_documents call in this kernel falls back to the same default
# collection, so vectors of a different size from another embedding model
# would be mixed into (or rejected by) this store.
db = Chroma.from_documents(
    speech_chunks,
    embeddings,
    collection_name="speech_openai",
)

In [None]:
# Look up the chunks closest to the phrase in the OpenAI-backed store
# (number of results left at the library default) and pretty-print them.
retrieved_docs = db.similarity_search(
    "the sincere friends of the German people"
)
pprint(retrieved_docs)

# Ollama Embeddings

> https://ollama.com/blog/embedding-models


In [19]:
from langchain_ollama import OllamaEmbeddings
from langchain_community.vectorstores import Chroma

In [20]:
# Embedding client for the "nomic-embed-text" model served through Ollama.
# NOTE(review): presumably requires a running Ollama server with this model pulled — confirm.
ollama_embeddings = OllamaEmbeddings(model="nomic-embed-text")
# Alternative model, left disabled:
# ollama_embeddings = OllamaEmbeddings(model="gemma3:4b")

In [21]:
# Embed two short sentences in a single call.
# NOTE: despite its name, `documents` holds the embedding vectors returned by
# embed_documents (one list of floats per sentence), not the input texts.
documents = ollama_embeddings.embed_documents(
    [
        "Alpha is the first letter of Greek alphabet",
        "Beta is the second letter of Greek alphabet",
    ]
)

In [22]:
# Display the raw embedding vectors (a list of float lists, one per input sentence).
documents

[[-0.022565603,
  0.013706392,
  -0.16006948,
  -0.04226057,
  0.07940034,
  -0.012049906,
  0.0036889291,
  -0.0001413232,
  0.011717896,
  -0.03169196,
  -0.022714665,
  0.085250705,
  0.07700687,
  0.04075436,
  0.017393699,
  -0.049718026,
  0.025569499,
  -0.05285225,
  -0.042207878,
  0.015210288,
  -0.028468713,
  0.04482908,
  -0.029748207,
  -0.03535661,
  0.09837465,
  -0.016478783,
  0.036996312,
  -0.0003552263,
  -0.038904846,
  -5.7416684e-05,
  -0.0059576617,
  -0.025867267,
  -0.0033541922,
  0.005555574,
  -0.0052006035,
  -0.0058970996,
  0.021039085,
  0.03755371,
  0.04935061,
  -0.010333177,
  -0.0076622674,
  0.010094551,
  0.01737937,
  0.00061274227,
  0.057925794,
  0.0035323934,
  -0.040377636,
  0.058662217,
  0.09804944,
  0.0064077172,
  0.037057437,
  0.04168009,
  -0.016116982,
  -0.005298872,
  0.0007862489,
  0.10316209,
  0.014886493,
  0.0015968962,
  -0.02499082,
  0.06041779,
  0.019326279,
  0.08412897,
  -0.07810975,
  0.041015893,
  0.027747309,


In [23]:
# Length (dimensionality) of the first embedding vector.
len(documents[0])

768

In [27]:
# Embed a single Indonesian sentence with embed_query; the result is one
# flat vector rather than a list of vectors.
query_results = ollama_embeddings.embed_query(
    "saya adalah anak gembala yang selalu riang serta gembira karena aku senang bekerja dan makan tak pernah sedikit"
)

In [28]:
# Display the raw query embedding (a flat list of floats).
query_results

[0.038389686,
 0.0042421464,
 -0.13815169,
 0.005463201,
 -0.0064131464,
 0.007269087,
 -0.0036316605,
 -0.025000041,
 -0.011193109,
 0.0027334746,
 -0.012852426,
 -0.00089877064,
 0.078378476,
 0.03386163,
 -0.02429058,
 -0.032257643,
 0.012972984,
 -0.07024034,
 -0.081041396,
 0.07162748,
 -0.05529839,
 0.019793026,
 -0.021556864,
 -0.023639454,
 0.08441606,
 0.0037857427,
 0.03203691,
 0.07412808,
 -0.023439761,
 -0.050512075,
 0.0007372824,
 -0.061064444,
 -0.01860668,
 0.0020025142,
 0.016469302,
 -0.046845898,
 0.079789326,
 0.036880806,
 0.025863053,
 -0.04271145,
 -0.015740832,
 -0.035736904,
 -6.0476377e-05,
 -0.024524841,
 0.016809296,
 -0.019997988,
 0.080103695,
 0.07109882,
 0.050101608,
 0.00049977645,
 -0.033925142,
 -0.021421501,
 0.028117158,
 -0.058091756,
 0.028696753,
 0.065874785,
 0.032130364,
 0.00021451691,
 -0.01978778,
 0.009981377,
 0.043685786,
 0.045634415,
 -0.08425921,
 0.056090545,
 0.02513051,
 -0.0534022,
 0.010980568,
 -0.00851135,
 0.02515549,
 -0.03

In [30]:
# Length (dimensionality) of the query embedding.
len(query_results)

768

In [31]:
# Embed every speech chunk with the Ollama model and index the vectors in Chroma.
# A dedicated collection name is used on purpose: without one, every
# Chroma.from_documents call in this kernel falls back to the same default
# collection, so these vectors could collide with vectors of a different size
# produced by another embedding model (dimension-mismatch errors on Run All).
db_ollama = Chroma.from_documents(
    speech_chunks,
    ollama_embeddings,
    collection_name="speech_ollama",
)

In [34]:
# Retrieve the chunks most similar to the phrase from the Ollama-backed store
# (number of results left at the library default) and pretty-print them.
ollama_retrieved = db_ollama.similarity_search(
    "the sincere friends of the German people"
)
pprint(ollama_retrieved)

[Document(metadata={'source': '../1.data-ingestion/speech.txt'}, page_content='amuck. We are, let me say again, the sincere friends of the German people, and shall desire nothing so much as the early reestablishment of intimate relations of mutual advantage between us—however'),
 Document(metadata={'source': '../1.data-ingestion/speech.txt'}, page_content='We shall, happily, still have an opportunity to prove that friendship in our daily attitude and actions toward the millions of men and women of German birth and native sympathy who live among us and'),
 Document(metadata={'source': '../1.data-ingestion/speech.txt'}, page_content='…'),
 Document(metadata={'source': '../1.data-ingestion/speech.txt'}, page_content='true and loyal Americans as if they had never known any other fealty or allegiance. They will be prompt to stand with us in rebuking and restraining the few who may be of a different mind and')]


# Hugging Face Embeddings


In [35]:
from langchain_huggingface import HuggingFaceEmbeddings

# Embedding client for the sentence-transformers MiniLM model.
# NOTE: this rebinds `embeddings`, which earlier in the notebook referred to the
# OpenAIEmbeddings instance; any earlier cell re-run after this one will pick up
# this model instead.
embeddings = HuggingFaceEmbeddings(model="sentence-transformers/all-MiniLM-L6-v2")

In [36]:
# --- Single text: embed it, print the vector length, then the vector itself.
teks = "This is a sample text for generating embeddings."
vector = embeddings.embed_query(teks)
print(len(vector))
print(f"Embedding vector for the text '{teks}':\n{vector}")

# Visual separator between the two demonstrations.
print("\n")

# --- Batch of texts: embed them together, print how many vectors came back,
# then the vectors themselves.
dokumen = [
    "This is the first document.",
    "This is the second document.",
]
vectors = embeddings.embed_documents(dokumen)
print(len(vectors))
print(f"Embedding vectors for the documents '{dokumen}':\n{vectors}")

384
Embedding vector for the text 'This is a sample text for generating embeddings.':
[-0.040227100253105164, -0.019801940768957138, 0.0422239676117897, 0.02319173701107502, 0.03999132290482521, 0.05057329684495926, 0.001347951008938253, -0.002485523698851466, 0.023616671562194824, -0.03181460127234459, 0.04465523362159729, 0.0017667444190010428, 0.05736272409558296, -0.03516148403286934, -0.04150361567735672, 0.07837842404842377, 0.09517353773117065, 0.004269008059054613, -0.0053900303319096565, -0.025717392563819885, 0.04107312485575676, 0.0670120120048523, 0.09535247832536697, -0.055133841931819916, 0.013838634826242924, -0.04405195638537407, -0.027230676263570786, 0.09569345414638519, 0.11855163425207138, 0.003672579536214471, 0.06449505686759949, 0.011702905409038067, 0.053681936115026474, 0.029742984101176262, 0.058643072843551636, 0.07695060223340988, -0.020074279978871346, 0.08657673746347427, 0.014159399084746838, 0.062282219529151917, 0.016732867807149887, -0.0230777468532323