In [21]:
from dotenv import load_dotenv
from langchain_openai import OpenAIEmbeddings
import os
from pprint import pprint

# Embedding models


### OpenAI Embedding Models

OpenAI offers two powerful third-generation embedding models (denoted by `-3` in the model ID). Read the embedding v3 announcement blog post for more details.

Usage is priced per input token. The table below expresses pricing as pages of text per US dollar (assuming ~800 tokens per page):
| Model | ~ Pages per dollar | Performance on MTEB eval | Max input |
|---|---|---|---|
| text-embedding-3-small | 62,500 | 62.3% | 8192 |
| text-embedding-3-large | 9,615 | 64.6% | 8192 |
| text-embedding-ada-002 | 12,500 | 61.0% | 8192 |

> https://platform.openai.com/docs/guides/embeddings?lang=python


In [2]:
# Pull variables from a local .env file into the process environment, then
# read the OpenAI key from there so it never has to be hardcoded in the notebook.
load_dotenv()
openai_api_key = os.getenv("OPENAI_API_KEY")

# os.getenv returns None when the variable is missing; stop here with a clear
# message instead of letting a later cell fail with a less obvious error.
if not openai_api_key:
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Add it to your .env file or export it "
        "in the environment before running this notebook."
    )

In [3]:
# Embedding client reused by every later cell. It targets the
# "text-embedding-3-large" model from the table above and authenticates with
# the key that was read from the environment.
embeddings = OpenAIEmbeddings(
    model="text-embedding-3-large",
    openai_api_key=openai_api_key,
)

In [7]:
# Embed a single query string with `embed_query` and print the resulting vector.
query = "What is the capital of France?"
vector = embeddings.embed_query(query)
# NOTE(review): this prints every component of the vector, which produces a
# very long cell output; consider showing len(vector) and a short slice instead.
print(f"Embedding vector for the query '{query}':\n{vector}")

Embedding vector for the query 'What is the capital of France?':
[-0.05558660626411438, 0.04085923358798027, -0.012091625481843948, 0.04258221387863159, -0.056530144065618515, -0.027403581887483597, 0.022952551022171974, -0.011599346064031124, -0.02693181484937668, -0.008620026521384716, -0.0026767721865326166, 0.00039484951412305236, -0.02643953450024128, -0.018265636637806892, 0.01230699848383665, 0.0007679053815081716, -0.0052971369586884975, 0.016583679243922234, -2.0691941244876944e-05, 0.007727769203484058, 0.021044965833425522, -0.008409782312810421, -0.019434800371527672, -0.03903369605541229, 0.0003938880399800837, 0.008927701972424984, 0.038541413843631744, -0.00912256259471178, 0.035362109541893005, 0.028695818036794662, 0.07351379841566086, 0.00417155958712101, 0.01736312359571457, -0.01743491366505623, -0.0444282628595829, -0.004476670641452074, 0.015065817162394524, 0.015301700681447983, 0.025208834558725357, 0.0065842438489198685, 0.009778935462236404, -0.012030090205371

- ##### embed_documents
- ##### embed_query


In [4]:
from langchain_community.document_loaders import TextLoader

# Read the speech transcript from disk as LangChain documents.
# The encoding is pinned to UTF-8 so decoding does not depend on the platform's
# default locale; the text contains non-ASCII punctuation (e.g. an em dash).
# NOTE(review): assumes speech.txt is stored as UTF-8 — confirm.
textloader = TextLoader("../1.data-ingestion/speech.txt", encoding="utf-8")
speech = textloader.load()

In [14]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Break the loaded speech into small, slightly overlapping pieces so that each
# piece can be embedded and retrieved on its own.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=200,
    chunk_overlap=15,
)

speech_chunks = splitter.split_documents(speech)

# Report how many pieces were produced, then display them as the cell result.
print(f"Number of chunks created: {len(speech_chunks)}")
speech_chunks

Number of chunks created: 23


[Document(metadata={'source': '../1.data-ingestion/speech.txt'}, page_content='The world must be made safe for democracy. Its peace must be planted upon the tested foundations of political liberty. We have no selfish ends to serve. We desire no conquest, no dominion. We seek no'),
 Document(metadata={'source': '../1.data-ingestion/speech.txt'}, page_content='We seek no indemnities for ourselves, no material compensation for the sacrifices we shall freely make. We are but one of the champions of the rights of mankind. We shall be satisfied when those'),
 Document(metadata={'source': '../1.data-ingestion/speech.txt'}, page_content='when those rights have been made as secure as the faith and the freedom of nations can make them.'),
 Document(metadata={'source': '../1.data-ingestion/speech.txt'}, page_content='Just because we fight without rancor and without selfish object, seeking nothing for ourselves but what we shall wish to share with all free peoples, we shall, I feel confident, cond

# Vector Store and Embedding at Once


In [16]:
from langchain_community.vectorstores import Chroma

# Embed every chunk with the configured embedding model and load the results
# into a Chroma vector store in a single call.
db = Chroma.from_documents(
    documents=speech_chunks,
    embedding=embeddings,
)

In [22]:
# Ask the vector store for the chunks closest in meaning to a phrase from the
# speech, then pretty-print the matching documents.
retrieved_docs = db.similarity_search(
    query="the sincere friends of the German people",
)
pprint(retrieved_docs)

[Document(metadata={'source': '../1.data-ingestion/speech.txt'}, page_content='amuck. We are, let me say again, the sincere friends of the German people, and shall desire nothing so much as the early reestablishment of intimate relations of mutual advantage between us—however'),
 Document(metadata={'source': '../1.data-ingestion/speech.txt'}, page_content='We shall, happily, still have an opportunity to prove that friendship in our daily attitude and actions toward the millions of men and women of German birth and native sympathy who live among us and'),
 Document(metadata={'source': '../1.data-ingestion/speech.txt'}, page_content='true and loyal Americans as if they had never known any other fealty or allegiance. They will be prompt to stand with us in rebuking and restraining the few who may be of a different mind and'),
 Document(metadata={'source': '../1.data-ingestion/speech.txt'}, page_content='among us and share our life, and we shall be proud to prove it toward all who are in f

# Ollama Embeddings

> https://ollama.com/blog/embedding-models


# Hugging Face Embeddings
