# In [None]:
# ✅ 1. Load Environment Variables (like API keys)
import os
from dotenv import load_dotenv

load_dotenv()  # load all the environment variables from a local .env file

# Fail fast with a readable message when the key is missing or empty.
# Assigning the result of os.getenv() straight into os.environ raises an
# opaque "TypeError: str expected, not NoneType" when the variable is unset.
openai_api_key = os.getenv("OPENAI_API_KEY")
if not openai_api_key:
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Add it to your .env file or export it "
        "in the environment before running this script."
    )
os.environ["OPENAI_API_KEY"] = openai_api_key

# 🧱 2. Create an Embedding Model
from langchain_openai import OpenAIEmbeddings

# Embedding client bound to OpenAI's "text-embedding-3-large" model.
embeddings = OpenAIEmbeddings(
    model="text-embedding-3-large",
)
embeddings  # bare expression: a notebook cell ending here displays the object

# 📝 3. Convert a Sample Text into Vector (Embedding)
# embed_query takes one string and gives back its embedding vector.
text = "This is a tutorial on OPENAI embedding"
query_result = embeddings.embed_query(text)
query_result  # bare expression again, echoing the vector in a notebook

# 🔢 4. Change Embedding Size (Optional)
# `dimensions` is the OpenAIEmbeddings parameter that controls the length of
# the returned vectors. (`embedding_size` is not a documented parameter of
# OpenAIEmbeddings, so it could not select the 1024-dimensional output.)
embeddings = OpenAIEmbeddings(model="text-embedding-3-large", dimensions=1024)
# This rebinds `embeddings` to a second model instance that produces
# 1024-dimensional vectors (smaller size).
# Useful if you want to save memory or speed things up.
# Everything below that uses `embeddings` works with these shorter vectors.

# 📄 5. Load Your Document (e.g., speech.txt)
from langchain_community.document_loaders import TextLoader

# Decode the file explicitly as UTF-8. Without `encoding`, the file is read
# with the platform's default encoding, so the same script can fail or
# produce garbled text on machines whose default is not UTF-8.
# NOTE(review): assumes speech.txt is saved as UTF-8 — confirm for your file.
loader = TextLoader('speech.txt', encoding="utf-8")
docs = loader.load()  # read the file into `docs` for the splitter below

# ✂️ 6. Split the Document into Chunks
from langchain_text_splitters import RecursiveCharacterTextSplitter

# A whole document is too big to handle in one piece, so it is cut into
# chunks of at most 500 characters. Neighbouring chunks share 50 characters
# so text near a boundary is not stranded without its surrounding context.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=50,
)

# `final_documents` holds the small pieces that go into the vector DB next.
final_documents = text_splitter.split_documents(docs)

# 💾 7. Store the Documents as Vectors in a Vector Database (Chroma)
from langchain_community.vectorstores import Chroma

# Every chunk is embedded with `embeddings` and stored in Chroma (a vector
# database), which makes the chunks searchable by meaning rather than only
# by exact words.
db = Chroma.from_documents(
    documents=final_documents,
    embedding=embeddings,
)
db  # bare expression: displays the store object when run as a notebook cell

# 🔍 8. Query the Vector Store (Semantic Search)
# Give the database a question or a phrase. It looks for stored chunks whose
# meaning is close to the query (not just matching keywords) and hands back
# the most relevant parts of the document.
query = "It will be all the easier for us to conduct ourselves as belligerents"
retrieved_results = db.similarity_search(query=query)
print(retrieved_results)
