In [1]:
import os

from dotenv import load_dotenv
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings

# Load environment variables from .env
load_dotenv()

USER_AGENT environment variable not set, consider setting it to identify your requests.


True

In [2]:
# Define the persistent directory
current_dir = os.getcwd()
db_dir = os.path.join(current_dir, "db")
persistent_directory = os.path.join(db_dir, "chroma_db_apple")


In [3]:
# Step 1: Scrape the content from apple.com using WebBaseLoader
# WebBaseLoader loads web pages and extracts their content
urls = ["https://www.apple.com/"]

# Create a loader for web content
loader = WebBaseLoader(urls)
documents = loader.load()

In [4]:
# Step 2: Split the scraped content into chunks
# CharacterTextSplitter splits the text into smaller chunks
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
docs = text_splitter.split_documents(documents)

Created a chunk of size 2003, which is longer than the specified 1000


In [5]:
# Display information about the split documents
print("\n--- Document Chunks Information ---")
print(f"Number of document chunks: {len(docs)}")
print(f"Sample chunk:\n{docs[0].page_content}\n")



--- Document Chunks Information ---
Number of document chunks: 7
Sample chunk:
Apple


Apple

AppleStoreMaciPadiPhoneWatch
VisionAirPodsTV & HomeEntertainmentAccessoriesSupport


0+

 

Apple Intelligence is here.
Experience it now on the latest iPhone, iPad, and Mac models with a free software update.1


MacBook Pro
A work of smart.
Available starting 11.8

Learn more
Pre-order

Hello, Apple Intelligence.


 

Apple Intelligence is here.
Experience it now on the latest iPhone, iPad, and Mac models with a free software update.1


Mac mini
Size down. Power up.
Available starting 11.8

Learn more
Pre-order

Hello, Apple Intelligence.


 

Apple Intelligence is here.
Experience it now on the latest iPhone, iPad, and Mac models with a free software update.1


iMac
Brilllllliant.
Available starting 11.8

Learn more
Pre-order

Hello, Apple Intelligence.


 

iPhone 16 Pro
Hello, Apple Intelligence.

Learn more
Buy


 

iPhone 16
Hello, Apple Intelligence.

Learn more
Buy


 

AirPods Pro 2


In [6]:
# Step 3: Create embeddings for the document chunks
# OpenAIEmbeddings turns text into numerical vectors that capture semantic meaning
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")


In [7]:
# Step 4: Create and persist the vector store with the embeddings
# Chroma stores the embeddings for efficient searching
if not os.path.exists(persistent_directory):
    print(f"\n--- Creating vector store in {persistent_directory} ---")
    db = Chroma.from_documents(docs, embeddings, persist_directory=persistent_directory)
    print(f"--- Finished creating vector store in {persistent_directory} ---")
else:
    print(f"Vector store {persistent_directory} already exists. No need to initialize.")
    db = Chroma(persist_directory=persistent_directory, embedding_function=embeddings)


--- Creating vector store in /home/g/Documentos/LangChain_Curso/04_RAGS/db/chroma_db_apple ---
--- Finished creating vector store in /home/g/Documentos/LangChain_Curso/04_RAGS/db/chroma_db_apple ---


In [8]:
# Step 5: Query the vector store
# Create a retriever for querying the vector store
retriever = db.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 3},
)

In [9]:
# Define the user's question
query = "What new products are announced on Apple.com?"

# Retrieve relevant documents based on the query
relevant_docs = retriever.invoke(query)

In [10]:
# Display the relevant results with metadata
print("\n--- Relevant Documents ---")
for i, doc in enumerate(relevant_docs, 1):
    print(f"Document {i}:\n{doc.page_content}\n")
    if doc.metadata:
        print(f"Source: {doc.metadata.get('source', 'Unknown')}\n")


--- Relevant Documents ---
Document 1:
Hearing Test, Hearing Aid, and Hearing Protection features in a free software update.2


Learn more
Buy


 

Apple Intelligence
AI for the rest of us.

Learn more
Watch the film


 

Apple Trade In
Get $180-$650 in credit when you trade in iPhone 12 or higher.3

Get your estimate


 

Apple Card
Get up to 3% Daily Cash back with every purchase.

Learn more
Apply now
Apply now


Apple TV+

FAM Gallery

Listen now


Puro Pop

Play now

NBA 2K25 Arcade Edition

Watch now

Run Your First 5K

Listen now


The Crate (Halloween Edition)

Play now

Hello Kitty Island Adventure

Watch now

HIIT with Brian


Apple Footer

Source: https://www.apple.com/

Document 2:
Apple


Apple

AppleStoreMaciPadiPhoneWatch
VisionAirPodsTV & HomeEntertainmentAccessoriesSupport


0+

 

Apple Intelligence is here.
Experience it now on the latest iPhone, iPad, and Mac models with a free software update.1


MacBook Pro
A work of smart.
Available starting 11.8

Learn more
Pre