In [4]:
! pip install -qU firecrawl-py==0.0.20 langchain_community

In [None]:
! pip install langchain-chroma

In [7]:
! pip install -qU langchain-openai

In [30]:
import os

from dotenv import load_dotenv
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.document_loaders import FireCrawlLoader
from langchain_chroma import Chroma
from langchain_openai import OpenAIEmbeddings

load_dotenv()

True

Get the API key from FireCrawl

In [17]:
firecrawl_api_key = os.getenv("FIRECRAWL_API_KEY")

In [18]:
print(firecrawl_api_key)

fc-6dc3e1bdef9841e08757eda9bd0aa815


### Create the directory for the database

In [9]:
current_dir = os.getcwd()
db_dir = os.path.join(current_dir, 'db')
persistent_directory = os.path.join(db_dir, "chroma_db_firecrawl")

### Step 1: Crawl the website using FireCrawl Loader

- Define the url link

In [12]:
url = "https://cloud.google.com/sql?utm_source=google&utm_medium=cpc&utm_campaign=na-CA-all-en-dr-bkws-all-all-trial-e-dr-1707554&utm_content=text-ad-none-any-DEV_c-CRE_678241001169-ADGP_Hybrid+%7C+BKWS+-+MIX+%7C+Txt-Databases-Cloud+SQL-KWID_43700077224547386-kwd-28489936691&utm_term=KW_google%20cloud%20sql-ST_google+cloud+sql&gad_source=1&gclid=Cj0KCQjwo8S3BhDeARIsAFRmkONMaBPN_JxvUIp3-aPW6MjC9o2P4XJx-Hl3oIMeF0GBdykeZWA5Gc4aAlxGEALw_wcB&gclsrc=aw.ds"

In [19]:
loader = FireCrawlLoader(
    api_key=firecrawl_api_key,
    url=url,
    mode="scrape",
)
docs = loader.load()

- Convert with the metadata if they are lists

In [20]:
for doc in docs:
    for key, value in doc.metadata.items():
        if isinstance(value, list):
            doc.metadata[key] = ", ".join(map(str, value))

### Step 2: Split the content into chunks

In [27]:
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
split_docs = text_splitter.split_documents(docs)



- Display the split documents

In [29]:
print(f"Number of document chunks: {len(split_docs)}")
print(f"Sample chunk:\n{split_docs[1].page_content}\n")

Number of document chunks: 122
Sample chunk:
- [Cloud SQL](/sql?utm_source=google&utm_medium=cpc&utm_campaign=na-CA-all-en-dr-bkws-all-all-trial-e-dr-1707554&utm_content=text-ad-none-any-DEV_c-CRE_678241001169-ADGP_Hybrid+%7C+BKWS+-+MIX+%7C+Txt-Databases-Cloud+SQL-KWID_43700077224547386-kwd-28489936691&utm_term=KW_google%20cloud%20sql-ST_google+cloud+sql&gad_source=1&gclid=Cj0KCQjwo8S3BhDeARIsAFRmkONMaBPN_JxvUIp3-aPW6MjC9o2P4XJx-Hl3oIMeF0GBdykeZWA5Gc4aAlxGEALw_wcB&gclsrc=aw.ds#focus-on-your-application-and-leave-the-database-to-us)
- [Features](/sql?utm_source=google&utm_medium=cpc&utm_campaign=na-CA-all-en-dr-bkws-all-all-trial-e-dr-1707554&utm_content=text-ad-none-any-DEV_c-CRE_678241001169-ADGP_Hybrid+%7C+BKWS+-+MIX+%7C+Txt-Databases-Cloud+SQL-KWID_43700077224547386-kwd-28489936691&utm_term=KW_google%20cloud%20sql-ST_google+cloud+sql&gad_source=1&gclid=Cj0KCQjwo8S3BhDeARIsAFRmkONMaBPN_JxvUIp3-aPW6MjC9o2P4XJx-Hl3oIMeF0GBdykeZWA5Gc4aAlxGEALw_wcB&gclsrc=aw.ds#features)
- [Options](/sql

### Step 3: Create embeddings for chunks

In [31]:
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

### Step 4: Create the presist the vector store

In [33]:
db = Chroma.from_documents(
        split_docs, embeddings, persist_directory=persistent_directory
    )

In [34]:
# Load the vector store with the embeddings
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
db = Chroma(persist_directory=persistent_directory,
            embedding_function=embeddings)

### Step 5: Query the vector store

- Create a retriever for querying the vector store

In [35]:
retriever = db.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 3},
)

- Retrieve relevant documents based on the query


In [36]:
query = "What is the difference between Google Cloud SQL and Google Cloud Spanner?"

In [37]:
relevant_docs = retriever.invoke(query)

- Display the results with metadata

In [38]:
print("\n--- Relevant Documents ---")
for i, doc in enumerate(relevant_docs, 1):
    print(f"Document {i}:\n{doc.page_content}\n")
    if doc.metadata:
        print(f"Source: {doc.metadata.get('source', 'Unknown')}\n")


--- Relevant Documents ---
Document 1:
Learn more about the benefits of [Spanner](https://cloud.google.com/spanner), including:

- Strong consistency and global scale
- Run large queries with zero impact on operational traffic
- Highly scalable database with relational and non-relational capabilities, offering 99.999% availability

How It Works

### Cloud SQL scales up in minutes and replicates data across zones and regions. It uses agents for maintenance, logging, monitoring, and configuration, with services backed by a 24/7 SRE team. Manage your database via the console, CLI, or REST API and connect your app via standard database drivers.

[View documentation](https://cloud.google.com/sql/docs)

[![Database Configurations with Cloud SQL video thumbnail](https://lh3.googleusercontent.com/l_l5uO0hQ6Ps6rKE6no4q6SuiOhsrNIUnOvpJH9oNtgIkBRqXXuemjdqqm98dfu2o26tVRTbNuXh=s472-w472)](https://youtu.be/q6noaMAnk5s)

Common Uses

### Database migration

Source: Unknown

Document 2:
| Google Clou