# For fire crawl
<hr>

- This pipeline is built on LlamaIndex and Firecrawl.

In [1]:
import os

from dotenv import load_dotenv
load_dotenv()

True

In [30]:
# Validate the key *before* writing it back into os.environ: assigning None to
# an os.environ entry raises TypeError, which would hide the clearer
# ValueError below when the variable is missing.
openai_api_key = os.getenv('OPENAI_API_KEY')
if not openai_api_key:
    raise ValueError("OPENAI_API_KEY not found in environment variables")
os.environ['OPENAI_API_KEY'] = openai_api_key

- Set the Firecrawl API key

In [2]:
# Read the Firecrawl credential from the environment and fail fast when it is
# missing or empty.
firecrawl_api_key = os.environ.get("FIRECRAWL_API_KEY")
if firecrawl_api_key is None or firecrawl_api_key == "":
    raise ValueError("FIRECRAWL_API_KEY environment variable not set")

- create the vector database

- scrape from the website

In [None]:
! pip install firecrawl-py==0.0.20 llama_index llama-index llama-index-readers-web

- put the url here

In [172]:
url_1 = "https://aws.amazon.com/ec2/instance-types/"
url_2 = "https://spot.io/resources/azure-pricing/the-complete-guide/"

- URL 1

In [20]:
url = url_1

- URL 2

In [173]:
# Select URL 2. When the notebook is run top to bottom this assignment
# overrides the `url = url_1` selection made in the previous cell.
url = url_2

In [174]:
from llama_index.readers.web import FireCrawlWebReader
from llama_index.core import SummaryIndex

# Configure the Firecrawl-backed reader. The key is the value read from the
# FIRECRAWL_API_KEY environment variable (keys are issued at
# https://www.firecrawl.dev/).
reader_options = {
    "api_key": firecrawl_api_key,
    "mode": "scrape",  # the reader accepts either "crawl" or "scrape"
    # "params": {"additional": "parameters"},  # optional additional parameters
}
firecrawl_reader = FireCrawlWebReader(**reader_options)

# Fetch the page selected in `url` and load it as documents
documents = firecrawl_reader.load_data(url)


- Optional: build an index directly from the scraped documents, without an external vector store

In [198]:
# Build a SummaryIndex straight from the scraped documents.
# NOTE(review): `index` is rebound to a VectorStoreIndex in the Chroma section
# further down, so this object is replaced once that cell runs.
index = SummaryIndex.from_documents(documents)

- The crawled documents carry metadata that cannot be stored in Chroma DB as-is

In [175]:
# Documents generated from the URL with metadata
documents[0]

Document(id_='78ef1476-77e8-4529-b575-2d47b0e8c3f1', embedding=None, metadata={'title': 'Azure Pricing: The Complete Guide | Spot.io', 'description': 'How Much Does Azure Cost Per Month? Microsoft Azure is a cloud provider offering a diverse range of services, including storage, networking, compute, and', 'language': 'en', 'robots': 'index, follow, max-image-preview:large, max-snippet:-1, max-video-preview:-1', 'ogTitle': 'Azure Pricing: The Complete Guide', 'ogDescription': 'How Much Does Azure Cost Per Month? Microsoft Azure is a cloud provider offering a diverse range of services, including storage, networking, compute, and', 'ogUrl': 'https://spot.io/resources/azure-pricing/the-complete-guide/', 'ogImage': 'https://spot.io/wp-content/themes/nylon/assets/img/social-share-thumbnail.png', 'ogLocale': 'en_US', 'ogLocaleAlternate': [], 'ogSiteName': 'Spot.io', 'modifiedTime': '2024-09-24T17:22:56+00:00', 'sourceURL': 'https://spot.io/resources/azure-pricing/the-complete-guide/', 'pageSt

In [176]:
index

<llama_index.core.indices.vector_store.base.VectorStoreIndex at 0x327ce9710>

### Helper that sanitizes document metadata (unsupported value types are converted to strings)

In [177]:
def clean_metadata(metadata):
    """Coerce unsupported metadata values to strings, in place.

    Values that are already ``str``, ``int``, ``float`` or ``None`` are left
    untouched; anything else (lists, dicts, ...) is replaced by ``str(value)``.

    Args:
        metadata: Mapping of metadata keys to values. It is modified in place.

    Returns:
        The same ``metadata`` object that was passed in.
    """
    allowed_types = (str, int, float, type(None))
    # Collect the offending keys first, then rewrite them in a second pass.
    keys_to_stringify = [
        name for name, item in metadata.items()
        if not isinstance(item, allowed_types)
    ]
    for name in keys_to_stringify:
        metadata[name] = str(metadata[name])
    return metadata

- Sanitize the metadata of every document (values are converted, not deleted)

In [178]:
# Normalize the metadata of every scraped document before it is stored
for document in documents:
    document.metadata = clean_metadata(document.metadata)


In [155]:
documents[0]

Document(id_='00a4fc9c-a1a4-45f4-9748-0f994c7d1c7c', embedding=None, metadata={'page_label': '1', 'file_name': 'Haida_bracelet.pdf', 'file_path': '/Users/chris/Desktop/7980/CS7980_Capstone_RBCMuseum/llamaindex/data/Haida_bracelet.pdf', 'file_type': 'application/pdf', 'file_size': 311842, 'creation_date': '2024-09-23', 'last_modified_date': '2024-09-23'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={}, text='Haida bracelet\n Date:  1980 Record:  RBCM 18826 Materials:  silver Artist:  Robert Davidson guud san glans, Robert Davidson (born 1946) is a great grandson of the Haida artist, Charles Edenshaw, and carries on Edenshaw’s legacy of innovation combined with traditional craftsmanship. This beautifully designed and executed silver bracelet, titled Happy Negati

Define the embedding model (optional, for later comparison)

In [None]:
! pip install llama-index-embeddings-huggingface
! pip install llama-index-embeddings-instructor

In [57]:
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

# Local HuggingFace embedding model, kept for comparison.
# NOTE(review): `embed_model` is rebound to an OpenAI embedding model in the
# Chroma section below before the index is built.
embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-base-en-v1.5")

### Using Chroma DB to store the data
<hr>

In [None]:
! pip install llama-index-stores-chroma
! pip install chroma


### Persist: Saving to Disk

- Create the folder

In [79]:
import chromadb

# Location of the on-disk Chroma store. Set the CHROMA_DB_DIR environment
# variable to point at an existing store; the default is a folder relative to
# the working directory rather than a machine-specific absolute path.
chroma_db_dir = os.getenv("CHROMA_DB_DIR", "./chroma_db")
db = chromadb.PersistentClient(path=chroma_db_dir)

- create a collection
- Each new document that should be embedded and stored needs its own collection
- The collection name (`webpage2` here) should be changed for each new document

In [179]:
# Name of the collection for the current page; change it for each new document
collection_name = "webpage2"
chroma_collection = db.get_or_create_collection(collection_name)

In [180]:
from llama_index.vector_stores.chroma import ChromaVectorStore
# Wrap the Chroma collection in the LlamaIndex vector-store adapter
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)

In [181]:
from llama_index.core import StorageContext
# Storage context that uses the Chroma-backed vector store; all other
# components keep their defaults
storage_context = StorageContext.from_defaults(vector_store=vector_store)

In [195]:
from llama_index.embeddings.openai import OpenAIEmbedding
# Rebinds `embed_model` (previously the HuggingFace model defined earlier) to
# the OpenAI "text-embedding-3-small" model; this is the one passed to the
# index build below.
embed_model = OpenAIEmbedding(model="text-embedding-3-small")

- Build the index on top of the vector store

In [203]:
from llama_index.core import VectorStoreIndex
# Build the index from the documents using the Chroma-backed storage context
# and the selected embedding model
index = VectorStoreIndex.from_documents(
    documents,
    storage_context=storage_context,
    embed_model=embed_model,
)

<hr>

### TEST
<hr>

In [204]:
# Query engine over the current `index`, created with default arguments
query_engine = index.as_query_engine()

In [185]:
test_question_1 = "What is instance price of  Azure,?"

In [186]:
response_1 = query_engine.query(test_question_1)

In [187]:
from llama_index.core.response.pprint_utils import pprint_response
pprint_response(response_1, show_source=True)

Final Response: The instance price of Azure varies depending on the
pricing model chosen. For pay-as-you-go pricing, services are billed
per second with no long-term commitment or upfront payments. Reserved
Instances offer a discount of up to 72% compared to pay-as-you-go
prices when virtual machines are pre-purchased for one or three years
in a specific region. Spot Pricing allows users to buy unused
computing power at a discount of up to 90% compared to pay-as-you-go
prices, but these spot instances can be interrupted on short notice.
______________________________________________________________________
Source Node 1/2
Node ID: fe7a7c58-e889-494d-a2cc-88df501b7cb5
Similarity: 0.4983790253029297
Text: ### Pay as You Go  You can pay for services on Azure according
to actual usage, billed per second, with no long-term commitment or
upfront payments. This provides complete flexibility to increase or
decrease resources as needed. Azure virtual machines (VMs) can be
automatically scaled u

In [205]:
test_question_2 = "What is instance price of Azure, instead of give me the link, just summarize what you have from the database?"

In [206]:
response_2 = query_engine.query(test_question_2)

In [207]:
pprint_response(response_2, show_source=True)

Final Response: Azure offers three main pricing models for instances:
Pay as You Go, Reserved Instances, and Spot Pricing. Pay as You Go
allows for flexible billing based on actual usage with no long-term
commitment. Reserved Instances involve pre-purchasing virtual machines
for one or three years in a specific region, offering discounts of up
to 72%. Spot Pricing allows users to buy unused computing power at a
discount of up to 90%, but these spot instances can be interrupted on
short notice. Each pricing model caters to different usage scenarios
and preferences.
______________________________________________________________________
Source Node 1/1
Node ID: 5c554a28-ea8f-45b7-98e9-8bf4cf530efe
Similarity: 0.47218622829484785
Text: ### Pay as You Go  You can pay for services on Azure according
to actual usage, billed per second, with no long-term commitment or
upfront payments. This provides complete flexibility to increase or
decrease resources as needed. Azure virtual machines (VMs) 

In [191]:
test_question_3 = "What is the Haida bracelet"

In [192]:
response_3 = query_engine.query(test_question_3)

In [193]:
pprint_response(response_3, show_source=True)

Final Response: The Haida bracelet is not mentioned in the provided
context information.
______________________________________________________________________
Source Node 1/2
Node ID: de6e633b-2ba7-48fc-94ee-b9a2de626563
Similarity: 0.15706084283497512
Text: [](data:image/svg+xml,%3Csvg%20xmlns='http://www.w3.org/2000/svg
'%20viewBox='0%200%2020%2020'%3E%3C/svg%3E)Google CloudCloud
optimization for Google Cloud](https://spot.io/solutions/google-cloud-
platform/)   - [![](data:image/svg+xml,%3Csvg%20xmlns='http://www.w3.o
rg/2000/svg'%20viewBox='0%200%2020%2020'%3E%3C/svg%3E)Container
infrastructureDeli...
______________________________________________________________________
Source Node 2/2
Node ID: e9c48505-a460-4328-bb6b-c9ea7f5c17c5
Similarity: 0.15565416160770623
Text: [](data:image/svg+xml,%3Csvg%20xmlns='http://www.w3.org/2000/svg
'%20viewBox='0%200%2026%2020'%3E%3C/svg%3E)Elastigroup](https://spot.i
o/product/elastigroup/)   - Kubernetes     - [![](data:image/svg+xml,%
3Csvg%20

- Customized settings for the vector storage (optional, for later comparison)

In [None]:
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core.node_parser import SentenceSplitter
from llama_index.llms.openai import OpenAI
from llama_index.core import Settings

# Global LlamaIndex settings, grouped as named values so they are easy to tune
llm_model_name = "gpt-3.5-turbo"
embedding_model_name = "text-embedding-3-small"
chunk_size, chunk_overlap = 512, 20
num_output = 512
context_window = 3900

Settings.llm = OpenAI(model=llm_model_name)
Settings.embed_model = OpenAIEmbedding(model=embedding_model_name)
Settings.node_parser = SentenceSplitter(
    chunk_size=chunk_size, chunk_overlap=chunk_overlap
)
Settings.num_output = num_output
Settings.context_window = context_window