In [56]:
from typing import List, Dict
from weaviate.classes.config import Property, DataType
from haystack_integrations.document_stores.weaviate.document_store import WeaviateDocumentStore

# Connect to the Weaviate instance on localhost; the same store is reused by
# the URL-tracking helpers and by the indexing pipeline's writer.
document_store = WeaviateDocumentStore(url="http://localhost:8080")

# Collection that records each URL together with an "is_visited" flag.
# It is created only when it does not exist yet, so re-running this cell
# does not attempt to create it twice.
visited_urls_class = "VisitedURL"
if not document_store.client.collections.exists(visited_urls_class):
    document_store.client.collections.create(
        visited_urls_class,
        properties=[
            Property(name=prop_name, data_type=prop_type)
            for prop_name, prop_type in (
                ("url", DataType.TEXT),
                ("is_visited", DataType.BOOL),
            )
        ],
    )

In [57]:
from weaviate.classes.query import Filter

# Step 2: Define helper functions to check and store visited URLs
def fetch_urls() -> List[Dict[str, str]]:
    """Query the tracking collection for URLs not yet marked as visited.

    Returns:
        One dict per object returned by the query, with the keys
        "uuid" (the object's UUID converted to ``str``) and "url".
    """
    visited_urls = document_store.client.collections.get(visited_urls_class)
    not_visited_filter = Filter.by_property("is_visited").equal(False)
    pending = visited_urls.query.fetch_objects(filters=not_visited_filter)

    rows = []
    for obj in pending.objects:
        rows.append({"uuid": str(obj.uuid), "url": obj.properties["url"]})
    return rows

def mark_url_as_visited(uuid):
    """Set ``is_visited`` to True on the tracked object identified by ``uuid``."""
    visited_urls = document_store.client.collections.get(visited_urls_class)
    visited_flag = {"is_visited": True}
    visited_urls.data.update(properties=visited_flag, uuid=uuid)

def add_url(url):
    """Register ``url`` as not visited, unless an object with that URL exists.

    The collection is first queried for objects whose "url" property equals
    ``url``; a new object is inserted only when that query returns nothing.
    """
    visited_urls = document_store.client.collections.get(visited_urls_class)
    same_url = Filter.by_property("url").equal(url)
    existing = visited_urls.query.fetch_objects(filters=same_url).objects
    if existing:
        # Already tracked: leave the stored object (and its flag) untouched.
        return
    visited_urls.data.insert({"url": url, "is_visited": False})

In [58]:
from haystack.components.fetchers import LinkContentFetcher
from haystack import component
from haystack.dataclasses.byte_stream import ByteStream


# Step 3: Define the Fetcher with Visited URL Logic
class CustomLinkContentFetcher(LinkContentFetcher):
    """LinkContentFetcher that records fetched URLs as visited.

    Instead of plain URL strings, ``run`` receives dicts with the keys "url"
    and "uuid" (the shape produced by ``fetch_urls``), fetches the URLs
    through the parent component, and then flags every entry as visited via
    ``mark_url_as_visited``.
    """

    def __init__(self, *args, **kwargs):
        """Forward all arguments unchanged to ``LinkContentFetcher``."""
        super().__init__(*args, **kwargs)

    @component.output_types(streams=List[ByteStream])
    def run(self, urls: List[Dict[str, str]]):
        """Fetch the given URLs once and mark each of them as visited.

        Args:
            urls: Dicts with a "url" entry (address to fetch) and a "uuid"
                entry (identifier passed to ``mark_url_as_visited``).

        Returns:
            The parent's ``run`` result for the whole batch, or
            ``{"streams": []}`` when ``urls`` is empty.
        """
        if not urls:
            # Nothing to fetch; return an empty result rather than failing
            # on an unbound local as the loop-based version did.
            return {"streams": []}

        # Fetch the whole batch exactly once. Calling the parent inside the
        # per-URL loop re-downloaded every URL once per list entry.
        results = super().run([entry["url"] for entry in urls])

        # Mark entries only after the fetch call returned, so an exception
        # raised while fetching leaves them unvisited.
        for entry in urls:
            mark_url_as_visited(entry["uuid"])
        return results


In [59]:
 
from haystack.components.converters import HTMLToDocument
from haystack.components.writers import DocumentWriter
from haystack_integrations.components.embedders.fastembed import FastembedDocumentEmbedder

# Components of the indexing pipeline: fetch pages, convert the HTML to
# Documents, embed them, and write them to the Weaviate document store.
fetcher = CustomLinkContentFetcher(retry_attempts = 2, timeout = 3)
# HTMLToDocument is imported above; without that import this cell raises
# NameError when the notebook is run on a fresh kernel.
converter = HTMLToDocument()
writer = DocumentWriter(document_store = document_store)
doc_embedder = FastembedDocumentEmbedder(
    model="BAAI/bge-small-en-v1.5",
    batch_size=256,
)

# Warm up the embedder here, before it is added to the pipeline.
doc_embedder.warm_up()

In [60]:
from haystack import Pipeline

# Assemble the indexing pipeline: fetcher -> converter -> embedder -> writer.
indexing_pipeline = Pipeline()
indexing_pipeline.add_component("fetcher", fetcher)
indexing_pipeline.add_component("converter", converter)
indexing_pipeline.add_component("embedder", doc_embedder)
indexing_pipeline.add_component("writer", writer)

# Wire each component's output socket to the next component's input socket.
indexing_pipeline.connect("fetcher.streams", "converter.sources")
indexing_pipeline.connect("converter.documents", "embedder.documents")
indexing_pipeline.connect("embedder.documents", "writer.documents")

# Register the starting URLs; add_url skips any that are already tracked.
seed_urls = ["https://fresh.deno.dev/docs/concepts/islands"]
for url in seed_urls:
    add_url(url)

# Keep indexing until the tracking collection reports no unvisited URLs.
urls = fetch_urls()
print(urls)
while urls:
    indexing_pipeline.run({"fetcher": {"urls": urls}})
    urls = fetch_urls()

[]


In [61]:
# Show what ended up in the document store: each stored document as a dict.
# Kept as the cell's last expression so the notebook displays the result.
[stored_doc.to_dict() for stored_doc in document_store.filter_documents()]

[{'id': 'd16106b0f9caa7df0d7f2ebf828a4d2199ed2b1d006098eddc59f3eabbfc6163',
  'content': 'Interactive islands\nIslands enable client side interactivity in Fresh. Islands are isolated Preact components that are rendered on the server and then hydrated on the client. This is different from all other components in Fresh, as they are usually rendered on the server only.\nIslands are defined by creating a file in the islands/\nfolder in a Fresh\nproject. The name of this file must be a PascalCase or kebab-case name of the\nisland.\nimport { useSignal } from "@preact/signals";\nexport default function MyIsland() {\nconst count = useSignal(0);\nreturn (\n<div>\nCounter is at {count}.{" "}\n<button onClick={() => (count.value += 1)}>+</button>\n</div>\n);\n}\nAn island can be used in a page like a regular Preact component. Fresh will take care of automatically re-hydrating the island on the client.\nimport MyIsland from "../islands/my-island.tsx";\nexport default function Home() {\nreturn <MyI