In [None]:
import uuid
import glob
from bs4 import BeautifulSoup
import textwrap
import json
import os
from weaviate import Client

# Connect to the local Weaviate instance.
client = Client("http://localhost:8080")

# Directory that contains the HTML files to import.
directory = '/workspaces/BU_Chatbot/weavdb_direct'

# Only a small sample is imported. glob.glob() returns paths in arbitrary
# order, so sort first to make "the first 5 files" the same on every run.
html_paths = sorted(glob.glob(os.path.join(directory, '*.html')))[:5]

for filepath in html_paths:
    # Explicit encoding: the default depends on the platform/locale.
    with open(filepath, 'r', encoding='utf-8') as file:
        raw_html = file.read()

    # Rebuild the URL from the file name: drop the extension and turn every
    # '-' into '/'.
    # NOTE(review): this is lossy if the original URL itself contained
    # hyphens -- confirm against how the files were named when saved.
    page_name = os.path.splitext(os.path.basename(filepath))[0]
    url = page_name.replace('-', '/')

    # Random identifier stored with the page (as the "url_id" property).
    webpage_id = str(uuid.uuid4())

    # Payload for the Webpage object.
    # NOTE(review): "class" is sent as a regular property here in addition to
    # the class name passed to create() -- confirm the schema expects it.
    webpage_obj = {
        "class": "Webpage",
        "url_id": webpage_id,
        "url": url,
        "raw_html": raw_html
    }

    # Store the object in the "Webpage" class.
    client.data_object.create(webpage_obj, "Webpage")

In [None]:
#

import html2text
import os
import time
import weaviate
from datetime import datetime
import glob

# Load data
directory = '/workspaces/BU_Chatbot/weavdb_direct'
data = []

# Import limits and timings, named so messages and behavior cannot drift apart.
MAX_FILES = 3                  # number of HTML files to load
MAX_CHARS = 100000             # cap on stored HTML / text length per page
RATE_LIMIT_WAIT_SECONDS = 60   # pause after a 429 response before retrying
REQUEST_DELAY_SECONDS = 1      # pause between consecutive objects

# Only *.html files are considered (other directory entries are skipped), and
# the listing is sorted so the same files are picked on every run.
html_paths = sorted(glob.glob(os.path.join(directory, '*.html')))[:MAX_FILES]

for filepath in html_paths:
    filename = os.path.basename(filepath)

    # Rebuild the URL from the file name: drop the ".html" extension and turn
    # every '-' into '/'.
    # NOTE(review): lossy if the original URL contained hyphens -- confirm
    # against how the files were named when saved.
    url = os.path.splitext(filename)[0].replace('-', '/')

    # Explicit encoding: the default depends on the platform/locale.
    with open(filepath, 'r', encoding='utf-8') as f:
        html_content = f.read()

    # Convert the full HTML to plain text first, then truncate both versions.
    h1 = html2text.HTML2Text()
    h2 = h1.handle(html_content)
    text_content = h2[:MAX_CHARS]
    html_content = html_content[:MAX_CHARS]

    # NOTE(review): text_content is collected here but is not part of the
    # properties uploaded below -- confirm whether it should be stored too.
    data.append({
        'url': url,
        'html_content': html_content,
        'text_content': text_content
    })

# Configure a batch process (relies on `client` created in an earlier cell).
with client.batch(
    batch_size=100
) as batch:
    # Batch import all loaded web pages
    for i, d in enumerate(data):
        print(f"importing question: {i+1}")

        # "id" is a reserved name in Weaviate and must not be sent as a
        # regular property; the deterministic UUID derived from the URL is
        # passed through the dedicated `uuid` argument instead.
        object_uuid = weaviate.util.generate_uuid5(d["url"])

        # NOTE(review): datetime.now() is timezone-naive; if "timestamp" is a
        # Weaviate `date` property an RFC 3339 value with offset is required.
        properties = {
            "timestamp": datetime.now().isoformat(),
            "url": d["url"],
            "html_content": d["html_content"]
        }

        # Retry mechanism for rate limit error
        # NOTE(review): confirm the installed weaviate client exposes
        # WeaviateBatchError with a `status_code` attribute and that
        # add_data_object can raise it; otherwise this handler never fires.
        while True:
            try:
                batch.add_data_object(
                    properties,
                    "webpage",
                    uuid=object_uuid,
                )
                break
            except weaviate.WeaviateBatchError as e:
                if e.status_code == 429:
                    # Rate limit exceeded: wait, then retry the same object.
                    print(
                        "Rate limit exceeded. Waiting for "
                        f"{RATE_LIMIT_WAIT_SECONDS} seconds..."
                    )
                    time.sleep(RATE_LIMIT_WAIT_SECONDS)
                else:
                    raise

        # Short delay between consecutive objects
        time.sleep(REQUEST_DELAY_SECONDS)