In [1]:
import weaviate
import json
import os
from datetime import datetime
import time
import html2text

# Connect to Weaviate client
client = weaviate.Client(
    url="https://bu-cluster-2-o5pekqq0.weaviate.network",  
    auth_client_secret=weaviate.AuthApiKey(api_key="vXNsRxv6vSJ57r0JKOJxhlBwMDIBadbyvjGC"),  
    additional_headers={
        "X-OpenAI-Api-Key": "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"
    }
)

In [2]:
# helper function
def prettify(json_dict): 
    print(json.dumps(json_dict, indent=2))

In [6]:
webpage_class_schema = {
    "class": "Webpage",
    "description": "A webpage",
    "properties": [
        {"name": "url_id", "dataType": ["uuid"], "description": "The id of the webpage"},
        {"name": "url", "dataType": ["string"], "description": "The url of the webpage"},
        {"name": "raw_html", "dataType": ["string"], "description": "The raw html of the webpage"},
        {
            "name": "hasTextContent",
            "dataType": ["TextContent"],
            "description": "The text chunks of the webpage",
        }
    ]
}

text_content_class_schema = {
    "class": "TextContent",
    "description": "A chunk of cleaned and readable text from a given webpage",
    "vectorizer": "text2vec-openai",  # For OpenAI
    "properties": [
        {
            "name": "text_id", 
            "dataType": ["uuid"], 
            "description": "The id of the text chunk",
            "moduleConfig": {
                "text2vec-openai": {  # this must match the vectorizer used
                    'skip': True,
                    'vectorizePropertyName': False
                }
            }
        },
        {
            "name": "cleanText", 
            "dataType": ["text"],
            "description": "The cleaned text",
            "moduleConfig": {
                "text2vec-openai": {  # this must match the vectorizer used
                    'skip': False,
                    'vectorizePropertyName': False
                }
            }
        },
        {
            "name": "hasWebpage",
            "dataType": ["Webpage"],
            "description": "The webpage this text chunk belongs to",
            "moduleConfig": {
                "text2vec-openai": {  # this must match the vectorizer used
                    'skip': True,
                    'vectorizePropertyName': False
                }
            }
        },
    ],
}



In [57]:
schema = client.schema.get() # save schema
client.schema.delete_all() # delete all classes

In [60]:
# Create webpage class

try:
    client.schema.create_class(webpage_class_schema)
except Exception as e:
    if str(e) == """Add properties to classes! Unexpected status code: 422, with response body: {'error': [{'message': "property 'hasTextContent': invalid dataType: reference property to nonexistent class"}]}.""":
        pass  # Ignore the specific error and continue execution
    elif str(e) == """Create class! Unexpected status code: 422, with response body: {'error': [{'message': 'class name "Webpage" already exists'}]}.""":
        print("Class already exists")
    else:
        raise  # Reraise the exception if it doesn't match the specific error

try:
    client.schema.create_class(text_content_class_schema)
except Exception as e:
    if str(e) == """Add properties to classes! Unexpected status code: 422, with response body: {'error': [{'message': "property 'hasWebpage': invalid dataType: reference property to nonexistent class"}]}.""":
        pass  # Ignore the specific error and continue execution
    elif str(e) == """Create class! Unexpected status code: 422, with response body: {'error': [{'message': 'class name "TextContent" already exists'}]}.""":
        print("Class already exists")
    else:
        raise

try: client.schema.property.create("Webpage", {
    "name": "hasTextContent",
    "dataType": ["TextContent"],
})
except Exception as e:
    if str(e) == """Add property to class! Unexpected status code: 422, with response body: {'error': [{'message': 'class "hasTextContent": conflict for property "Webpage": already in use or provided multiple times'}]}.""": 
        pass
    else:
        raise

Class already exists
Class already exists
Add property to class! Unexpected status code: 422, with response body: {'error': [{'message': 'class "hasTextContent": conflict for property "Webpage": already in use or provided multiple times'}]}.


In [3]:
prettify(client.schema.get())

{
  "classes": [
    {
      "class": "Webpage",
      "description": "A webpage",
      "invertedIndexConfig": {
        "bm25": {
          "b": 0.75,
          "k1": 1.2
        },
        "cleanupIntervalSeconds": 60,
        "stopwords": {
          "additions": null,
          "preset": "en",
          "removals": null
        }
      },
      "multiTenancyConfig": {
        "enabled": false
      },
      "properties": [
        {
          "dataType": [
            "uuid"
          ],
          "description": "The id of the webpage",
          "indexFilterable": true,
          "indexSearchable": false,
          "name": "webpage_id"
        },
        {
          "dataType": [
            "text"
          ],
          "description": "The url of the webpage",
          "indexFilterable": true,
          "indexSearchable": true,
          "name": "url",
          "tokenization": "whitespace"
        },
        {
          "dataType": [
            "text"
          ],
          "

In [5]:
import textwrap
import json

# Helper function to clean and chunk text
def clean_and_chunk_html(raw_html, max_length=3000):
    h1 = html2text.HTML2Text()
    h1.ignore_links = True
    h2 = h1.handle(raw_html)
    clean_text = h2
    chunks = textwrap.wrap(clean_text, max_length)
    return chunks

In [6]:
import uuid
import glob
import os

# Directory that containszz a HTML files
directory = '/workspaces/BU_Chatbot/Questrom_Course_Info'

# Ensure client is initialized and connected to Weaviate


# Iterate over the first 5 HTML files in the directory
for filepath in sorted(glob.glob(os.path.join(directory, '*.html'))):  # limit to first 300 files
    try:
        with open(filepath, 'r') as file:
            raw_html = str(file.read())
    except Exception as e:
        print(f"Failed to read file {filepath}. Error: {e}")
        continue
        
    # Get URL from the filename
    url = str(os.path.splitext(os.path.basename(filepath))[0]).replace('-', '/')
    
    # Unique id for the webpage
    webpage_id = str(weaviate.util.generate_uuid5(url))

    # Create a Webpage object
    webpage_obj = {
        "class": "Webpage",
        "uuid": webpage_id,  # Unique id for the webpage
        "url_id": webpage_id,
        "url": url,
        "raw_html": raw_html
    }
    

    # Add the object to Weaviate
    try:
        client.data_object.create(webpage_obj, "Webpage", uuid=webpage_id)
    except Exception as e:
        print(f"Failed to create Webpage object. Error: {e}")
        continue
    

    # Clean and chunk the HTML content
    # Ensure clean_and_chunk_html function is defined
    text_chunks = clean_and_chunk_html(raw_html)

    # For each chunk, create a TextContent object
    for chunk in text_chunks:
        # Unique id for the text content
        text_id = str(uuid.uuid4())

        text_content_obj = {
            "class": "TextContent",
            "uuid": text_id,  # Unique id for the text content
            "text_id": text_id,  # Unique id for the text content
            "cleanText": chunk
        }

        # Add the object to Weaviate
        try:
            client.data_object.create(text_content_obj, "TextContent", uuid=text_id)
        except Exception as e:
            print(f"Failed to create TextContent object. Error: {e}")
            continue
        

        # Give Weaviate some time to process the new object
        time.sleep(21)  # 1 second delay, adjust as necessary

        # Link the text content to the webpage
        
        client.data_object.reference.add(
            from_class_name="Webpage", 
            from_uuid=webpage_id, 
            from_property_name="hasTextContent", 
            to_class_name="TextContent",
            to_uuid=text_content_obj["text_id"]
        )

        client.data_object.reference.add(
            from_class_name="TextContent",
            from_uuid=text_content_obj["text_id"],
            from_property_name="hasWebpage",
            to_class_name="Webpage",
            to_uuid=webpage_id,
        )


Failed to create Webpage object. Error: 45a185c2-149f-5f46-b57a-af04c19714ad
Failed to create Webpage object. Error: 6cb824da-e943-537d-a040-621f90c79343
Failed to create Webpage object. Error: 3e085800-ae40-5da5-98c6-18a5e1f1e607
Failed to create Webpage object. Error: 982cba6d-39cd-5d71-9c7a-764049243ded
Failed to create Webpage object. Error: 720fa6b4-2542-5c74-85bc-d95c14f39872
Failed to create Webpage object. Error: bff8e377-6df8-559d-ad03-f3f1ee355b1c
Failed to create Webpage object. Error: caaaebd3-092b-545d-925a-158e30dbcdda
Failed to create Webpage object. Error: 0ed1ced2-c92f-5e35-9b21-80c8379d5d63
Failed to create Webpage object. Error: 5757cdde-c97d-563c-9a0d-f4fe5b9b616a
Failed to create Webpage object. Error: 1319d0f2-18db-5aa6-8111-000b0cbfefeb
Failed to create Webpage object. Error: c3769124-0228-5f3f-afbb-e8ec58540e77
Failed to create Webpage object. Error: 0e378395-569f-5c6b-a5db-d210a51d5461
Failed to create Webpage object. Error: b57b414c-a136-5f1e-aded-1c6fc5c13617