# Create BU information Weaviate database

### What objects are utilized?

1. Webpage objects:

        A Webpage object represents a single webpage and contains an identifier, the URL of the webpage, and the raw HTML content of that webpage.
        
2. TextContent objects:

        A TextContent object represents a single text chunk from its corresponding Webpage object. The text has been converted from HTML to readable text. The cleanText property is the only one that has a text embedding.


Each Webpage object references the list of TextContent objects that hold the chunks of its text content, and each TextContent object references the Webpage it was taken from.




In [1]:
import weaviate
import json
import os
import time
import html2text
import textwrap
import glob
import uuid


# Connection settings. The API keys are read from the environment so that
# credentials never live in source control; an unset variable yields an empty
# string, which lets the module import but will be rejected when connecting.
# NOTE(review): keys that were previously committed in this file should be
# treated as compromised and rotated.
WEAV_CLUSTER_URL = os.environ.get("WEAV_CLUSTER_URL", "https://bu-cluster-2-o5pekqq0.weaviate.network")
WEAV_API_KEY = os.environ.get("WEAV_API_KEY", "")
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "")

# Directory holding the scraped *.html files that are loaded into Weaviate.
DIRECTORY = "/workspaces/BU_Chatbot/Questrom_Course_Info"


# helper function to print json in a pretty way
def prettify(json_dict: dict) -> None:
    """
    Pretty-print a JSON-compatible dictionary to standard output.

    Args:
        json_dict: The dictionary to serialize; it is rendered with a
            two-space indent
    """
    formatted = json.dumps(json_dict, indent=2)
    print(formatted)


def connect_to_weaviate(weav_cluster_url: str, weav_api_key: str, openAI_api_key: str) -> weaviate.Client:
    """
    Build a Weaviate client that authenticates with an API key.

    Args:
        weav_cluster_url: The URL of the Weaviate cluster
        weav_api_key: The API key used to authenticate against the cluster
        openAI_api_key: The OpenAI API key, passed along in the
            "X-OpenAI-Api-Key" request header

    Returns:
        The configured Weaviate client
    """
    credentials = weaviate.AuthApiKey(api_key=weav_api_key)
    extra_headers = {"X-OpenAI-Api-Key": openAI_api_key}

    return weaviate.Client(
        url=weav_cluster_url,
        auth_client_secret=credentials,
        additional_headers=extra_headers,
    )


def delete_all_classes(weaviate_client: weaviate.Client) -> None:
    """
    Remove every class from the schema of the given Weaviate instance.

    Args:
        weaviate_client: A Weaviate client
    """
    schema = weaviate_client.schema
    schema.delete_all()  # wipes all classes in one call


def webpage_class_schema() -> dict:
    """
    Build the schema definition for the Webpage class.

    Returns:
        A dictionary with the class name, its description, and its four
        properties: webpage_id, url, raw_html, and the hasTextContent
        reference to TextContent objects
    """
    # (property name, data type, description) for each Webpage property
    property_specs = (
        ("webpage_id", "uuid", "The id of the webpage"),
        ("url", "string", "The url of the webpage"),
        ("raw_html", "string", "The raw html of the webpage"),
        ("hasTextContent", "TextContent", "The text chunks of the webpage"),
    )

    properties = [
        {"name": name, "dataType": [data_type], "description": description}
        for name, data_type, description in property_specs
    ]

    return {
        "class": "Webpage",
        "description": "A webpage",
        "properties": properties,
    }


def text_content_class_schema() -> dict:
    """
    Build the schema definition for the TextContent class.

    The class uses the text2vec-openai vectorizer. Of its three properties
    only cleanText has vectorization enabled; text_id and the hasWebpage
    reference are marked to be skipped.

    Returns:
        A dictionary with the class name, description, vectorizer, and
        property definitions
    """
    vectorizer_name = "text2vec-openai"  # For OpenAI

    def vectorizer_settings(skip: bool) -> dict:
        # The key must match the vectorizer configured on the class.
        return {
            vectorizer_name: {
                'skip': skip,
                'vectorizePropertyName': False
            }
        }

    text_id_property = {
        "name": "text_id",
        "dataType": ["uuid"],
        "description": "The id of the text chunk",
        "moduleConfig": vectorizer_settings(True),
    }
    clean_text_property = {
        "name": "cleanText",
        "dataType": ["text"],
        "description": "The cleaned text",
        "moduleConfig": vectorizer_settings(False),
    }
    webpage_reference_property = {
        "name": "hasWebpage",
        "dataType": ["Webpage"],
        "description": "The webpage this text chunk belongs to",
        "moduleConfig": vectorizer_settings(True),
    }

    return {
        "class": "TextContent",
        "description": "A chunk of cleaned and readable text from a given webpage",
        "vectorizer": vectorizer_name,
        "properties": [
            text_id_property,
            clean_text_property,
            webpage_reference_property,
        ],
    }


def _message_has(exception: Exception, *fragments: str) -> bool:
    """Return True when every fragment occurs in the exception's message."""
    message = str(exception)
    return all(fragment in message for fragment in fragments)


def _create_class_tolerantly(weaviate_client: weaviate.Client, class_schema: dict, class_name: str, reference_property: str) -> None:
    """Create one class, tolerating a dangling cross-reference or an existing class."""
    try:
        weaviate_client.schema.create_class(class_schema)
    except Exception as exception:
        # The reference property points at a class that does not exist yet.
        # This is expected for the first class of a mutually-referencing pair.
        if _message_has(exception, f"property '{reference_property}'", "reference property to nonexistent class"):
            return
        if _message_has(exception, f'class name "{class_name}" already exists'):
            print("Class already exists")
            return
        raise  # Anything else is a genuine failure


def create_classes(weaviate_client: weaviate.Client, webpage_class: dict, text_content_class: dict) -> None:
    """
    This function creates the Webpage and TextContent classes in Weaviate.

    The two classes reference each other, so creating Webpage first may be
    rejected because TextContent does not exist yet. That rejection is
    tolerated, TextContent is created next, and finally the hasTextContent
    reference is added to Webpage explicitly.

    Tolerated server errors are recognised by characteristic fragments of
    their message rather than by the complete message text, so differences in
    the client's surrounding wording do not turn a benign condition into a
    crash.

    Args:
        weaviate_client: A Weaviate client
        webpage_class: The schema for the Webpage class
        text_content_class: The schema for the TextContent class

    Raises:
        Exception: Any error from the Weaviate client other than the
            tolerated "nonexistent reference class", "class already exists"
            and "property already in use" conditions
    """

    _create_class_tolerantly(weaviate_client, webpage_class, "Webpage", "hasTextContent")
    _create_class_tolerantly(weaviate_client, text_content_class, "TextContent", "hasWebpage")

    # Now that TextContent exists, attach the reference property to Webpage.
    try:
        weaviate_client.schema.property.create(
            "Webpage", {
                "name": "hasTextContent",
                "dataType": ["TextContent"],
            }
        )
    except Exception as exception:
        # The property is already present (e.g. both classes existed up front).
        if not _message_has(exception, '"hasTextContent"', "already in use or provided multiple times"):
            raise


def clean_and_chunk_html(raw_html: str, max_length: int=3000) -> list:
    """
    Convert raw HTML into readable text and split that text into chunks.

    Args:
        raw_html: The raw HTML of a webpage
        max_length: The maximum length of each chunk

    Returns:
        The list of text chunks produced by textwrap.wrap
    """

    converter = html2text.HTML2Text()
    converter.ignore_links = True  # configure the converter to ignore links
    readable_text = converter.handle(raw_html)
    return textwrap.wrap(readable_text, width=max_length)


def populate_bu_info_weav_db(weaviate_client: weaviate.Client, directory: str, failure_backoff_seconds: float = 900) -> list:
    """
    This function populates the BU information Weaviate database.

    Every ``*.html`` file in ``directory`` (in sorted path order) becomes one
    Webpage object plus one TextContent object per text chunk, and every chunk
    is cross-referenced with its webpage in both directions. Failures are
    printed and processing continues with the next file or chunk.

    Args:
        weaviate_client: A Weaviate client
        directory: The directory containing the HTML files
        failure_backoff_seconds: How long to pause after a TextContent object
            could not be created before moving on to the next chunk

    Returns:
        The URLs of the webpages for which the Webpage object, a TextContent
        object, or a cross-reference could not be created. Each URL is listed
        at most once, in the order the first failure occurred.
    """

    failed_webpage_urls = []

    def record_failure(failed_url: str) -> None:
        # Report each page once, even when several of its chunks fail.
        if failed_url not in failed_webpage_urls:
            failed_webpage_urls.append(failed_url)

    # Iterate over every HTML file in the directory
    for filepath in sorted(glob.glob(os.path.join(directory, '*.html'))):
        try:
            # Explicit encoding so the result does not depend on the platform default.
            with open(filepath, 'r', encoding='utf-8') as file:
                raw_html = file.read()
        except (OSError, UnicodeError) as exception:
            print(f"Failed to read file {filepath}. Error: {exception}")
            continue

        # Get URL from the filename: underscores in the file stem stand for slashes.
        # NOTE(review): a URL that itself contained '_' cannot be recovered
        # this way — confirm against whatever wrote these files.
        url = os.path.splitext(os.path.basename(filepath))[0].replace('_', '/')

        # Deterministic id derived from the page content, so identical HTML
        # always maps to the same Webpage id.
        webpage_id = str(weaviate.util.generate_uuid5(raw_html))

        # Create a Webpage object
        # NOTE(review): "class" and "uuid" are sent as part of the property
        # payload although the Webpage schema does not declare them; they are
        # kept to preserve the stored data — confirm whether they are needed.
        webpage_obj = {
            "class": "Webpage",
            "uuid": webpage_id,  # Unique id for the webpage
            "webpage_id": webpage_id,
            "url": url,
            "raw_html": raw_html
        }

        # Add the object to Weaviate
        try:
            weaviate_client.data_object.create(webpage_obj, "Webpage", uuid=webpage_id)
            print(f"Created Webpage object for {url}")
        except Exception as exception:
            record_failure(url)
            print(f"Failed to create Webpage object. Error: {exception} url: {url}")
            continue

        # Clean and chunk the HTML content
        text_chunks = clean_and_chunk_html(raw_html)

        # For each chunk, create a TextContent object
        for chunk in text_chunks:
            # Unique id for the text content
            text_id = str(uuid.uuid4())

            # NOTE(review): as with the Webpage payload, "class" and "uuid"
            # are not declared in the TextContent schema — confirm.
            text_content_obj = {
                "class": "TextContent",
                "uuid": text_id,  # Unique id for the text content
                "text_id": text_id,  # Unique id for the text content
                "cleanText": chunk
            }

            # Add the object to Weaviate
            try:
                weaviate_client.data_object.create(text_content_obj, "TextContent", uuid=text_id)
                print(f"Created TextContent object for {url}")
            except Exception as exception:
                record_failure(url)
                print(f"Failed to create TextContent object. Error: {exception}. On URL: {url}")

                # Back off before the next chunk; presumably the failure is an
                # OpenAI rate limit hit while embedding — TODO confirm.
                if failure_backoff_seconds > 0:
                    time.sleep(failure_backoff_seconds)
                continue

            # Crossreference the text content to the webpage (both directions)
            try:
                weaviate_client.data_object.reference.add(
                    from_class_name="Webpage",
                    from_uuid=webpage_id,
                    from_property_name="hasTextContent",
                    to_class_name="TextContent",
                    to_uuid=text_id
                )

                weaviate_client.data_object.reference.add(
                    from_class_name="TextContent",
                    from_uuid=text_id,
                    from_property_name="hasWebpage",
                    to_class_name="Webpage",
                    to_uuid=webpage_id,
                )
            except Exception as exception:
                # An unlinked chunk leaves the page incomplete, so report it too.
                record_failure(url)
                print(f"Failed to crossreference TextContent object. Error: {exception}. On URL: {url}")
                continue

    return failed_webpage_urls


def build_db(weaviate_client: weaviate.Client, directory: str, webpage_class: dict, text_content_class: dict) -> list:
    """
    This function builds the BU information Weaviate database.

    It first creates the Webpage and TextContent classes and then loads the
    HTML files from the directory into them.

    Args:
        weaviate_client: A Weaviate client
        directory: The directory containing the HTML files
        webpage_class: The schema for the Webpage class
        text_content_class: The schema for the TextContent class

    Returns:
        The list of failed webpage URLs reported by populate_bu_info_weav_db
    """

    create_classes(weaviate_client, webpage_class, text_content_class)
    failed_webpage_urls = populate_bu_info_weav_db(weaviate_client=weaviate_client, directory=directory)

    # Hand the failures back to the caller, as the return annotation promises.
    return failed_webpage_urls


# Script entry point. As written it only connects to the cluster and wipes the
# schema; the steps that rebuild the database are commented out below.
if __name__ == "__main__":

    # Connect to Weaviate using the module-level connection settings
    weav_client = connect_to_weaviate(weav_cluster_url=WEAV_CLUSTER_URL, weav_api_key=WEAV_API_KEY, openAI_api_key=OPENAI_API_KEY)

    # Delete all classes in Weaviate.
    # WARNING: this is destructive and runs unconditionally on every execution.
    delete_all_classes(weaviate_client=weav_client)

    # Disabled: define the Webpage and TextContent classes.
    # NOTE: if re-enabled as written, these assignments rebind the names of the
    # schema functions to the dictionaries they return.
    # webpage_class_schema = webpage_class_schema()
    # text_content_class_schema = text_content_class_schema()


    # # Disabled: build the BU information Weaviate database
    # failed_webpage_urls = build_db(weaviate_client=weav_client, directory=DIRECTORY, webpage_class=webpage_class_schema, text_content_class=text_content_class_schema)

    # # Disabled: print the failed webpage URLs
    # print(failed_webpage_urls)