In [1]:
import weaviate
from weaviate.classes.config import Property, DataType, Configure, ReferenceProperty
import json
from sys import path
path.append(r'C:\Users\Izogie\Desktop\Folders\Projects\Python\KB Chat\src')
from modules.SourceManager import SourceManager

In [None]:
schema = {
    "classes":[
        {
            "class": "Article",
            "description": "A Coppermind article with a title and crefs",
            "vectorizer": "none",
            "vectorIndexConfig": {
                "skip": True
            },
            "properties": [
                {
                    "dataType": ["text"],
                    "description": "Title of the article",
                    "name": "title",
                    "indexInverted": True
                },
                {
                    "dataType": ["Paragraph"],
                    "description": "List of paragraphs form the article",
                    "name": "paragraphs",
                    "indexInverted": True
                },
                {
                    "dataType": ["Article"],
                    "description": "Cross-references from the article",
                    "name": "links",
                    "indexInverted": True
                },
            ]
        },
        {
            "class": "Paragraph",
            "description": "a paragraph with a header and and parent Article",
            "vectorizer": "text2vec-palm",
            "vectorIndexConfig": {
                "vectorCacheMaxObjects": 150000000000,
                "ef": 256,
                "efConstruction": 512,
                "maxConnections": 128
            },
            "properties": [
                {
                    "dataType": ["text"],
                    "description": "Title/header of the pargraph",
                    "name": "title",
                    "indexInverted": True,
                    "moduleConfig": {
                        "text2vec-transformers": {
                            "skip": True,
                            "vectorizePropertyName": False,
                        }
                    }
                },
                {
                    "dataType": ["text"],
                    "description": "paragraph content",
                    "name": "content",
                    "indexInverted": True,
                    "moduleConfig": {
                        "text2vec-palm": {
                            "skip": False,
                            "vectorizePropertyName": False,
                        }
                    }
                },
                {
                    "dataType": ["int"],
                    "description": "Order of the paragraph",
                    "name": "order",
                    "indexInverted": True,
                    "moduleConfig": {
                        "text2vec-palm": {
                            "skip": True,
                            "vectorizePropertyName": False,
                        }
                    }
                },
                {
                    "dataType": ["Article"],
                    "description": "Article this paragraph is in",
                    "name": "inArticle",
                    "moduleConfig": {
                        "text2vec-palm": {
                            "skip": True,
                            "vectorizePropertyName": False,
                        }
                    }
                }
            ]
        }
    ]
}

In [9]:
class_obj = {
  "class": "Paragraph",
  "description": "A paragraph from the Coppermind wiki",
  "vectorizer": "text2vec-palm",
  "properties": [
    {
      "dataType": ["string"],
      "description": "Title of the section this paragraph is in",
      "name": "section_title"
    },
    {
      "dataType": ["text"],
      "description": "Content of the paragraph",
      "name": "content"
    },
    {
      "dataType": ["string"],
      "description": "Page this paragraph is from",
      "name": "page_title"
    }
  ]
}

with weaviate.connect_to_local() as client:
  client.collections.create_from_dict(class_obj)

In [33]:
manager = SourceManager()
articles = manager.load_json("articles.jsonl")
with weaviate.connect_to_local() as client:
    create_weaviate_schema(client)
    

In [8]:
def create_weaviate_schema(client):
    # Delete all existing classes (optional, for a fresh start)
    client.collections.delete_all()

    schema_paragraph = {
                "class": "Paragraph",
                "description": "a paragraph with a header and and parent Article",
                "properties": [
                    {
                        "dataType": ["text"],
                        "description": "Title/header of the pargraph",
                        "name": "title",
                        "indexInverted": True,
                        "moduleConfig": {
                            "text2vec-transformers": {
                                "skip": True,
                                "vectorizePropertyName": False,
                            }
                        }
                    },
                    {
                        "dataType": ["text"],
                        "description": "paragraph content",
                        "name": "content",
                        "indexInverted": True,
                        "moduleConfig": {
                            "text2vec-palm": {
                                "skip": False,
                                "vectorizePropertyName": False,
                            }
                        }
                    },
                    {
                        "dataType": ["int"],
                        "description": "Order of the paragraph",
                        "name": "order",
                        "indexInverted": True,
                        "moduleConfig": {
                            "text2vec-palm": {
                                "skip": True,
                                "vectorizePropertyName": False,
                            }
                        }
                    },
                ]
            }
    schema_article = {
                "class": "Article",
                "description": "A Coppermind article with a title and references",
                "vectorizer": "text2vec-palm",  # Added vectorizer to enable search
                # "moduleConfig": {
                #     "text2vec-palm": {
                #         "vectorizeClassName": False
                #     }
                # },
                "properties": [
                    {
                        "name": "title",
                        "description": "Title of the article",
                        "dataType": ["string"],
                        "indexInverted": True,
                        "moduleConfig":{"text2vec-palm":{"skip": True}}
                    },
                    # {
                    #     "name": "hasParagraphs",
                    #     "description": "List of paragraphs from the article",
                    #     "dataType": ["Paragraph"],
                    # },
                    # {
                    #     "name": "linksToArticles",
                    #     "description": "Cross-references from the article",
                    #     "dataType": ["Article"],
                    # }
                ]
            }
    for cls in [schema_paragraph]:
        client.collections.create_from_dict(cls)
    
    # paragraphs = client.collections.get("Paragrah")
    # prop_para_parent = {
    #     "dataType": ["Article"],
    #     "description": "Article this paragraph is in",
    #     "name": "parent",
    #     "moduleConfig": {
    #         "text2vec-palm": {
    #             "skip": True,
    #             "vectorizePropertyName": False,
    #         }
    #     }
    # }
    # paragraphs

In [9]:
with weaviate.connect_to_local() as client:
    client.connect()
    # client.schema.get()
    create_weaviate_schema(client)
    paragraphs = client.collections.get("Paragrah")
    # print(client.get_meta())
    # paragraphs.config.get()
    # client.collections.export_config("Article")
#     client.collections.get("Article")
#     prop_para_parent = {
#         "dataType": ["Article"],
#         "description": "Article this paragraph is in",
#         "name": "parent",
#         "moduleConfig": {
#             "text2vec-palm": {
#                 "skip": True,
#                 "vectorizePropertyName": False,
#             }
#         }
#     }
    print(paragraphs)
    # paragraphs.config.add_reference(
    #     ReferenceProperty(name="inArticle",
    #                       target_collection="Article")
    # )
    

UnexpectedStatusCodeError: Collection configuration could not be retrieved.! Unexpected status code: 404, with response body: None.

In [10]:
with weaviate.connect_to_local() as client:
    client.collections.create(
        name="Test",
        vectorizer_config=Configure.NamedVectors.text2vec_palm,
        properties=[
            Property(
                name="title",
                data_type=DataType.TEXT,
                description="Title/header of the paragraph",
                index_inverted=True
            ),
            Property(
                name="content",
                data_type=DataType.TEXT,
                description="Paragraph content"
            ),
            Property(
                name="order",
                data_type=DataType.INT,
                description="Order of the paragraph"
            )
        ],
        vector_index_config={
            "vectorCacheMaxObjects": 150000000000,
            "ef": 256,
            "efConstruction": 512,
            "maxConnections": 128
        }
    )

ValidationError: 4 validation errors for _CollectionConfigCreate
vector_index_config.distance
  Field required [type=missing, input_value={'vectorCacheMaxObjects':..., 'maxConnections': 128}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.6/v/missing
vector_index_config.quantizer
  Field required [type=missing, input_value={'vectorCacheMaxObjects':..., 'maxConnections': 128}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.6/v/missing
vectorizer_config._VectorizerConfigCreate
  Input should be a valid dictionary or instance of _VectorizerConfigCreate [type=model_type, input_value=<function _NamedVectors.t...m at 0x000001A2FDF20040>, input_type=function]
    For further information visit https://errors.pydantic.dev/2.6/v/model_type
vectorizer_config.list[_NamedVectorConfigCreate]
  Input should be a valid list [type=list_type, input_value=<function _NamedVectors.t...m at 0x000001A2FDF20040>, input_type=function]
    For further information visit https://errors.pydantic.dev/2.6/v/list_type