In [1]:

from dotenv import dotenv_values
env = dotenv_values()

import weaviate
import weaviate.classes as wvc
import os
import json

# Rebuild the Weaviate schema from schema.json.
# WARNING: this deletes all existing collections before recreating them.

# Parse the schema before connecting so that a missing or malformed file
# fails fast, before a connection is opened or anything is deleted.
with open("schema.json", "r", encoding="utf-8") as f:
    schemas = json.load(f)

wv_headers = {
    "X-OpenAI-Api-key": env["OPENAI_API_KEY"]
}
weaviate_client = weaviate.connect_to_local(headers=wv_headers)
try:
    weaviate_client.collections.delete_all()
    for schema in schemas["classes"]:
        try:
            weaviate_client.collections.create_from_dict(schema)
        except Exception as error:
            # Best effort: report the failure and continue with the remaining classes.
            print(f"Failed to create {schema['class']}")
            print(error)
finally:
    # Always release the connection, even if delete_all() or the loop raises.
    weaviate_client.close()


In [2]:

import os
import json
from dotenv import dotenv_values
env = dotenv_values()

import weaviate
import weaviate.classes as wvc
from typing import List, Dict
from langchain_community.document_loaders.text import TextLoader
from langchain_community.document_loaders.directory import DirectoryLoader
from langchain_text_splitters import MarkdownHeaderTextSplitter, MarkdownTextSplitter
from langchain.docstore.document import Document

# (markdown marker, metadata key) pairs for heading levels 1 through 6,
# e.g. ("##", "Header 2"). Passed to MarkdownHeaderTextSplitter below.
headers_to_split_on = [
    ("#" * level, f"Header {level}")
    for level in range(1, 7)
]

def source_title_parser(source: str) -> str:
    """Return the file name of ``source`` without its directory or last extension.

    Example: ``"data/content/My Scheme.md"`` -> ``"My Scheme"``.
    """
    filename = os.path.basename(source)
    stem, _extension = os.path.splitext(filename)
    return stem

def mdhsplitter_reformatter(doc: Document) -> str:
    """Rebuild a chunk's markdown by putting its headers back above the content.

    For every (marker, metadata key) pair in ``headers_to_split_on`` whose key
    is present in ``doc.metadata``, a ``"<marker> <header text>"`` line is
    emitted. Lines are ordered from the shallowest heading to the deepest, so
    the result reads like the original document:

        # Title
        ## Section
        <page_content>

    (Previously the headers were prepended one by one while iterating from
    Header 1 to Header 6, which left the deepest heading on top.)

    Args:
        doc: A chunk whose ``metadata`` may contain "Header N" keys and whose
            ``page_content`` is the chunk body.

    Returns:
        The header lines (each terminated by a newline) followed by the
        chunk's ``page_content``. If no header key is present, the content is
        returned unchanged.
    """
    header_lines = [
        f"{marker} {doc.metadata[header_name]}\n"
        for marker, header_name in headers_to_split_on
        if header_name in doc.metadata
    ]
    return "".join(header_lines) + doc.page_content

# Loader for the files under data/content; each file is read with TextLoader.
dir_loader = DirectoryLoader(path="data/content", loader_cls=TextLoader)

# Splits markdown on the heading markers listed in headers_to_split_on.
# strip_headers is left at the library default (an explicit
# strip_headers=False was tried and left disabled).
mdhsplitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)

# Ingest the loaded documents into Weaviate twice: split into header-delimited
# chunks ("scheme_chunks") and as whole documents ("whole_scheme").

# Load from disk before connecting so a loader failure cannot leak a connection.
docs = dir_loader.load()

wv_headers = {
    "X-OpenAI-Api-key": env["OPENAI_API_KEY"]
}
weaviate_client = weaviate.connect_to_local(headers=wv_headers)
try:
    collection = weaviate_client.collections.get("scheme_chunks")
    with collection.batch.dynamic() as batch:
        for doc in docs:
            doc_chunks: List[Document] = mdhsplitter.split_text(doc.page_content)
            title = source_title_parser(doc.metadata["source"])
            for chunk in doc_chunks:
                text: str = mdhsplitter_reformatter(chunk)
                batch.add_object(
                    properties={
                        "content_chunks": text,
                        "title": title,
                        "source": doc.metadata["source"],
                    }
                )
    # Batch imports record per-object failures on the collection rather than
    # raising (see the Weaviate batch error-handling docs), so surface them.
    failed_chunks = collection.batch.failed_objects
    if failed_chunks:
        print(f"Failed to import {len(failed_chunks)} objects into scheme_chunks")

    collection = weaviate_client.collections.get("whole_scheme")
    with collection.batch.dynamic() as batch:
        for doc in docs:
            batch.add_object(
                properties={
                    "content": doc.page_content,
                    "title": source_title_parser(doc.metadata["source"]),
                    "source": doc.metadata["source"],
                }
            )
    failed_docs = collection.batch.failed_objects
    if failed_docs:
        print(f"Failed to import {len(failed_docs)} objects into whole_scheme")
finally:
    # Always release the connection, even if splitting or a batch raises.
    weaviate_client.close()


In [29]:

import weaviate
import weaviate.classes as wvc
import os
import json

# NOTE(review): weaviate.Client is the older v3-style client (the captured
# output of this cell recommends upgrading to v4); the earlier cells already
# use weaviate.connect_to_local. Consider migrating or removing this cell.
weaviate_client = weaviate.Client("http://localhost:8080")

# Load the schema definition that is pushed to Weaviate below.
with open("schema.json", "r") as schema_file:
    schema = json.load(schema_file)

print(type(schema))
# schema = {
#     "classes": [
#         {
#             "class": "Schemes",
#             "description": "Singapore Government Schemes in chunks",
#             "vectorizer": "text2vec-openai",
#             "moduleConfig": {
#                 "text2vec-openai": {
#                     "model": "ada",
#                     "modelVersion": "002",
#                     "type": "text"
#                 }
#             },
#             "properties": [
#                 {
#                     "name": "content_chunks",
#                     "dataType": ["text"],
#                     "description": "Scheme content"
#                 },
#                 {
#                     "name": "title",
#                     "dataType": ["text"],
#                     "description": "Title of the scheme",
#                     "moduleConfig": {
#                         "text2vec-openai": {
#                             "skip": True
#                         }
#                     }
#                 },
#                 {
#                     "name": "source",
#                     "dataType": ["text"],
#                     "description": "URL source of scheme",
#                     "moduleConfig": {
#                         "text2vec-openai": {
#                             "skip": True
#                         }
#                     }
#                 }
#             ]
#         }
#     ]
# }

# Replace whatever schema the instance currently has with the one loaded
# from schema.json.
weaviate_client.schema.delete_all()
weaviate_client.schema.create(schema)

# Read the schema back and report how many classes now exist.
results = weaviate_client.schema.get()
class_count = len(results["classes"])
print(class_count)


<class 'dict'>
1


            Consider upgrading to the new and improved v4 client instead!
            See here for usage: https://weaviate.io/developers/weaviate/client-libraries/python
            
