In [12]:
import weaviate
from weaviate.classes.config import Property, DataType, Configure, ReferenceProperty
from sentence_transformers import SentenceTransformer
from yaml import safe_load
from sys import path
import os
# NOTE(review): machine-specific absolute path. The `modules.SourceManager` import on the
# next line presumably relies on it, so point it at your local `src` directory before running.
path.append(r'C:\Users\Izogie\Desktop\Folders\Projects\Python\KB Chat\src')
from modules.SourceManager import SourceManager

  from tqdm.autonotebook import tqdm, trange


In [3]:
# Load API keys from config.yml (e.g. `GOOGLE_API_KEY: <YOUR API TOKEN>`) into environment variables.
# Kept in a separate file for ease of access across projects.
def load_config(file_path="./config.yml"):
    """Copy every top-level entry of a YAML file into ``os.environ``.

    Args:
        file_path: Path of the YAML file to read. Defaults to ``./config.yml``.

    Raises:
        OSError: If the file cannot be opened.

    Notes:
        * An empty file is treated as "nothing to load" instead of crashing.
        * Non-string scalars (e.g. a numeric value) are converted with ``str()``
          because ``os.environ`` only accepts strings.
        * Entries whose value is null are skipped, so a missing key surfaces as
          a ``KeyError`` where it is read rather than as the string ``"None"``.
    """
    with open(file_path, 'r', encoding="utf-8") as file:
        config = safe_load(file)

    # The loader returns None for an empty document; there is nothing to export then.
    if config is None:
        return

    for key, value in config.items():
        if value is None:
            continue
        os.environ[str(key)] = str(value)

# Export the keys from the default ./config.yml so the cells below can read them from os.environ.
load_config()

In [4]:
# Request headers carrying the API keys read from the environment.
# The Google key is sent under three different header names.
header = {
    header_name: os.environ[env_var]
    for header_name, env_var in (
        ("X-Google-Api-Key", "GOOGLE_API_KEY"),
        ("X-Google-Studio-Api-Key", "GOOGLE_API_KEY"),
        ("X-PaLM-Api-Key", "GOOGLE_API_KEY"),
        ("X-Huggingface-Api-Key", "HF_API_KEY"),
        ("X-OpenAI-Api-Key", "OPENAI_API_KEY"),
    )
}

In [5]:
def _shared_module_config():
    """Build the class-level ``moduleConfig`` used by both collections.

    A new dict is returned on every call so the two class definitions never
    share mutable state.
    """
    return {
        # This key has to name the same module as the class's "vectorizer"
        # entry ("text2vec-huggingface"); the Paragraph class previously used
        # "text2vec-openai" here, so its model/options targeted the wrong module.
        "text2vec-huggingface": {
            "model": "sentence-transformers/all-MiniLM-L6-v2",
            "options": {
                "waitForModel": True,
                "useGPU": True,
                "useCache": True,
            },
        },
        "generative-palm": {
            "projectId": "Generative Language Client",
            "apiEndpoint": "generativelanguage.googleapis.com",
            "modelId": "gemini-1.5-flash-latest",
        },
    }


def _huggingface_property_config(skip, vectorize_property_name=None):
    """Build a property-level ``moduleConfig`` for the huggingface vectorizer.

    ``vectorizePropertyName`` is only emitted when a value is given, so a
    property can carry just the ``skip`` flag.
    """
    settings = {"skip": skip}
    if vectorize_property_name is not None:
        settings["vectorizePropertyName"] = vectorize_property_name
    return {"text2vec-huggingface": settings}


def create_weaviate_schema(client, reset=True):
    """Create the ``Paragraph`` and ``Article`` collections and link them.

    ``Paragraph`` is created first because ``Article`` references it. After
    both exist, an ``inArticle`` reference from ``Paragraph`` back to
    ``Article`` is added.

    Args:
        client: Weaviate client; only ``client.collections`` is used
            (``delete_all``, ``create_from_dict`` and ``get``).
        reset: When true (the default, matching the previous behaviour) every
            existing collection is deleted before the schema is created.
            This is destructive: it removes ALL collections, not only these two.
    """
    if reset:
        client.collections.delete_all()

    schema_paragraph = {
        "class": "Paragraph",
        "description": "a paragraph with a header and and parent Article",
        "vectorizer": "text2vec-huggingface",
        "moduleConfig": _shared_module_config(),
        "properties": [
            {
                "dataType": ["text"],
                "description": "Title/header of the pargraph",
                "name": "title",
                "indexInverted": True,
                "moduleConfig": _huggingface_property_config(False, True),
            },
            {
                "dataType": ["text"],
                "description": "paragraph content",
                "name": "content",
                "indexInverted": True,
                "moduleConfig": _huggingface_property_config(False, False),
            },
            {
                "dataType": ["int"],
                "description": "Order of the paragraph",
                "name": "order",
                "indexInverted": True,
                # The ordering number is excluded from vectorization.
                "moduleConfig": _huggingface_property_config(True, False),
            },
        ],
    }

    schema_article = {
        "class": "Article",
        "description": "A Coppermind article with a title and references",
        "vectorizer": "text2vec-huggingface",
        "moduleConfig": _shared_module_config(),
        "properties": [
            {
                "name": "title",
                "description": "Title of the article",
                # NOTE(review): "string" is deprecated in recent Weaviate releases in
                # favour of "text"; left unchanged because tokenization may differ - confirm.
                "dataType": ["string"],
                "indexInverted": True,
                "moduleConfig": _huggingface_property_config(False),
            },
            {
                "name": "hasParagraphs",
                "description": "List of paragraphs from the article",
                "dataType": ["Paragraph"],
            },
            {
                "name": "linksToArticles",
                "description": "Cross-references from the article",
                "dataType": ["Article"],
            },
        ],
    }

    # Order matters: Article's reference properties point at Paragraph.
    for cls in [schema_paragraph, schema_article]:
        client.collections.create_from_dict(cls)

    # The back-reference can only be added once the Article collection exists.
    para = client.collections.get("Paragraph")
    para.config.add_reference(
        ReferenceProperty(name="inArticle",
                          target_collection="Article"))
    
    

In [24]:
# Rebuild the schema and print the resulting "Paragraph" collection configuration.
with weaviate.connect_to_local(headers=header) as client:
    if not client.is_connected(): client.connect()
    # create_weaviate_schema() already deletes every existing collection by default,
    # so no separate delete_all() call is needed here.
    create_weaviate_schema(client)
    tsts = client.collections.get("Paragraph")
    print(tsts)

<weaviate.Collection config={
  "name": "Paragraph",
  "description": "a paragraph with a header and and parent Article",
  "generative_config": {
    "generative": "generative-palm",
    "model": {
      "apiEndpoint": "generativelanguage.googleapis.com",
      "modelId": "gemini-1.5-flash-latest",
      "projectId": "Generative Language Client"
    }
  },
  "inverted_index_config": {
    "bm25": {
      "b": 0.75,
      "k1": 1.2
    },
    "cleanup_interval_seconds": 60,
    "index_null_state": false,
    "index_property_length": false,
    "index_timestamps": false,
    "stopwords": {
      "preset": "en",
      "additions": null,
      "removals": null
    }
  },
  "multi_tenancy_config": {
    "enabled": false,
    "auto_tenant_creation": false,
    "auto_tenant_activation": false
  },
  "properties": [
    {
      "name": "title",
      "description": "Title/header of the pargraph",
      "data_type": "text",
      "index_filterable": true,
      "index_searchable": true,
      

In [16]:
# Load the article records consumed by the ingestion cell further down.
# Each record is read there via the "title" and "sections" keys.
manager = SourceManager()
articles = manager.load_json("articles.jsonl")

In [17]:
# Show which headers are configured without echoing the API keys into the saved cell output.
# NOTE(review): any key that was ever displayed by this cell should be treated as leaked and rotated.
{name: "<redacted>" for name in header}

{'X-Google-Api-Key': '<redacted>',
 'X-Google-Studio-Api-Key': '<redacted>',
 'X-PaLM-Api-Key': '<redacted>',
 'X-Huggingface-Api-Key': '<redacted>',
 'X-OpenAI-Api-Key': '<redacted>'}

In [25]:
from weaviate.util import generate_uuid5


# Embed articles and their paragraphs locally, insert them, and link them in both directions.
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
with weaviate.connect_to_local(headers=header) as client:
    if not client.is_connected(): client.connect()
    col_articles = client.collections.get("Article")
    col_paragraphs = client.collections.get("Paragraph")
    # Prototype limits: only the first article and its first five sections are ingested.
    for art in articles[:1]:
        art_properties = {
            "title": art["title"],
        }
        # The UUID is derived from the properties, so the same article always maps to the same id.
        art_uuid = generate_uuid5(art_properties)
        vector = model.encode(art["title"])
        col_articles.data.insert(uuid=art_uuid, properties=art_properties, vector=vector)

        if art['sections'] is None:
            print(f"No sections: {art['title']}")
            continue

        for para in art['sections'][:5]:
            para_properties = {
                "title": para["title"],
                "content": para["content"],
                "order": para["order"],
            }
            para_uuid = generate_uuid5(para_properties)
            # The "Paragraph" class is defined with a single class-level vectorizer and no
            # named vectors, so one embedding per object is supplied (title and content
            # together) instead of a {'title': ..., 'content': ...} named-vector mapping.
            para_text = "\n".join(
                part
                for part in (para_properties["title"], para_properties["content"])
                if part
            )
            para_vector = model.encode(para_text)
            col_paragraphs.data.insert(
                uuid=para_uuid,
                properties=para_properties,
                vector=para_vector,
            )
            # Two-way reference: paragraph -> article and article -> paragraph.
            col_paragraphs.data.reference_add(
                from_uuid=para_uuid,
                from_property="inArticle",
                to=art_uuid
                )
            col_articles.data.reference_add(
                from_uuid=art_uuid,
                from_property="hasParagraphs",
                to=para_uuid
                )



AttributeError: 'Collection' object has no attribute 'add_reference'

In [None]:
# Sanity check: print the UUID and properties of every object stored in the "Article" collection.
with weaviate.connect_to_local(headers=header) as client:
    if not client.is_connected(): client.connect()
    collection = client.collections.get("Article")
    for item in collection.iterator():
        print(item.uuid, item.properties)

59f21e54-f2ee-4f07-81d6-45acf0b2556f {'title': 'Kaladin'}


In [136]:
from sentence_transformers import SentenceTransformer
# Stand-alone check of the embedding model: encode two sample sentences.
sentences = ["This is an example sentence", "Each sentence is converted"]

# Same model name as the one used by the ingestion cell.
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
embeddings = model.encode(sentences)

  from tqdm.autonotebook import tqdm, trange


In [137]:
# Display the embeddings computed in the previous cell.
embeddings

array([[ 6.76569119e-02,  6.34959862e-02,  4.87131625e-02,
         7.93049634e-02,  3.74480709e-02,  2.65275245e-03,
         3.93749885e-02, -7.09843030e-03,  5.93614168e-02,
         3.15370075e-02,  6.00980520e-02, -5.29052801e-02,
         4.06067595e-02, -2.59308498e-02,  2.98428256e-02,
         1.12689065e-03,  7.35148787e-02, -5.03818244e-02,
        -1.22386575e-01,  2.37028543e-02,  2.97265109e-02,
         4.24768552e-02,  2.56337635e-02,  1.99514860e-03,
        -5.69190569e-02, -2.71598138e-02, -3.29035595e-02,
         6.60249069e-02,  1.19007170e-01, -4.58791293e-02,
        -7.26214573e-02, -3.25840563e-02,  5.23413755e-02,
         4.50553223e-02,  8.25305190e-03,  3.67024280e-02,
        -1.39415143e-02,  6.53918609e-02, -2.64272187e-02,
         2.06402605e-04, -1.36643695e-02, -3.62810344e-02,
        -1.95043758e-02, -2.89738402e-02,  3.94270197e-02,
        -8.84090811e-02,  2.62421113e-03,  1.36713935e-02,
         4.83062640e-02, -3.11566275e-02, -1.17329195e-0