In [1]:
import weaviate
from weaviate.classes.config import Property, DataType, Configure, ReferenceProperty
from weaviate.util import generate_uuid5
from sentence_transformers import SentenceTransformer
from weaviate.classes.query import QueryReference
from yaml import safe_load
import json
import os
from sys import path
path.append(r'C:\Users\Izogie\Desktop\Folders\Projects\Python\KB Chat\src')
from modules.SourceManager import SourceManager

  from tqdm.autonotebook import tqdm, trange


In [2]:
# Load API key from config.yaml (`GOOGLE_API_KEY: <YOUR API TOKEN>`) into environment variable.
# Kept in a seperate file for ease of access across projects.
from yaml import safe_load
def load_config(file_path="./config.yml"):
    with open(file_path, 'r') as file:
        config = safe_load(file)
        for key, value in config.items():
            os.environ[key] = value
load_config()

In [3]:
header = {
    "X-OpenAI-Api-Key": os.environ["OPENAI_API_KEY"],
}

In [30]:
def create_weaviate_schema_openai(client):
    # Delete all existing classes (optional, for a fresh start)
    client.collections.delete_all()

    schema_paragraph = {
                "class": "Paragraph",
                "description": "a paragraph with a header and and parent Article",
                "vectorizer": "none",
                "moduleConfig": {
                    "text2vec-openai": {
                        "model": "text-embedding-3-large",
                    },

                },
                "properties": [
                    {
                        "dataType": ["text"],
                        "description": "Title/header of the pargraph",
                        "name": "title",
                        "indexInverted": True,
                        "moduleConfig": {
                            "text2vec-openai": {
                                "skip": False,
                                "vectorizePropertyName": False,
                            }
                        }
                    },
                    {
                        "dataType": ["text"],
                        "description": "paragraph content",
                        "name": "content",
                        "indexInverted": True,
                        "moduleConfig": {
                            "text2vec-openai": {
                                "skip": False,
                                "vectorizePropertyName": False,
                            }
                        }
                    },
                    {
                        "dataType": ["int"],
                        "description": "Order of the paragraph",
                        "name": "order",
                        "indexInverted": True,
                        "moduleConfig": {
                            "text2vec-openai": {
                                "skip": True,
                                "vectorizePropertyName": False,
                            }
                        }
                    },
                ]
            }
    schema_article = {
                "class": "Article",
                "description": "A Coppermind article with a title and references",
                "vectorizer": "none",
                "moduleConfig": {
                    "text2vec-openai": {
                        "model": "text-embedding-3-large",
                    },
                },
                "properties": [
                    {
                        "name": "title",
                        "description": "Title of the article",
                        "dataType": ["string"],
                        "indexInverted": True,
                        "moduleConfig":{"text2vec-openai":{"skip": False}}
                    },
                    {
                        "name": "hasParagraphs",
                        "description": "List of paragraphs from the article",
                        "dataType": ["Paragraph"],
                    },
                    {
                        "name": "linksToArticles",
                        "description": "Cross-references from the article",
                        "dataType": ["Article"],
                    }
                ]
            }
    for cls in [schema_paragraph,schema_article]:
        client.collections.create_from_dict(cls)
    
    paragraphs = client.collections.get("Paragraph")
    paragraphs.config.add_reference(
        ReferenceProperty(
            name="inArticle",
            target_collection="Article"
        )
    )
    # prop_para_parent = {
    #     "dataType": ["Article"],
    #     "description": "Article this paragraph is in",
    #     "name": "parent",
    #     "moduleConfig": {
    #         "text2vec-palm": {
    #             "skip": True,
    #             "vectorizePropertyName": False,
    #         }
    #     }
    # }
    # paragraphs

In [14]:
def show_collection_refs(client, collection_name, ref_name, return_properties):
    query_col = client.collections.get(collection_name)
    response = query_col.query.fetch_objects(
        return_references=[
            QueryReference(link_on=ref_name, return_properties=return_properties)])

    for o in response.objects:
        print(o.properties["title"])
        if ref_name in o.references:
            for ref_obj in o.references[ref_name].objects:
                print(ref_obj.properties)
        else: continue

In [8]:
def fresh_start(client):
    client.collections.delete_all()
    create_weaviate_schema_openai(client)

In [9]:
def insert_without_links(client, articles):
    model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

    col_articles = client.collections.get("Article")
    col_paragraphs = client.collections.get("Paragraph")
    # Add without cross refs first
    for art in articles:
        art_properties = {
            "title": art["title"],
        }
        art_uuid = generate_uuid5(art_properties)
        vector = model.encode(art["title"])
        col_articles.data.insert(uuid=art_uuid, properties=art_properties, vector=vector)
            
        # print(col_articles.data.insert(properties=art_properties))
        if art['sections'] is None:
            print(f"No sections: {art['title']}")
        else:
            for para in art['sections']:
                para_properties={
                    "title": para["title"],
                    "content": para["content"],
                    "order": para["order"],
                }
                para_uuid = generate_uuid5(para_properties)
                title_vector = model.encode(para_properties['title'])
                content_vector = model.encode(para_properties['content'])
                col_paragraphs.data.insert(
                    uuid=para_uuid, 
                    properties=para_properties, 
                    vector={
                        'title': title_vector,
                        'content': content_vector
                    })
                # make two way reference
                col_paragraphs.data.reference_add(
                    from_uuid=para_uuid,
                    from_property="inArticle",
                    to=art_uuid
                    )
                col_articles.data.reference_add(
                    from_uuid=art_uuid,
                    from_property="hasParagraphs",
                    to=para_uuid
                    )



# Initial tests

In [31]:
with weaviate.connect_to_local(headers=header) as client:
    if not client.is_connected(): client.connect()
    fresh_start(client)
    tsts = client.collections.get("Paragraph")
    print(tsts)

<weaviate.Collection config={
  "name": "Paragraph",
  "description": "a paragraph with a header and and parent Article",
  "generative_config": null,
  "inverted_index_config": {
    "bm25": {
      "b": 0.75,
      "k1": 1.2
    },
    "cleanup_interval_seconds": 60,
    "index_null_state": false,
    "index_property_length": false,
    "index_timestamps": false,
    "stopwords": {
      "preset": "en",
      "additions": null,
      "removals": null
    }
  },
  "multi_tenancy_config": {
    "enabled": false,
    "auto_tenant_creation": false,
    "auto_tenant_activation": false
  },
  "properties": [
    {
      "name": "title",
      "description": "Title/header of the pargraph",
      "data_type": "text",
      "index_filterable": true,
      "index_searchable": true,
      "nested_properties": null,
      "tokenization": "word",
      "vectorizer_config": null,
      "vectorizer": "none"
    },
    {
      "name": "content",
      "description": "paragraph content",
      "data_

In [32]:
manager = SourceManager()
data = manager.load_json("articles.jsonl")

In [27]:
page_titles = [
    "Kaladin",
    "Pits of Hathsin",
    "Allomancy",
    "Cosmere",
    "Honor's Perpendicularity",
    "Cephandrius",
    "Hoid"
]
data = manager.prep_data_graph(page_titles)

MWClient Connected with coppermind.net
Processing Kaladin
Processing Pits of Hathsin
Processing Allomancy
Processing Cosmere
Processing Honor's Perpendicularity
Processing Cephandrius
Processing Hoid
articles.jsonl saved
articles.jsonl saved


In [33]:
data[-1]

{'title': 'Hoid',
 'links': ['ref-wob-8909',
  'coppermind',
  'Nightblood',
  'Cognitive Realm',
  'ref-wob-8498',
  'ref-wob-8185',
  'Vivenna',
  'ref-epi-sa4-25',
  'ref-wob-5016',
  'Rayse',
  'Whistlebow',
  'Cryptic',
  'ref-wob-8343',
  'Night Brigade',
  'ref-wob-10439',
  'Harmony',
  'ref-book-Yumi-part=anotherepilogue',
  'ref-wob-9465',
  'ref-wob-7600',
  'ref-book-sa2-46',
  'Shardplate',
  'Mistborn',
  'Jasnah Kholin',
  'Illumination',
  'ref-book-sa2-55',
  'ref-wob-10630',
  'ref-book-twok-epilogue',
  'ref-wob-7323',
  'ref-wob-2567',
  'Jah Keved',
  'ref-wob-15354',
  'ref-wob-12011',
  'First of the Sun',
  'Synod',
  'Liyun',
  'ref-book-Yumi-32',
  'sprouter',
  'Vessel',
  'ref-wob-3437',
  'ref-wob-1253',
  "Devotion's Perpendicularity",
  'ref-book-wok-57',
  'Realmatic Theory',
  'ref-wob-8899',
  'Ruthar',
  'ref-book-sa1-i-1',
  'Nikaro',
  'Diem',
  'Cognitive Shadow',
  'visions',
  'Bavadin',
  'Silverlight Mercantile',
  'Dalinar Kholin',
  'Obrodai'

In [34]:
with weaviate.connect_to_local(headers=header) as client:
    if not client.is_connected(): client.connect()
    fresh_start(client) # delete old entries. Start fresh
    
    col_articles = client.collections.get("Article")
    col_paragraphs = client.collections.get("Paragraph")
    # Add without cross refs first
    for art in data:
        art_properties = {
            "title": art["title"],
        }
        art_uuid = generate_uuid5(art_properties)
        col_articles.data.insert(uuid=art_uuid, properties=art_properties)
        
        # print(col_articles.data.insert(properties=art_properties))
        if art['sections'] is None:
            print(f"No sections: {art['title']}")
        else:
            for para in art['sections']:
                para_properties={
                    "title": para["title"],
                    "content": para["content"],
                    "order": para["order"],
                }
                para_uuid = generate_uuid5(para_properties)
                col_paragraphs.data.insert(
                    uuid=para_uuid, 
                    properties=para_properties, 
                    )
                # make two way reference
                col_paragraphs.data.reference_add(
                    from_uuid=para_uuid,
                    from_property="inArticle",
                    to=art_uuid
                    )
                col_articles.data.reference_add(
                    from_uuid=art_uuid,
                    from_property="hasParagraphs",
                    to=para_uuid
                    )



No sections: Cephandrius


In [15]:
from weaviate.classes.query import QueryReference
with weaviate.connect_to_local(headers=header) as client:
    if not client.is_connected(): client.connect()
    # show_collection_refs(client, "Article", "hasParagraphs", "title")
    collection_name = "Article"
    ref_name = "hasParagraphs"
    return_properties ="title"
    show_collection_refs(client, collection_name=collection_name, ref_name=ref_name, return_properties=return_properties)

Tanavast
{'title': 'Tanavast'}
{'title': 'Appearance'}
{'title': 'Personality'}
{'title': 'Personality'}
{'title': 'Personality'}
{'title': 'Personality'}
{'title': 'Personality'}
{'title': 'Attributes and Abilities'}
{'title': 'Before the Shattering'}
{'title': 'Before the Shattering'}
{'title': 'The Shattering'}
Urithiru
{'title': 'Urithiru'}
{'title': 'Location'}
{'title': 'Location'}
{'title': 'Location'}
{'title': 'Location'}
{'title': 'Appearance'}
{'title': 'Appearance'}
{'title': 'Appearance'}
{'title': 'Appearance'}
{'title': 'Appearance'}
{'title': 'Appearance'}
{'title': 'Appearance'}
{'title': 'Appearance'}
{'title': 'Appearance'}
{'title': 'Appearance'}
{'title': 'Appearance'}
{'title': 'Bathhouse where Vedekar Perel\'s body is found: "spiraled, twisting around the floor, ceiling, and walls like the threads of a screw"; "curving, twisting medley of oranges, reds, and browns—ballooning out across the sides of this chamber in wide bands before coiling back into narrow stripe

In [45]:
with weaviate.connect_to_local(headers=header) as client:
    if not client.is_connected(): client.connect()
    collection = client.collections.get("Paragraph")
    for item in collection.query.fetch_objects(include_vector=True).objects:
        print(item.uuid, item.vector)

00d70d1e-123e-54d9-ae76-4677bba99527 {}
0108d10d-2b11-5aff-a14b-20c6b5fb1e65 {}
010f6c5d-a9fd-5abc-84a4-0d2e820a9f6d {}
017fbdc5-b5ac-5aa0-bf8c-f0368abf3447 {}
021d834d-29b3-57a2-87d9-2bcab18c09b8 {}
028b6a36-b6d6-5651-b78d-5ab66037b659 {}
029b38b6-cc2f-5da2-a2d3-2cb5b2d8ef57 {}
03f5b553-4577-5c5a-b377-e256555a7fc3 {}
04c96616-7b3f-50d5-9c8b-54c45282cd10 {}
05b67c39-07b5-59b2-9c0b-37c628a4015d {}
065a34fa-e27d-5d61-8b22-5d17192b6a2b {}
06605842-1709-5e9d-8a4b-314a96ff11a1 {}
06898f00-2f9d-569e-9caa-00c412c00e67 {}
072ae238-c1cf-5ba7-8c43-3917c6a0e3fa {}
08946032-1811-5f87-8096-ea09960c3d2c {}
094b7716-ad36-5a88-8596-3d7a0928acfd {}
096f5ee0-18d3-5a34-8a3a-397182e70935 {}
0a6b0791-4171-50a1-929e-3faf63545c1b {}
0a873a1c-b4de-5e6c-be6e-87f9afe1e1fc {}
0a9ac45f-f876-5dc9-9434-79bf8f1f4a77 {}
0b205c75-f335-597f-9e3e-012759b3e898 {}
0bf21660-86c4-5a03-b397-0a8674fc75f8 {}
0cccdbf6-547a-5e32-8927-d9343f305213 {}
0e03e883-20b0-5846-af3a-3a498a9d2b67 {}
0e80f98c-4772-524d-bb93-5fa5e2e1844a {}


In [37]:
??client.collections.get()

Object `client.collections.get()` not found.


# Intake without batching
- Insert articles
- Insert paragraphs
- Insert cross references

In [17]:
manager = SourceManager()

In [18]:
with weaviate.connect_to_local() as client:
    if not client.is_connected(): client.connect()
    fresh_start(client)
    data = manager.load_json("processed_articles.jsonl")
    insert_without_links(client, data)
    

No sections: Cephandrius
No sections: Surgebinder
No sections: Shadesmar


## Insert references to existing pages

In [19]:
# data = manager.load_json("processed_articles.jsonl")
with weaviate.connect_to_local() as client:
    if not client.is_connected(): client.connect()   
    articles = client.collections.get("Article")
    response = articles.query.fetch_objects()
    master_articles = {}
    for o in response.objects:
        master_articles.update({o.properties["title"]: o.uuid})
    for art in data:
        art1_uuid = master_articles[art['title']]
        for link in art["links"]:
            art2_uuid = master_articles[link]
            articles.data.reference_add(
                from_uuid=art1_uuid,
                from_property="linksToArticles",
                to=art2_uuid
                )

In [20]:

with weaviate.connect_to_local() as client:
    if not client.is_connected(): client.connect()   
    articles = client.collections.get("Article")
    response = articles.query.fetch_objects(
        return_references=[
            QueryReference(link_on="linksToArticles", return_properties="title")])

    for o in response.objects[:3]:
        if "linksToArticles" in o.references:
            for refs in o.references["linksToArticles"].objects:
                print(f"{o.properties["title"]}: {refs.properties["title"]}")

Tanavast: Cosmere
Tanavast: Odium
Tanavast: Stormfather
Tanavast: Dalinar Kholin
Tanavast: Hoid
Urithiru: Cosmere
Urithiru: Odium
Urithiru: Dalinar Kholin
Urithiru: Kaladin
Urithiru: Surgebinder
Urithiru: Shadesmar
Dalinar Kholin: Stormfather
Dalinar Kholin: Surgebinder
Dalinar Kholin: Shadesmar
Dalinar Kholin: Odium
Dalinar Kholin: Honor's Perpendicularity
Dalinar Kholin: Kaladin
Dalinar Kholin: Hoid
Dalinar Kholin: Urithiru
Dalinar Kholin: Fused
Dalinar Kholin: Cosmere
Dalinar Kholin: Knights Radiant


# Querying
- Vector
- Hybrid (vector/keyword)

In [21]:
from weaviate.classes.query import MetadataQuery

query = "iron"

In [22]:
with weaviate.connect_to_local() as client:
    if not client.is_connected(): client.connect()   
    col_paragraphs = client.collections.get("Paragraph")
    response = col_paragraphs.query.hybrid(
        query=query, 
        return_metadata=MetadataQuery(score=True, explain_score=True),
        limit = 10)

WeaviateQueryError: Query call with protocol GRPC search failed with message VectorFromInput was called without vectorizer on class Paragraph for input iron.

In [23]:
for o in response.objects:
    print(o.properties['content'])
    print(o.metadata.score, o.metadata.explain_score)

KeyError: 'content'

In [75]:
with weaviate.connect_to_local() as client:
    if not client.is_connected(): client.connect()   
    col_paragraphs = client.collections.get("Paragraph")
    alphas = [0, 0.1, 0.25, 0.5, .75, 0.9, 1]
    results = []
    for alpha in alphas:
        response = col_paragraphs.query.hybrid(
            query=query, 
            vector=query_vector,
            alpha=alpha,
            target_vector="content",
            return_metadata=MetadataQuery(score=True, explain_score=True),
            limit = 10)
        if len(response.objects) > 0:
            o = response.objects[0]
            results.append({"alpha": alpha,
                            "score": o.metadata.score,
                            "content": o.properties["content"]})
for res in results:
    print(res, "\n")

{'alpha': 0, 'score': 1.0, 'content': 'Awakened objects see similar to Steel Inquisitors do via Iron and Steel lines. Awakened objects can identify the Intent of people, and can choose not to perform actions based on that.'} 

{'alpha': 0.1, 'score': 0.8999999761581421, 'content': 'Awakened objects see similar to Steel Inquisitors do via Iron and Steel lines. Awakened objects can identify the Intent of people, and can choose not to perform actions based on that.'} 

{'alpha': 0.25, 'score': 0.75, 'content': 'Awakened objects see similar to Steel Inquisitors do via Iron and Steel lines. Awakened objects can identify the Intent of people, and can choose not to perform actions based on that.'} 

{'alpha': 0.5, 'score': 0.5, 'content': 'Awakened objects see similar to Steel Inquisitors do via Iron and Steel lines. Awakened objects can identify the Intent of people, and can choose not to perform actions based on that.'} 

{'alpha': 0.75, 'score': 0.25, 'content': 'Awakened objects see simil

In [76]:
with weaviate.connect_to_local() as client:
    if not client.is_connected(): client.connect()   
    col_paragraphs = client.collections.get("Article")
    alphas = [0, 0.1, 0.25, 0.5, .75, 0.9, 1]
    results = []
    for alpha in alphas:
        response = col_paragraphs.query.hybrid(
            query=query, 
            vector=query_vector,
            alpha=alpha,
            # target_vector="content",
            return_metadata=MetadataQuery(score=True, explain_score=True),
            limit = 10)
        if len(response.objects) > 0:
            o = response.objects[0]
            results.append({"alpha": alpha,
                            "score": o.metadata.score,
                            # "content": o.properties["content"],
                            "content": o.properties["title"]
                            })
for res in results:
    print(res, "\n")

{'alpha': 0.1, 'score': 0.10000000149011612, 'content': 'Fused'} 

{'alpha': 0.25, 'score': 0.25, 'content': 'Fused'} 

{'alpha': 0.5, 'score': 0.5, 'content': 'Fused'} 

{'alpha': 0.75, 'score': 0.75, 'content': 'Fused'} 

{'alpha': 0.9, 'score': 0.8999999761581421, 'content': 'Fused'} 

{'alpha': 1, 'score': 1.0, 'content': 'Fused'} 



In [77]:
with weaviate.connect_to_local() as client:
    if not client.is_connected(): client.connect()   
    col_paragraphs = client.collections.get("Article")
    results = []
    response = col_paragraphs.query.near_vector(
        near_vector=query_vector,
        # target_vector="content",
        return_metadata=MetadataQuery(score=True, explain_score=True),
        limit = 10)
    if len(response.objects) > 0:
        o = response.objects[0]
        results.append({"score": o.metadata.score,
                        # "content": o.properties["content"],
                        "content": o.properties["title"]
                        })
for res in results:
    print(res, "\n")

{'score': 0.0, 'content': 'Fused'} 



In [78]:
with weaviate.connect_to_local() as client:
    if not client.is_connected(): client.connect()   
    col_paragraphs = client.collections.get("Paragraph")
    results = []
    response = col_paragraphs.query.near_vector(
        near_vector=query_vector,
        # target_vector="content",
        return_metadata=MetadataQuery(score=True, explain_score=True),
        limit = 10)
    if len(response.objects) > 0:
        o = response.objects[0]
        results.append({
                        "score": o.metadata.score,
                        "content": o.properties["content"],
                        # "content": o.properties["title"]
                        })
for res in results:
    print(res, "\n")