In [29]:
import weaviate
from weaviate.classes.config import Property, DataType, Configure, ReferenceProperty
from weaviate.util import generate_uuid5
from sentence_transformers import SentenceTransformer
from weaviate.classes.query import QueryReference
import json
from sys import path
import os
from sys import path
path.append(r'C:\Users\Izogie\Desktop\Folders\Projects\Python\KB Chat\src')
from modules.SourceManager import SourceManager

In [2]:
# Dict-style Weaviate schema with two classes:
#   * "Article"   - not vectorized ("vectorizer": "none", vector index skipped).
#   * "Paragraph" - vectorized with the "text2vec-palm" module.
# Per-property "moduleConfig" entries must be keyed by the class's vectorizer
# module; an entry under any other module name does not affect vectorization.
# NOTE(review): no cell visible in this notebook reads `schema` -
# create_weaviate_schema() builds its own definitions. Confirm it is still needed.
schema = {
    "classes":[
        {
            "class": "Article",
            "description": "A Coppermind article with a title and crefs",
            "vectorizer": "none",
            "vectorIndexConfig": {
                "skip": True
            },
            "properties": [
                {
                    "dataType": ["text"],
                    "description": "Title of the article",
                    "name": "title",
                    "indexInverted": True
                },
                {
                    "dataType": ["Paragraph"],
                    "description": "List of paragraphs form the article",
                    "name": "paragraphs",
                    "indexInverted": True
                },
                {
                    "dataType": ["Article"],
                    "description": "Cross-references from the article",
                    "name": "links",
                    "indexInverted": True
                },
            ]
        },
        {
            "class": "Paragraph",
            "description": "a paragraph with a header and and parent Article",
            "vectorizer": "text2vec-palm",
            "vectorIndexConfig": {
                "vectorCacheMaxObjects": 150000000000,
                "ef": 256,
                "efConstruction": 512,
                "maxConnections": 128
            },
            "properties": [
                {
                    "dataType": ["text"],
                    "description": "Title/header of the pargraph",
                    "name": "title",
                    "indexInverted": True,
                    "moduleConfig": {
                        # Was keyed "text2vec-transformers", which does not match
                        # this class's vectorizer, so the skip flag was not applied.
                        "text2vec-palm": {
                            "skip": True,
                            "vectorizePropertyName": False,
                        }
                    }
                },
                {
                    "dataType": ["text"],
                    "description": "paragraph content",
                    "name": "content",
                    "indexInverted": True,
                    "moduleConfig": {
                        "text2vec-palm": {
                            "skip": False,
                            "vectorizePropertyName": False,
                        }
                    }
                },
                {
                    "dataType": ["int"],
                    "description": "Order of the paragraph",
                    "name": "order",
                    "indexInverted": True,
                    "moduleConfig": {
                        "text2vec-palm": {
                            "skip": True,
                            "vectorizePropertyName": False,
                        }
                    }
                },
                {
                    "dataType": ["Article"],
                    "description": "Article this paragraph is in",
                    "name": "inArticle",
                    "moduleConfig": {
                        "text2vec-palm": {
                            "skip": True,
                            "vectorizePropertyName": False,
                        }
                    }
                }
            ]
        }
    ]
}

In [15]:
def create_weaviate_schema(client):
    """Reset the Weaviate instance and create the ``Paragraph`` and ``Article`` collections.

    Every existing collection is deleted first. Both collections are declared
    with ``"vectorizer": "none"``. ``Paragraph`` is created before ``Article``
    (whose ``hasParagraphs`` property points at it), and the reverse reference
    ``Paragraph.inArticle`` is added last, once ``Article`` exists.

    Args:
        client: Weaviate client exposing ``collections.delete_all``,
            ``collections.create_from_dict`` and ``collections.get``.
    """
    # Single source of truth for the module name used as the moduleConfig key.
    # The "order" property previously used "text2vec-palm", which disagreed
    # with the rest of this schema.
    module_name = "text2vec-huggingface"

    def class_module_config():
        # Build a fresh dict per class so the two definitions share no mutable state.
        return {
            module_name: {
                "model": "sentence-transformers/all-MiniLM-L6-v2",
                "options": {
                    "waitForModel": True,
                    "useGPU": True,
                    "useCache": True
                },
            },
        }

    def skipped_property_config():
        # Per-property config: exclude the property (and its name) from vectorization.
        return {
            module_name: {
                "skip": True,
                "vectorizePropertyName": False,
            }
        }

    # Delete all existing classes (optional, for a fresh start)
    client.collections.delete_all()

    schema_paragraph = {
        "class": "Paragraph",
        "description": "a paragraph with a header and and parent Article",
        "vectorizer": "none",
        "moduleConfig": class_module_config(),
        "properties": [
            {
                "dataType": ["text"],
                "description": "Title/header of the pargraph",
                "name": "title",
                "indexInverted": True,
                "moduleConfig": skipped_property_config(),
            },
            {
                "dataType": ["text"],
                "description": "paragraph content",
                "name": "content",
                "indexInverted": True,
                "moduleConfig": skipped_property_config(),
            },
            {
                "dataType": ["int"],
                "description": "Order of the paragraph",
                "name": "order",
                "indexInverted": True,
                "moduleConfig": skipped_property_config(),
            },
        ]
    }
    schema_article = {
        "class": "Article",
        "description": "A Coppermind article with a title and references",
        "vectorizer": "none",
        "moduleConfig": class_module_config(),
        "properties": [
            {
                "name": "title",
                "description": "Title of the article",
                # NOTE(review): "string" is deprecated in newer Weaviate releases in
                # favour of "text"; tokenization may differ, so confirm before changing.
                "dataType": ["string"],
                "indexInverted": True,
                "moduleConfig": {module_name: {"skip": True}}
            },
            {
                "name": "hasParagraphs",
                "description": "List of paragraphs from the article",
                "dataType": ["Paragraph"],
            },
            {
                "name": "linksToArticles",
                "description": "Cross-references from the article",
                "dataType": ["Article"],
            }
        ]
    }
    for cls in [schema_paragraph, schema_article]:
        client.collections.create_from_dict(cls)

    # Paragraph -> Article back-reference, added after both collections exist.
    paragraphs = client.collections.get("Paragraph")
    paragraphs.config.add_reference(
        ReferenceProperty(
            name="inArticle",
            target_collection="Article"
        )
    )

In [4]:
def show_collection_refs(client, collection_name, ref_name, return_proprties):
    """Print each object's title followed by the properties of its referenced objects.

    Args:
        client: Weaviate client used to look up the collection.
        collection_name: Name of the collection to query.
        ref_name: Reference property to follow on each object.
        return_proprties: Properties to return for every referenced object
            (passed through to ``QueryReference``).
    """
    collection = client.collections.get(collection_name)
    reference_request = QueryReference(
        link_on=ref_name,
        return_properties=return_proprties,
    )
    result = collection.query.fetch_objects(return_references=[reference_request])

    for source in result.objects:
        print(source.properties["title"])
        # Objects without this reference only get their title printed.
        if ref_name not in source.references:
            continue
        for target in source.references[ref_name].objects:
            print(target.properties)

In [5]:
def fresh_start(client):
    """Delete every collection on the server, then recreate the notebook's schema."""
    collections = client.collections
    collections.delete_all()
    create_weaviate_schema(client)

In [6]:
def insert_without_links(client, articles, model=None):
    """Insert articles and their paragraphs, wiring the two-way paragraph/article references.

    Article-to-article cross references ("linksToArticles") are not written here.

    Args:
        client: Weaviate client; the "Article" and "Paragraph" collections
            must already exist.
        articles: Iterable of dicts with a "title" and an optional "sections"
            list whose items carry "title", "content" and "order".
        model: Optional pre-loaded encoder exposing ``encode(text)``. When
            omitted, ``sentence-transformers/all-MiniLM-L6-v2`` is loaded, so
            callers inserting several batches can load it once and reuse it.
    """
    if model is None:
        model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

    col_articles = client.collections.get("Article")
    col_paragraphs = client.collections.get("Paragraph")

    # Add without cross refs first
    for art in articles:
        art_properties = {
            "title": art["title"],
        }
        art_uuid = generate_uuid5(art_properties)
        col_articles.data.insert(
            uuid=art_uuid,
            properties=art_properties,
            vector=model.encode(art["title"]),
        )

        # A missing "sections" key is treated like an explicit None.
        sections = art.get("sections")
        if sections is None:
            print(f"No sections: {art['title']}")
            continue

        for para in sections:
            para_properties = {
                "title": para["title"],
                "content": para["content"],
                "order": para["order"],
            }
            # Seed the UUID with the parent article's UUID: two articles can
            # contain a section with identical title/content/order, and those
            # would otherwise map to the same id and collide on insert.
            para_uuid = generate_uuid5(para_properties, namespace=art_uuid)
            # NOTE(review): a dict here is sent as named vectors ("title",
            # "content"); create_weaviate_schema() declares no named vectors -
            # confirm the server stores these as intended.
            col_paragraphs.data.insert(
                uuid=para_uuid,
                properties=para_properties,
                vector={
                    'title': model.encode(para_properties['title']),
                    'content': model.encode(para_properties['content']),
                })
            # make two way reference
            col_paragraphs.data.reference_add(
                from_uuid=para_uuid,
                from_property="inArticle",
                to=art_uuid
                )
            col_articles.data.reference_add(
                from_uuid=art_uuid,
                from_property="hasParagraphs",
                to=para_uuid
                )



# Initial tests

In [66]:
# Rebuild the schema from scratch, then print the resulting "Paragraph" collection config.
# NOTE(review): `header` is not defined in any cell visible in this notebook -
# confirm where it is set before relying on Restart & Run All.
with weaviate.connect_to_local(headers=header) as client:
    # Defensive reconnect; presumably a no-op because connect_to_local() is
    # expected to hand back a connected client - TODO confirm.
    if not client.is_connected(): client.connect()
    fresh_start(client)
    tsts = client.collections.get("Paragraph")
    print(tsts)

<weaviate.Collection config={
  "name": "Paragraph",
  "description": "a paragraph with a header and and parent Article",
  "generative_config": null,
  "inverted_index_config": {
    "bm25": {
      "b": 0.75,
      "k1": 1.2
    },
    "cleanup_interval_seconds": 60,
    "index_null_state": false,
    "index_property_length": false,
    "index_timestamps": false,
    "stopwords": {
      "preset": "en",
      "additions": null,
      "removals": null
    }
  },
  "multi_tenancy_config": {
    "enabled": false,
    "auto_tenant_creation": false,
    "auto_tenant_activation": false
  },
  "properties": [
    {
      "name": "title",
      "description": "Title/header of the pargraph",
      "data_type": "text",
      "index_filterable": true,
      "index_searchable": true,
      "nested_properties": null,
      "tokenization": "word",
      "vectorizer_config": {
        "skip": false,
        "vectorize_property_name": false
      },
      "vectorizer": "text2vec-palm"
    },
    {
 

In [2]:
# Load previously saved article records from "articles.jsonl" via the project's SourceManager.
# NOTE(review): the record layout is defined by SourceManager; later cells read
# the "title", "sections" and "links" keys from such records.
manager = SourceManager()
articles = manager.load_json("articles.jsonl")

In [3]:
# Page titles to fetch and prepare; the captured output shows the MediaWiki
# client talking to coppermind.net.
page_titles = [
    "Kaladin",
    "Pits of Hathsin",
    "Allomancy",
    "Cosmere",
    "Honor's Perpendicularity",
    "Cephandrius",
    "Hoid"
]
# NOTE(review): _init_mwclient is underscore-prefixed, i.e. presumably private to
# SourceManager - confirm there is no public way to initialise the client.
manager._init_mwclient()
# Build one record per page title and persist the result through the manager.
data = manager.prep_data_graph(page_titles)
manager.save_json(data)

MWClient Connected with coppermind.net
MWClient unable to connect with coppermind.net
Processing Kaladin
Processing Pits of Hathsin
Processing Allomancy
Processing Cosmere
Processing Honor's Perpendicularity
Processing Cephandrius
Processing Hoid
articles.jsonl saved
articles.jsonl saved


In [92]:
# Inspect the last prepared record (its 'title' and 'links' appear in the output below).
data[-1]

{'title': 'Hoid',
 'links': ['Sons of Honor',
  'Knights Radiant',
  'Awakening',
  'Hallandren',
  'sa2-67',
  'tress-epilogue',
  'kandra',
  'Rosharan system',
  'Tears of Edgli',
  'Darkside',
  'Elantris (book)',
  'Bavadin',
  'Allomancy',
  'Wandersail',
  'Shallan Davar',
  'Cognitive Realm',
  "Cultivation's Perpendicularity",
  'Elantrian',
  'Khriss',
  'wob_ref-9218',
  'Tanavast',
  'Yomen-Ostlin wedding dinner',
  '#Known Aliases',
  'wob_ref-2101',
  'Ishikk',
  'Waxillium Ladrian',
  'Mistborn',
  'sa1-epilogue',
  'wob_ref-3759',
  'wob_ref-15271',
  'Vasher',
  'wob_ref-2567',
  'Inquisitor',
  'wob_ref-1523',
  'wob_ref-656',
  'lighteyes',
  'Forger',
  'wob_ref-7135',
  'Cryptic',
  'Nightwatcher',
  'skaze',
  'Yumi-6',
  'wob_ref-3546',
  'sa1-57',
  'wob_ref-14988',
  'wob_ref-3212',
  'mb7-21',
  'tress-15',
  'wob_ref-5246',
  'Rayse',
  'tress-63',
  'Court of the Gods',
  'Obrodai',
  'wob_ref-16289',
  'New Seran',
  'Wan ShaiLu',
  'wob:9518',
  'wob_ref-1

In [81]:
# Reset the database and load every article with its paragraphs.
# This cell used to spell out the article/paragraph inserts and two-way
# references inline; insert_without_links() performs exactly those steps
# (and loads the sentence-transformers model itself), so it is called instead.
# NOTE(review): `header` is not defined in any cell visible in this notebook -
# confirm where it is set.
with weaviate.connect_to_local(headers=header) as client:
    if not client.is_connected(): client.connect()
    fresh_start(client) # delete old entries. Start fresh
    insert_without_links(client, articles)



No sections: Cephandrius


In [88]:
# NOTE(review): QueryReference is already imported in the notebook's first cell;
# this repeated import is redundant.
from weaviate.classes.query import QueryReference
# NOTE(review): `header` is not defined in any cell visible in this notebook.
with weaviate.connect_to_local(headers=header) as client:
    if not client.is_connected(): client.connect()
    # The commented-out call below takes the same values that are assigned
    # by hand underneath it.
    # NOTE(review): the visible lines only assign names and print nothing, yet
    # the captured output lists titles - the cell source appears to be cut
    # short here; confirm against the original notebook.
    # show_collection_refs(client, "Article", "hasParagraphs", "title")
    collection_name = "Article"
    ref_name = "hasParagraphs"
    return_proprties ="title"

Honor's Perpendicularity
{'title': "Honor's Perpendicularity"}
{'title': "Honor's Perpendicularity"}
{'title': 'At Thaylen Field'}
{'title': 'At Thaylen Field'}
{'title': 'At Thaylen Field'}
{'title': 'Trivia'}
Allomancy
{'title': 'Allomancy'}
{'title': 'Mechanics'}
{'title': 'Mechanics'}
{'title': 'Mechanics'}
{'title': 'Mechanics'}
{'title': 'Mechanics'}
{'title': 'Mechanics'}
{'title': 'Gaining Allomantic Abilities'}
{'title': 'Gaining Allomantic Abilities'}
{'title': 'Gaining Allomantic Abilities'}
{'title': 'Snapping'}
{'title': 'Snapping'}
{'title': 'Snapping'}
{'title': 'Snapping'}
{'title': 'Mist Sickness'}
{'title': 'Mist Sickness'}
{'title': 'Mist Sickness'}
{'title': 'Savantism'}
{'title': 'Types of Allomancers'}
{'title': 'Types of Allomancers'}
{'title': 'Twinborn and Compounders'}
{'title': 'Twinborn and Compounders'}
{'title': 'Allomantic Metals'}
{'title': 'Allomantic Metals'}
{'title': 'Allomantic Metals'}
{'title': 'Allomantic Metals'}
{'title': 'God Metals'}
{'title'

In [42]:
# Print the uuid and properties of every object in the "Paragraph" collection.
# NOTE(review): `header` is not defined in any cell visible in this notebook.
with weaviate.connect_to_local(headers=header) as client:
    if not client.is_connected(): client.connect()
    collection = client.collections.get("Paragraph")
    # iterator() walks the collection object by object.
    for item in collection.iterator():
        print(item.uuid, item.properties)

# Intake without batching
- Insert articles
- Insert paragraphs
- Insert cross references

In [7]:
# SourceManager instance used by the intake cells below to load the processed articles.
manager = SourceManager()

In [16]:
# Full non-batched intake: reset the schema, load the processed articles and
# insert them with their paragraphs. Article-to-article links are added in a later cell.
with weaviate.connect_to_local() as client:
    if not client.is_connected(): client.connect()
    fresh_start(client)
    data = manager.load_json("processed_articles.jsonl")
    # Prints "No sections: <title>" for every article whose sections are None.
    insert_without_links(client, data)
    

No sections: Cephandrius
No sections: Surgebinder
No sections: Shadesmar


In [None]:
def insert_without_links(client, articles, model=None):
    """Insert articles and their paragraphs, wiring the two-way paragraph/article references.

    This cell repeats the definition of the same name from earlier in the
    notebook; once executed, this later definition is the one in effect.
    Article-to-article cross references ("linksToArticles") are not written here.

    Args:
        client: Weaviate client; the "Article" and "Paragraph" collections
            must already exist.
        articles: Iterable of dicts with a "title" and an optional "sections"
            list whose items carry "title", "content" and "order".
        model: Optional pre-loaded encoder exposing ``encode(text)``. When
            omitted, ``sentence-transformers/all-MiniLM-L6-v2`` is loaded, so
            callers inserting several batches can load it once and reuse it.
    """
    if model is None:
        model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

    col_articles = client.collections.get("Article")
    col_paragraphs = client.collections.get("Paragraph")

    # Add without cross refs first
    for art in articles:
        art_properties = {
            "title": art["title"],
        }
        art_uuid = generate_uuid5(art_properties)
        col_articles.data.insert(
            uuid=art_uuid,
            properties=art_properties,
            vector=model.encode(art["title"]),
        )

        # A missing "sections" key is treated like an explicit None.
        sections = art.get("sections")
        if sections is None:
            print(f"No sections: {art['title']}")
            continue

        for para in sections:
            para_properties = {
                "title": para["title"],
                "content": para["content"],
                "order": para["order"],
            }
            # Seed the UUID with the parent article's UUID: two articles can
            # contain a section with identical title/content/order, and those
            # would otherwise map to the same id and collide on insert.
            para_uuid = generate_uuid5(para_properties, namespace=art_uuid)
            # NOTE(review): a dict here is sent as named vectors ("title",
            # "content"); create_weaviate_schema() declares no named vectors -
            # confirm the server stores these as intended.
            col_paragraphs.data.insert(
                uuid=para_uuid,
                properties=para_properties,
                vector={
                    'title': model.encode(para_properties['title']),
                    'content': model.encode(para_properties['content']),
                })
            # make two way reference
            col_paragraphs.data.reference_add(
                from_uuid=para_uuid,
                from_property="inArticle",
                to=art_uuid
                )
            col_articles.data.reference_add(
                from_uuid=art_uuid,
                from_property="hasParagraphs",
                to=para_uuid
                )
    


# Insert references to existing pages

In [26]:

# Add Article -> Article cross references ("linksToArticles") between articles
# that are already stored in Weaviate.
data = manager.load_json("processed_articles.jsonl")

with weaviate.connect_to_local() as client:
    if not client.is_connected(): client.connect()
    articles = client.collections.get("Article")

    # Map every stored article title to its uuid. iterator() walks the whole
    # collection, whereas fetch_objects() without a limit returns only a single
    # page of results and would leave later articles out of the map.
    master_articles = {
        stored.properties["title"]: stored.uuid for stored in articles.iterator()
    }

    # Link targets that have no stored article are collected and reported
    # instead of aborting the whole run with a KeyError.
    missing_links = set()
    for art in data:
        art1_uuid = master_articles.get(art["title"])
        if art1_uuid is None:
            print(f"Article not stored, links skipped: {art['title']}")
            continue
        for link in art.get("links") or []:
            art2_uuid = master_articles.get(link)
            if art2_uuid is None:
                missing_links.add(link)
                continue
            articles.data.reference_add(
                from_uuid=art1_uuid,
                from_property="linksToArticles",
                to=art2_uuid
                )
    if missing_links:
        print(f"Skipped {len(missing_links)} link target(s) with no stored article")

In [34]:

# Print every stored "source: target" article cross reference.
with weaviate.connect_to_local() as client:
    if not client.is_connected(): client.connect()
    articles = client.collections.get("Article")
    # NOTE(review): fetch_objects() is called without a limit, so presumably only
    # the server's default page of articles is listed - confirm for larger datasets.
    response = articles.query.fetch_objects(
        return_references=[
            QueryReference(link_on="linksToArticles", return_properties="title")])

    for o in response.objects:
        if "linksToArticles" in o.references:
            for refs in o.references["linksToArticles"].objects:
                # Single quotes inside the f-string: reusing the enclosing double
                # quotes is only valid syntax from Python 3.12 onwards.
                print(f"{o.properties['title']}: {refs.properties['title']}")

Tanavast: Cosmere
Tanavast: Odium
Tanavast: Stormfather
Tanavast: Dalinar Kholin
Tanavast: Hoid
Urithiru: Cosmere
Urithiru: Odium
Urithiru: Dalinar Kholin
Urithiru: Kaladin
Urithiru: Surgebinder
Urithiru: Shadesmar
Dalinar Kholin: Stormfather
Dalinar Kholin: Surgebinder
Dalinar Kholin: Shadesmar
Dalinar Kholin: Odium
Dalinar Kholin: Honor's Perpendicularity
Dalinar Kholin: Kaladin
Dalinar Kholin: Hoid
Dalinar Kholin: Urithiru
Dalinar Kholin: Fused
Dalinar Kholin: Cosmere
Dalinar Kholin: Knights Radiant
Honor's Perpendicularity: Odium
Honor's Perpendicularity: Stormfather
Honor's Perpendicularity: Shadesmar
Honor's Perpendicularity: Surgebinder
Honor's Perpendicularity: Fused
Allomancy: Odium
Allomancy: Kelsier
Allomancy: Realmatic Theory
Allomancy: Hoid
Allomancy: Knights Radiant
Cephandrius: Hoid
Cognitive Shadow: Cosmere
Cognitive Shadow: Odium
Cognitive Shadow: Stormfather
Cognitive Shadow: Kelsier
Cognitive Shadow: Allomancy
Cognitive Shadow: Fused
Cognitive Shadow: Tanavast
Pits o