In [85]:
import weaviate
from weaviate.classes.config import Property, DataType, Configure, ReferenceProperty
from weaviate.util import generate_uuid5
from sentence_transformers import SentenceTransformer
from weaviate.classes.query import QueryReference
from yaml import safe_load
import json
from sys import path
import os
path.append(r'C:\Users\Izogie\Desktop\Folders\Projects\Python\KB Chat\src')
from modules.SourceManager import SourceManager

In [86]:
# Load API key from config.yaml (`GOOGLE_API_KEY: <YOUR API TOKEN>`) into environment variable.
# Kept in a seperate file for ease of access across projects.
def load_config(file_path="./config.yml"):
    with open(file_path, 'r') as file:
        config = safe_load(file)
        for key, value in config.items():
            os.environ[key] = value
load_config()

In [2]:
schema = {
    "classes":[
        {
            "class": "Article",
            "description": "A Coppermind article with a title and crefs",
            "vectorizer": "none",
            "vectorIndexConfig": {
                "skip": True
            },
            "properties": [
                {
                    "dataType": ["text"],
                    "description": "Title of the article",
                    "name": "title",
                    "indexInverted": True
                },
                {
                    "dataType": ["Paragraph"],
                    "description": "List of paragraphs form the article",
                    "name": "paragraphs",
                    "indexInverted": True
                },
                {
                    "dataType": ["Article"],
                    "description": "Cross-references from the article",
                    "name": "links",
                    "indexInverted": True
                },
            ]
        },
        {
            "class": "Paragraph",
            "description": "a paragraph with a header and and parent Article",
            "vectorizer": "text2vec-palm",
            "vectorIndexConfig": {
                "vectorCacheMaxObjects": 150000000000,
                "ef": 256,
                "efConstruction": 512,
                "maxConnections": 128
            },
            "properties": [
                {
                    "dataType": ["text"],
                    "description": "Title/header of the pargraph",
                    "name": "title",
                    "indexInverted": True,
                    "moduleConfig": {
                        "text2vec-transformers": {
                            "skip": True,
                            "vectorizePropertyName": False,
                        }
                    }
                },
                {
                    "dataType": ["text"],
                    "description": "paragraph content",
                    "name": "content",
                    "indexInverted": True,
                    "moduleConfig": {
                        "text2vec-palm": {
                            "skip": False,
                            "vectorizePropertyName": False,
                        }
                    }
                },
                {
                    "dataType": ["int"],
                    "description": "Order of the paragraph",
                    "name": "order",
                    "indexInverted": True,
                    "moduleConfig": {
                        "text2vec-palm": {
                            "skip": True,
                            "vectorizePropertyName": False,
                        }
                    }
                },
                {
                    "dataType": ["Article"],
                    "description": "Article this paragraph is in",
                    "name": "inArticle",
                    "moduleConfig": {
                        "text2vec-palm": {
                            "skip": True,
                            "vectorizePropertyName": False,
                        }
                    }
                }
            ]
        }
    ]
}

In [96]:
def create_weaviate_schema(client):
    # Delete all existing classes (optional, for a fresh start)
    client.collections.delete_all()

    schema_paragraph = {
                "class": "Paragraph",
                "description": "a paragraph with a header and and parent Article",
                "vectorizer": "none",
                "moduleConfig": {
                    "text2vec-huggingface": {
                        "model": "sentence-transformers/all-MiniLM-L6-v2",
                        "options": {
                            "waitForModel": True,
                            "useGPU": True,
                            "useCache": True
                            },
                    },

                },
                "properties": [
                    {
                        "dataType": ["text"],
                        "description": "Title/header of the pargraph",
                        "name": "title",
                        "indexInverted": True,
                        "moduleConfig": {
                            "text2vec-huggingface": {
                                "skip": True,
                                "vectorizePropertyName": False,
                            }
                        }
                    },
                    {
                        "dataType": ["text"],
                        "description": "paragraph content",
                        "name": "content",
                        "indexInverted": True,
                        "moduleConfig": {
                            "text2vec-huggingface": {
                                "skip": True,
                                "vectorizePropertyName": False,
                            }
                        }
                    },
                    {
                        "dataType": ["int"],
                        "description": "Order of the paragraph",
                        "name": "order",
                        "indexInverted": True,
                        "moduleConfig": {
                            "text2vec-palm": {
                                "skip": True,
                                "vectorizePropertyName": False,
                            }
                        }
                    },
                ]
            }
    schema_article = {
                "class": "Article",
                "description": "A Coppermind article with a title and references",
                "vectorizer": "none",
                "moduleConfig": {
                    "text2vec-huggingface": {
                        "model": "sentence-transformers/all-MiniLM-L6-v2",
                        "options": {
                            "waitForModel": True,
                            "useGPU": True,
                            "useCache": True
                            },
                    },
                },
                "properties": [
                    {
                        "name": "title",
                        "description": "Title of the article",
                        "dataType": ["string"],
                        "indexInverted": True,
                        "moduleConfig":{"text2vec-huggingface":{"skip": True}}
                    },
                    {
                        "name": "hasParagraphs",
                        "description": "List of paragraphs from the article",
                        "dataType": ["Paragraph"],
                    },
                    {
                        "name": "linksToArticles",
                        "description": "Cross-references from the article",
                        "dataType": ["Article"],
                    }
                ]
            }
    for cls in [schema_paragraph,schema_article]:
        client.collections.create_from_dict(cls)
    
    paragraphs = client.collections.get("Paragraph")
    paragraphs.config.add_reference(
        ReferenceProperty(
            name="inArticle",
            target_collection="Article"
        )
    )
    # prop_para_parent = {
    #     "dataType": ["Article"],
    #     "description": "Article this paragraph is in",
    #     "name": "parent",
    #     "moduleConfig": {
    #         "text2vec-palm": {
    #             "skip": True,
    #             "vectorizePropertyName": False,
    #         }
    #     }
    # }
    # paragraphs

In [4]:
def show_collection_refs(client, collection_name, ref_name, return_proprties):
    query_col = client.collections.get(collection_name)
    response = query_col.query.fetch_objects(
        return_references=[
            QueryReference(link_on=ref_name, return_properties=return_proprties)])

    for o in response.objects:
        print(o.properties["title"])
        if ref_name in o.references:
            for ref_obj in o.references[ref_name].objects:
                print(ref_obj.properties)
        else: continue

In [95]:
def fresh_start(client):
    client.collections.delete_all()
    create_weaviate_schema(client)

In [6]:
def insert_without_links(client, articles):
    model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

    col_articles = client.collections.get("Article")
    col_paragraphs = client.collections.get("Paragraph")
    # Add without cross refs first
    for art in articles:
        art_properties = {
            "title": art["title"],
        }
        art_uuid = generate_uuid5(art_properties)
        vector = model.encode(art["title"])
        col_articles.data.insert(uuid=art_uuid, properties=art_properties, vector=vector)
            
        # print(col_articles.data.insert(properties=art_properties))
        if art['sections'] is None:
            print(f"No sections: {art['title']}")
        else:
            for para in art['sections']:
                para_properties={
                    "title": para["title"],
                    "content": para["content"],
                    "order": para["order"],
                }
                para_uuid = generate_uuid5(para_properties)
                title_vector = model.encode(para_properties['title'])
                content_vector = model.encode(para_properties['content'])
                col_paragraphs.data.insert(
                    uuid=para_uuid, 
                    properties=para_properties, 
                    vector={
                        'title': title_vector,
                        'content': content_vector
                    })
                # make two way reference
                col_paragraphs.data.reference_add(
                    from_uuid=para_uuid,
                    from_property="inArticle",
                    to=art_uuid
                    )
                col_articles.data.reference_add(
                    from_uuid=art_uuid,
                    from_property="hasParagraphs",
                    to=para_uuid
                    )



# Initial tests

In [98]:
with weaviate.connect_to_local(headers=header) as client:
    if not client.is_connected(): client.connect()
    fresh_start(client, True)
    tsts = client.collections.get("Paragraph")
    print(tsts)

<weaviate.Collection config={
  "name": "Paragraph",
  "description": "a paragraph with a header and and parent Article",
  "generative_config": null,
  "inverted_index_config": {
    "bm25": {
      "b": 0.75,
      "k1": 1.2
    },
    "cleanup_interval_seconds": 60,
    "index_null_state": false,
    "index_property_length": false,
    "index_timestamps": false,
    "stopwords": {
      "preset": "en",
      "additions": null,
      "removals": null
    }
  },
  "multi_tenancy_config": {
    "enabled": false,
    "auto_tenant_creation": false,
    "auto_tenant_activation": false
  },
  "properties": [
    {
      "name": "title",
      "description": "Title/header of the pargraph",
      "data_type": "text",
      "index_filterable": true,
      "index_searchable": true,
      "nested_properties": null,
      "tokenization": "word",
      "vectorizer_config": null,
      "vectorizer": "none"
    },
    {
      "name": "content",
      "description": "paragraph content",
      "data_

In [100]:
manager = SourceManager()
data = manager.load_json("articles.jsonl")

In [3]:
page_titles = [
    "Kaladin",
    "Pits of Hathsin",
    "Allomancy",
    "Cosmere",
    "Honor's Perpendicularity",
    "Cephandrius",
    "Hoid"
]
manager._init_mwclient()
data = manager.prep_data_graph(page_titles)
manager.save_json(data)

MWClient Connected with coppermind.net
MWClient unable to connect with coppermind.net
Processing Kaladin
Processing Pits of Hathsin
Processing Allomancy
Processing Cosmere
Processing Honor's Perpendicularity
Processing Cephandrius
Processing Hoid
articles.jsonl saved
articles.jsonl saved


In [101]:
data[-1]

{'title': 'Fused',
 'links': ['Cosmere',
  'ref-book-sa4-113',
  'ref-book-sa4-104',
  'Braize',
  'Stormfather',
  'category: Fused',
  'Abaray',
  'aluminum',
  'ref-book-sa4-37',
  'ref-book-sa4-89',
  'ref-book-sa4-97',
  'ref-book-sa4-56',
  'Dawnchant',
  'Herald',
  'Hnanan',
  'ref-wob-14918',
  'thunderclast',
  'Kyril',
  'ref-book-sa4-38',
  'Regals',
  'Cognitive Shadow',
  'ref-book-sa4-2',
  'listeners',
  'Surge#Adhesion',
  'Surge#Transportation',
  'ref-book-sa3-79',
  'Song of Prayer',
  'The Five',
  'ref-book-sa3-43',
  'ref-wob-9412',
  'singer',
  'True Desolation',
  'ref-book-sa4-84',
  'ref-wob-10440',
  'ref-book-sa4-i-4',
  'ref-wob-13221',
  'ref-book-sa4-60',
  'Order of Windrunners',
  'ref-book-sa4-59',
  'Rine',
  'Everstorm',
  'Oathpact',
  'ref-book-sa4-14',
  'lifeform\n',
  'Soulcasting',
  'Order of Lightweavers',
  'Lezian',
  'Order of Edgedancers',
  'ref-book-sa3-i-13',
  'Hariel',
  'Surge#Illumination',
  'Surge#Cohesion',
  'El',
  'raysium'

In [81]:
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
with weaviate.connect_to_local(headers=header) as client:
    if not client.is_connected(): client.connect()
    fresh_start(client) # delete old entries. Start fresh
    
    col_articles = client.collections.get("Article")
    col_paragraphs = client.collections.get("Paragraph")
    # Add without cross refs first
    for art in articles:
        art_properties = {
            "title": art["title"],
        }
        art_uuid = generate_uuid5(art_properties)
        vector = model.encode(art["title"])
        col_articles.data.insert(uuid=art_uuid, properties=art_properties, vector=vector)
        
        # print(col_articles.data.insert(properties=art_properties))
        if art['sections'] is None:
            print(f"No sections: {art['title']}")
        else:
            for para in art['sections']:
                para_properties={
                    "title": para["title"],
                    "content": para["content"],
                    "order": para["order"],
                }
                para_uuid = generate_uuid5(para_properties)
                title_vector = model.encode(para_properties['title'])
                content_vector = model.encode(para_properties['content'])
                col_paragraphs.data.insert(
                    uuid=para_uuid, 
                    properties=para_properties, 
                    vector={
                        'title': title_vector,
                        'content': content_vector
                    })
                # make two way reference
                col_paragraphs.data.reference_add(
                    from_uuid=para_uuid,
                    from_property="inArticle",
                    to=art_uuid
                    )
                col_articles.data.reference_add(
                    from_uuid=art_uuid,
                    from_property="hasParagraphs",
                    to=para_uuid
                    )



No sections: Cephandrius


In [88]:
from weaviate.classes.query import QueryReference
with weaviate.connect_to_local(headers=header) as client:
    if not client.is_connected(): client.connect()
    # show_collection_refs(client, "Article", "hasParagraphs", "title")
    collection_name = "Article"
    ref_name = "hasParagraphs"
    return_proprties ="title"

Honor's Perpendicularity
{'title': "Honor's Perpendicularity"}
{'title': "Honor's Perpendicularity"}
{'title': 'At Thaylen Field'}
{'title': 'At Thaylen Field'}
{'title': 'At Thaylen Field'}
{'title': 'Trivia'}
Allomancy
{'title': 'Allomancy'}
{'title': 'Mechanics'}
{'title': 'Mechanics'}
{'title': 'Mechanics'}
{'title': 'Mechanics'}
{'title': 'Mechanics'}
{'title': 'Mechanics'}
{'title': 'Gaining Allomantic Abilities'}
{'title': 'Gaining Allomantic Abilities'}
{'title': 'Gaining Allomantic Abilities'}
{'title': 'Snapping'}
{'title': 'Snapping'}
{'title': 'Snapping'}
{'title': 'Snapping'}
{'title': 'Mist Sickness'}
{'title': 'Mist Sickness'}
{'title': 'Mist Sickness'}
{'title': 'Savantism'}
{'title': 'Types of Allomancers'}
{'title': 'Types of Allomancers'}
{'title': 'Twinborn and Compounders'}
{'title': 'Twinborn and Compounders'}
{'title': 'Allomantic Metals'}
{'title': 'Allomantic Metals'}
{'title': 'Allomantic Metals'}
{'title': 'Allomantic Metals'}
{'title': 'God Metals'}
{'title'

In [42]:
with weaviate.connect_to_local(headers=header) as client:
    if not client.is_connected(): client.connect()
    collection = client.collections.get("Paragraph")
    for item in collection.iterator():
        print(item.uuid, item.properties)

# Intake without batching
- Insert articles
- Insert paragraphs
- Insert cross references

In [7]:
manager = SourceManager()

In [16]:
with weaviate.connect_to_local() as client:
    if not client.is_connected(): client.connect()
    fresh_start(client)
    data = manager.load_json("processed_articles.jsonl")
    insert_without_links(client, data)
    

No sections: Cephandrius
No sections: Surgebinder
No sections: Shadesmar


In [None]:
def insert_without_links(client, articles):
    model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

    col_articles = client.collections.get("Article")
    col_paragraphs = client.collections.get("Paragraph")
    # Add without cross refs first
    for art in articles:
        art_properties = {
            "title": art["title"],
        }
        art_uuid = generate_uuid5(art_properties)
        vector = model.encode(art["title"])
        col_articles.data.insert(uuid=art_uuid, properties=art_properties, vector=vector)
            
        # print(col_articles.data.insert(properties=art_properties))
        if art['sections'] is None:
            print(f"No sections: {art['title']}")
        else:
            for para in art['sections']:
                para_properties={
                    "title": para["title"],
                    "content": para["content"],
                    "order": para["order"],
                }
                para_uuid = generate_uuid5(para_properties)
                title_vector = model.encode(para_properties['title'])
                content_vector = model.encode(para_properties['content'])
                col_paragraphs.data.insert(
                    uuid=para_uuid, 
                    properties=para_properties, 
                    vector={
                        'title': title_vector,
                        'content': content_vector
                    })
                # make two way reference
                col_paragraphs.data.reference_add(
                    from_uuid=para_uuid,
                    from_property="inArticle",
                    to=art_uuid
                    )
                col_articles.data.reference_add(
                    from_uuid=art_uuid,
                    from_property="hasParagraphs",
                    to=para_uuid
                    )
    


## Insert references to existing pages

In [26]:

data = manager.load_json("processed_articles.jsonl")

with weaviate.connect_to_local() as client:
    if not client.is_connected(): client.connect()   
    articles = client.collections.get("Article")
    response = articles.query.fetch_objects()
    master_articles = {}
    for o in response.objects:
        # print(o.properties, o.uuid)
        master_articles.update({o.properties["title"]: o.uuid})
        # print(merged)
        # break
    for art in data:
        art1_uuid = master_articles[art['title']]
        for link in art["links"]:
            art2_uuid = master_articles[link]
            articles.data.reference_add(
                from_uuid=art1_uuid,
                from_property="linksToArticles",
                to=art2_uuid
                )

In [106]:

with weaviate.connect_to_local() as client:
    if not client.is_connected(): client.connect()   
    articles = client.collections.get("Article")
    response = articles.query.fetch_objects(
        return_references=[
            QueryReference(link_on="linksToArticles", return_properties="title")])

    for o in response.objects:
        if "linksToArticles" in o.references:
            for refs in o.references["linksToArticles"].objects:
                print(f"{o.properties["title"]}: {refs.properties["title"]}")

WeaviateConnectionError: Connection to Weaviate failed. Error: [WinError 10061] No connection could be made because the target machine actively refused it. 
Is Weaviate running and reachable at http://localhost:8080?

# Querying
- Vector
- Hybrid (vector/keyword)

In [105]:
from weaviate.classes.query import MetadataQuery

model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
query = "what does steel do in allomancy?"
query_vector = model.encode(query).tolist()

In [104]:
with weaviate.connect_to_local() as client:
    if not client.is_connected(): client.connect()   
    col_paragraphs = client.collections.get("Paragraph")
    response = col_paragraphs.query.hybrid(
        query=query, 
        vector=query_vector,
        return_metadata=MetadataQuery(score=True, explain_score=True),
        limit = 10)

WeaviateQueryError: Query call with protocol GRPC search failed with message End of TCP stream.

In [83]:
for o in response.objects:
    print(o.properties['content'])
    print(o.metadata.score, o.metadata.explain_score)

Awakened objects see similar to Steel Inquisitors do via Iron and Steel lines. Awakened objects can identify the Intent of people, and can choose not to perform actions based on that.
0.30000001192092896 
Hybrid (Result Set keyword,bm25) Document b1ec9c20-6b12-536b-aabc-1a0bb158c856: original score 2.4841118, normalized score: 0.3
The Allomantic effects of metals are most similar to the effects of fabrials on Roshar. Steel and Iron also have corresponding effects in how they affect Aether, whether other allomantic metals also have effects or not is currently unknown.
0.26546016335487366 
Hybrid (Result Set keyword,bm25) Document b153a6ff-6454-55c9-a5f5-59088cc8ef15: original score 2.3114135, normalized score: 0.26546016
After the father machine was destroyed by Yumi, and the attempts to absorb his Investiture thus halted, he was freed from his stasis. He planned with Design to disguise themselves as astronauts and steal a ship, allowing them to travel to the nearby Iron Seven Waystatio

In [75]:
with weaviate.connect_to_local() as client:
    if not client.is_connected(): client.connect()   
    col_paragraphs = client.collections.get("Paragraph")
    alphas = [0, 0.1, 0.25, 0.5, .75, 0.9, 1]
    results = []
    for alpha in alphas:
        response = col_paragraphs.query.hybrid(
            query=query, 
            vector=query_vector,
            alpha=alpha,
            target_vector="content",
            return_metadata=MetadataQuery(score=True, explain_score=True),
            limit = 10)
        if len(response.objects) > 0:
            o = response.objects[0]
            results.append({"alpha": alpha,
                            "score": o.metadata.score,
                            "content": o.properties["content"]})
for res in results:
    print(res, "\n")

{'alpha': 0, 'score': 1.0, 'content': 'Awakened objects see similar to Steel Inquisitors do via Iron and Steel lines. Awakened objects can identify the Intent of people, and can choose not to perform actions based on that.'} 

{'alpha': 0.1, 'score': 0.8999999761581421, 'content': 'Awakened objects see similar to Steel Inquisitors do via Iron and Steel lines. Awakened objects can identify the Intent of people, and can choose not to perform actions based on that.'} 

{'alpha': 0.25, 'score': 0.75, 'content': 'Awakened objects see similar to Steel Inquisitors do via Iron and Steel lines. Awakened objects can identify the Intent of people, and can choose not to perform actions based on that.'} 

{'alpha': 0.5, 'score': 0.5, 'content': 'Awakened objects see similar to Steel Inquisitors do via Iron and Steel lines. Awakened objects can identify the Intent of people, and can choose not to perform actions based on that.'} 

{'alpha': 0.75, 'score': 0.25, 'content': 'Awakened objects see simil

In [76]:
with weaviate.connect_to_local() as client:
    if not client.is_connected(): client.connect()   
    col_paragraphs = client.collections.get("Article")
    alphas = [0, 0.1, 0.25, 0.5, .75, 0.9, 1]
    results = []
    for alpha in alphas:
        response = col_paragraphs.query.hybrid(
            query=query, 
            vector=query_vector,
            alpha=alpha,
            # target_vector="content",
            return_metadata=MetadataQuery(score=True, explain_score=True),
            limit = 10)
        if len(response.objects) > 0:
            o = response.objects[0]
            results.append({"alpha": alpha,
                            "score": o.metadata.score,
                            # "content": o.properties["content"],
                            "content": o.properties["title"]
                            })
for res in results:
    print(res, "\n")

{'alpha': 0.1, 'score': 0.10000000149011612, 'content': 'Fused'} 

{'alpha': 0.25, 'score': 0.25, 'content': 'Fused'} 

{'alpha': 0.5, 'score': 0.5, 'content': 'Fused'} 

{'alpha': 0.75, 'score': 0.75, 'content': 'Fused'} 

{'alpha': 0.9, 'score': 0.8999999761581421, 'content': 'Fused'} 

{'alpha': 1, 'score': 1.0, 'content': 'Fused'} 



In [77]:
with weaviate.connect_to_local() as client:
    if not client.is_connected(): client.connect()   
    col_paragraphs = client.collections.get("Article")
    results = []
    response = col_paragraphs.query.near_vector(
        near_vector=query_vector,
        # target_vector="content",
        return_metadata=MetadataQuery(score=True, explain_score=True),
        limit = 10)
    if len(response.objects) > 0:
        o = response.objects[0]
        results.append({"score": o.metadata.score,
                        # "content": o.properties["content"],
                        "content": o.properties["title"]
                        })
for res in results:
    print(res, "\n")

{'score': 0.0, 'content': 'Fused'} 



In [78]:
with weaviate.connect_to_local() as client:
    if not client.is_connected(): client.connect()   
    col_paragraphs = client.collections.get("Paragraph")
    results = []
    response = col_paragraphs.query.near_vector(
        near_vector=query_vector,
        # target_vector="content",
        return_metadata=MetadataQuery(score=True, explain_score=True),
        limit = 10)
    if len(response.objects) > 0:
        o = response.objects[0]
        results.append({
                        "score": o.metadata.score,
                        "content": o.properties["content"],
                        # "content": o.properties["title"]
                        })
for res in results:
    print(res, "\n")